diff mbox series

[v2,09/11] vpci: register as an internal ioreq server

Message ID 20190903161428.7159-10-roger.pau@citrix.com (mailing list archive)
State Superseded
Headers show
Series ioreq: add support for internal servers | expand

Commit Message

Roger Pau Monné Sept. 3, 2019, 4:14 p.m. UTC
Switch vPCI to become an internal ioreq server, and hence drop all the
vPCI specific decoding and trapping to PCI IO ports and MMCFG regions.

This allows to unify the vPCI code with the ioreq infrastructure,
opening the door for domains having PCI accesses handled by vPCI and
other ioreq servers at the same time.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
Changes since v1:
 - Remove prototypes for register_vpci_portio_handler and
   register_vpci_mmcfg_handler.
 - Re-add vpci check in hwdom_iommu_map.
 - Fix test harness.
 - Remove vpci_{read/write} prototypes and make the functions static.
---
 tools/tests/vpci/Makefile     |   5 +-
 tools/tests/vpci/emul.h       |   4 +
 xen/arch/x86/hvm/dom0_build.c |   1 +
 xen/arch/x86/hvm/hvm.c        |   5 +-
 xen/arch/x86/hvm/io.c         | 201 ----------------------------------
 xen/arch/x86/physdev.c        |   1 +
 xen/drivers/vpci/vpci.c       |  69 +++++++++++-
 xen/include/xen/vpci.h        |  22 +---
 8 files changed, 86 insertions(+), 222 deletions(-)

Comments

Paul Durrant Sept. 10, 2019, 1:49 p.m. UTC | #1
> -----Original Message-----
> From: Roger Pau Monne <roger.pau@citrix.com>
> Sent: 03 September 2019 17:14
> To: xen-devel@lists.xenproject.org
> Cc: Roger Pau Monne <roger.pau@citrix.com>; Ian Jackson <Ian.Jackson@citrix.com>; Wei Liu
> <wl@xen.org>; Andrew Cooper <Andrew.Cooper3@citrix.com>; George Dunlap <George.Dunlap@citrix.com>; Jan
> Beulich <jbeulich@suse.com>; Julien Grall <julien.grall@arm.com>; Konrad Rzeszutek Wilk
> <konrad.wilk@oracle.com>; Stefano Stabellini <sstabellini@kernel.org>; Tim (Xen.org) <tim@xen.org>;
> Paul Durrant <Paul.Durrant@citrix.com>
> Subject: [PATCH v2 09/11] vpci: register as an internal ioreq server
> 
> Switch vPCI to become an internal ioreq server, and hence drop all the
> vPCI specific decoding and trapping to PCI IO ports and MMCFG regions.
> 
> This allows to unify the vPCI code with the ioreq infrastructure,
> opening the door for domains having PCI accesses handled by vPCI and
> other ioreq servers at the same time.
> 
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>

[snip]
> diff --git a/xen/arch/x86/physdev.c b/xen/arch/x86/physdev.c
> index f61f66df5f..bf2c64a0a9 100644
> --- a/xen/arch/x86/physdev.c
> +++ b/xen/arch/x86/physdev.c
> @@ -11,6 +11,7 @@
>  #include <asm/current.h>
>  #include <asm/io_apic.h>
>  #include <asm/msi.h>
> +#include <asm/hvm/ioreq.h>

Why is this change necessary on its own?

>  #include <asm/hvm/irq.h>
>  #include <asm/hypercall.h>
>  #include <public/xen.h>
> diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
> index cbd1bac7fc..5664020c2d 100644
> --- a/xen/drivers/vpci/vpci.c
> +++ b/xen/drivers/vpci/vpci.c
> @@ -20,6 +20,8 @@
>  #include <xen/sched.h>
>  #include <xen/vpci.h>
> 
> +#include <asm/hvm/ioreq.h>
> +
>  /* Internal struct to store the emulated PCI registers. */
>  struct vpci_register {
>      vpci_read_t *read;
> @@ -302,7 +304,7 @@ static uint32_t merge_result(uint32_t data, uint32_t new, unsigned int size,
>      return (data & ~(mask << (offset * 8))) | ((new & mask) << (offset * 8));
>  }
> 
> -uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size)
> +static uint32_t read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size)
>  {
>      const struct domain *d = current->domain;
>      const struct pci_dev *pdev;
> @@ -404,8 +406,8 @@ static void vpci_write_helper(const struct pci_dev *pdev,
>               r->private);
>  }
> 
> -void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
> -                uint32_t data)
> +static void write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
> +                  uint32_t data)
>  {
>      const struct domain *d = current->domain;
>      const struct pci_dev *pdev;
> @@ -478,6 +480,67 @@ void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
>      spin_unlock(&pdev->vpci->lock);
>  }
> 
> +#ifdef __XEN__
> +static int ioreq_handler(struct vcpu *v, ioreq_t *req, void *data)
> +{
> +    pci_sbdf_t sbdf;
> +
> +    if ( req->type == IOREQ_TYPE_INVALIDATE )
> +        /*
> +         * Ignore invalidate requests, those can be received even without
> +         * having any memory ranges registered, see send_invalidate_req.
> +         */
> +        return X86EMUL_OKAY;

In general, I wonder whether internal servers will ever need to deal with invalidate? The code only exists to get QEMU to drop its map cache after a decrease_reservation so that the page refs get dropped.

  Paul

> +
> +    if ( req->type != IOREQ_TYPE_PCI_CONFIG || req->data_is_ptr )
> +    {
> +        ASSERT_UNREACHABLE();
> +        return X86EMUL_UNHANDLEABLE;
> +    }
> +
> +    sbdf.sbdf = req->addr >> 32;
> +
> +    if ( req->dir )
> +        req->data = read(sbdf, req->addr, req->size);
> +    else
> +        write(sbdf, req->addr, req->size, req->data);
> +
> +    return X86EMUL_OKAY;
> +}
> +
> +int vpci_register_ioreq(struct domain *d)
> +{
> +    ioservid_t id;
> +    int rc;
> +
> +    if ( !has_vpci(d) )
> +        return 0;
> +
> +    rc = hvm_create_ioreq_server(d, HVM_IOREQSRV_BUFIOREQ_OFF, &id, true);
> +    if ( rc )
> +        return rc;
> +
> +    rc = hvm_add_ioreq_handler(d, id, ioreq_handler, NULL);
> +    if ( rc )
> +        return rc;
> +
> +    if ( is_hardware_domain(d) )
> +    {
> +        /* Handle all devices in vpci. */
> +        rc = hvm_map_io_range_to_ioreq_server(d, id, XEN_DMOP_IO_RANGE_PCI,
> +                                              0, ~(uint64_t)0);
> +        if ( rc )
> +            return rc;
> +    }
> +
> +    rc = hvm_set_ioreq_server_state(d, id, true);
> +    if ( rc )
> +        return rc;
> +
> +    return rc;
> +}
> +#endif
> +
>  /*
>   * Local variables:
>   * mode: C
> diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
> index 4cf233c779..36f435ed5b 100644
> --- a/xen/include/xen/vpci.h
> +++ b/xen/include/xen/vpci.h
> @@ -23,6 +23,9 @@ typedef int vpci_register_init_t(struct pci_dev *dev);
>    static vpci_register_init_t *const x##_entry  \
>                 __used_section(".data.vpci." p) = x
> 
> +/* Register vPCI handler with ioreq. */
> +int vpci_register_ioreq(struct domain *d);
> +
>  /* Add vPCI handlers to device. */
>  int __must_check vpci_add_handlers(struct pci_dev *dev);
> 
> @@ -38,11 +41,6 @@ int __must_check vpci_add_register(struct vpci *vpci,
>  int __must_check vpci_remove_register(struct vpci *vpci, unsigned int offset,
>                                        unsigned int size);
> 
> -/* Generic read/write handlers for the PCI config space. */
> -uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size);
> -void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
> -                uint32_t data);
> -
>  /* Passthrough handlers. */
>  uint32_t vpci_hw_read16(const struct pci_dev *pdev, unsigned int reg,
>                          void *data);
> @@ -221,20 +219,12 @@ static inline int vpci_add_handlers(struct pci_dev *pdev)
>      return 0;
>  }
> 
> -static inline void vpci_dump_msi(void) { }
> -
> -static inline uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg,
> -                                 unsigned int size)
> +static inline int vpci_register_ioreq(struct domain *d)
>  {
> -    ASSERT_UNREACHABLE();
> -    return ~(uint32_t)0;
> +    return 0;
>  }
> 
> -static inline void vpci_write(pci_sbdf_t sbdf, unsigned int reg,
> -                              unsigned int size, uint32_t data)
> -{
> -    ASSERT_UNREACHABLE();
> -}
> +static inline void vpci_dump_msi(void) { }
> 
>  static inline bool vpci_process_pending(struct vcpu *v)
>  {
> --
> 2.22.0
Roger Pau Monné Sept. 26, 2019, 3:07 p.m. UTC | #2
On Tue, Sep 10, 2019 at 03:49:41PM +0200, Paul Durrant wrote:
> > -----Original Message-----
> > From: Roger Pau Monne <roger.pau@citrix.com>
> > Sent: 03 September 2019 17:14
> > To: xen-devel@lists.xenproject.org
> > Cc: Roger Pau Monne <roger.pau@citrix.com>; Ian Jackson <Ian.Jackson@citrix.com>; Wei Liu
> > <wl@xen.org>; Andrew Cooper <Andrew.Cooper3@citrix.com>; George Dunlap <George.Dunlap@citrix.com>; Jan
> > Beulich <jbeulich@suse.com>; Julien Grall <julien.grall@arm.com>; Konrad Rzeszutek Wilk
> > <konrad.wilk@oracle.com>; Stefano Stabellini <sstabellini@kernel.org>; Tim (Xen.org) <tim@xen.org>;
> > Paul Durrant <Paul.Durrant@citrix.com>
> > Subject: [PATCH v2 09/11] vpci: register as an internal ioreq server
> > @@ -478,6 +480,67 @@ void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
> >      spin_unlock(&pdev->vpci->lock);
> >  }
> > 
> > +#ifdef __XEN__
> > +static int ioreq_handler(struct vcpu *v, ioreq_t *req, void *data)
> > +{
> > +    pci_sbdf_t sbdf;
> > +
> > +    if ( req->type == IOREQ_TYPE_INVALIDATE )
> > +        /*
> > +         * Ignore invalidate requests, those can be received even without
> > +         * having any memory ranges registered, see send_invalidate_req.
> > +         */
> > +        return X86EMUL_OKAY;
> 
> In general, I wonder whether internal servers will ever need to deal with invalidate? The code only exists to get QEMU to drop its map cache after a decrease_reservation so that the page refs get dropped.

I think the best solution here is to rename hvm_broadcast_ioreq to
hvm_broadcast_ioreq_external and switch it's callers. Both
send_timeoffset_req and send_invalidate_req seem only relevant to
external ioreq servers.

Thanks, Roger.
Paul Durrant Sept. 27, 2019, 8:29 a.m. UTC | #3
> -----Original Message-----
> From: Roger Pau Monne <roger.pau@citrix.com>
> Sent: 26 September 2019 16:07
> To: Paul Durrant <Paul.Durrant@citrix.com>
> Cc: xen-devel@lists.xenproject.org; Ian Jackson <Ian.Jackson@citrix.com>; Wei Liu <wl@xen.org>; Andrew
> Cooper <Andrew.Cooper3@citrix.com>; George Dunlap <George.Dunlap@citrix.com>; Jan Beulich
> <jbeulich@suse.com>; Julien Grall <julien.grall@arm.com>; Konrad Rzeszutek Wilk
> <konrad.wilk@oracle.com>; Stefano Stabellini <sstabellini@kernel.org>; Tim (Xen.org) <tim@xen.org>
> Subject: Re: [PATCH v2 09/11] vpci: register as an internal ioreq server
> 
> On Tue, Sep 10, 2019 at 03:49:41PM +0200, Paul Durrant wrote:
> > > -----Original Message-----
> > > From: Roger Pau Monne <roger.pau@citrix.com>
> > > Sent: 03 September 2019 17:14
> > > To: xen-devel@lists.xenproject.org
> > > Cc: Roger Pau Monne <roger.pau@citrix.com>; Ian Jackson <Ian.Jackson@citrix.com>; Wei Liu
> > > <wl@xen.org>; Andrew Cooper <Andrew.Cooper3@citrix.com>; George Dunlap <George.Dunlap@citrix.com>;
> Jan
> > > Beulich <jbeulich@suse.com>; Julien Grall <julien.grall@arm.com>; Konrad Rzeszutek Wilk
> > > <konrad.wilk@oracle.com>; Stefano Stabellini <sstabellini@kernel.org>; Tim (Xen.org)
> <tim@xen.org>;
> > > Paul Durrant <Paul.Durrant@citrix.com>
> > > Subject: [PATCH v2 09/11] vpci: register as an internal ioreq server
> > > @@ -478,6 +480,67 @@ void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
> > >      spin_unlock(&pdev->vpci->lock);
> > >  }
> > >
> > > +#ifdef __XEN__
> > > +static int ioreq_handler(struct vcpu *v, ioreq_t *req, void *data)
> > > +{
> > > +    pci_sbdf_t sbdf;
> > > +
> > > +    if ( req->type == IOREQ_TYPE_INVALIDATE )
> > > +        /*
> > > +         * Ignore invalidate requests, those can be received even without
> > > +         * having any memory ranges registered, see send_invalidate_req.
> > > +         */
> > > +        return X86EMUL_OKAY;
> >
> > In general, I wonder whether internal servers will ever need to deal with invalidate? The code only
> exists to get QEMU to drop its map cache after a decrease_reservation so that the page refs get
> dropped.
> 
> I think the best solution here is to rename hvm_broadcast_ioreq to
> hvm_broadcast_ioreq_external and switch it's callers. Both
> send_timeoffset_req and send_invalidate_req seem only relevant to
> external ioreq servers.

send_timeoffset_req() is relic which ought to be replaced with another mechanism IMO...

When an HVM guest writes its RTC, a new 'timeoffset' value (offset of RTC from host time) is calculated (also applied to the PV wallclock) and advertised via this ioreq. In XenServer, this is picked up by QEMU, forwarded via QMP to XAPI and then written into the VM meta-data (which than causes it to be written into xenstore too). All this is so that that guest's RTC can be set correctly when it is rebooted... There has to be a better way (e.g. extracting RTC via hvm context and saving it before cleaning up the domain).

send_invalidate_req() is relevant for any emulator maintaining a cache of guest->host memory mappings which, I guess, could include internal emulators even if this is not the case at the moment.

  Paul

> 
> Thanks, Roger.
Roger Pau Monné Sept. 27, 2019, 8:45 a.m. UTC | #4
On Fri, Sep 27, 2019 at 10:29:21AM +0200, Paul Durrant wrote:
> > -----Original Message-----
> > From: Roger Pau Monne <roger.pau@citrix.com>
> > Sent: 26 September 2019 16:07
> > To: Paul Durrant <Paul.Durrant@citrix.com>
> > Cc: xen-devel@lists.xenproject.org; Ian Jackson <Ian.Jackson@citrix.com>; Wei Liu <wl@xen.org>; Andrew
> > Cooper <Andrew.Cooper3@citrix.com>; George Dunlap <George.Dunlap@citrix.com>; Jan Beulich
> > <jbeulich@suse.com>; Julien Grall <julien.grall@arm.com>; Konrad Rzeszutek Wilk
> > <konrad.wilk@oracle.com>; Stefano Stabellini <sstabellini@kernel.org>; Tim (Xen.org) <tim@xen.org>
> > Subject: Re: [PATCH v2 09/11] vpci: register as an internal ioreq server
> > 
> > On Tue, Sep 10, 2019 at 03:49:41PM +0200, Paul Durrant wrote:
> > > > -----Original Message-----
> > > > From: Roger Pau Monne <roger.pau@citrix.com>
> > > > Sent: 03 September 2019 17:14
> > > > To: xen-devel@lists.xenproject.org
> > > > Cc: Roger Pau Monne <roger.pau@citrix.com>; Ian Jackson <Ian.Jackson@citrix.com>; Wei Liu
> > > > <wl@xen.org>; Andrew Cooper <Andrew.Cooper3@citrix.com>; George Dunlap <George.Dunlap@citrix.com>;
> > Jan
> > > > Beulich <jbeulich@suse.com>; Julien Grall <julien.grall@arm.com>; Konrad Rzeszutek Wilk
> > > > <konrad.wilk@oracle.com>; Stefano Stabellini <sstabellini@kernel.org>; Tim (Xen.org)
> > <tim@xen.org>;
> > > > Paul Durrant <Paul.Durrant@citrix.com>
> > > > Subject: [PATCH v2 09/11] vpci: register as an internal ioreq server
> > > > @@ -478,6 +480,67 @@ void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
> > > >      spin_unlock(&pdev->vpci->lock);
> > > >  }
> > > >
> > > > +#ifdef __XEN__
> > > > +static int ioreq_handler(struct vcpu *v, ioreq_t *req, void *data)
> > > > +{
> > > > +    pci_sbdf_t sbdf;
> > > > +
> > > > +    if ( req->type == IOREQ_TYPE_INVALIDATE )
> > > > +        /*
> > > > +         * Ignore invalidate requests, those can be received even without
> > > > +         * having any memory ranges registered, see send_invalidate_req.
> > > > +         */
> > > > +        return X86EMUL_OKAY;
> > >
> > > In general, I wonder whether internal servers will ever need to deal with invalidate? The code only
> > exists to get QEMU to drop its map cache after a decrease_reservation so that the page refs get
> > dropped.
> > 
> > I think the best solution here is to rename hvm_broadcast_ioreq to
> > hvm_broadcast_ioreq_external and switch it's callers. Both
> > send_timeoffset_req and send_invalidate_req seem only relevant to
> > external ioreq servers.
> 
> send_timeoffset_req() is relic which ought to be replaced with another mechanism IMO...
> 
> When an HVM guest writes its RTC, a new 'timeoffset' value (offset of RTC from host time) is calculated (also applied to the PV wallclock) and advertised via this ioreq. In XenServer, this is picked up by QEMU, forwarded via QMP to XAPI and then written into the VM meta-data (which than causes it to be written into xenstore too). All this is so that that guest's RTC can be set correctly when it is rebooted... There has to be a better way (e.g. extracting RTC via hvm context and saving it before cleaning up the domain).
> 
> send_invalidate_req() is relevant for any emulator maintaining a cache of guest->host memory mappings which, I guess, could include internal emulators even if this is not the case at the moment.

Maybe, but I would expect an internal emulator to get a reference on
the gfn if it does need to keep it in some kind of cache, or else I
don't think code in the hypervisor should be keeping such references.
IMO I would start by not forwarding invalidate requests to internal
emulators. We can always change this in the future if we come up with
a use-cases that needs it.

Thanks, Roger.
Paul Durrant Sept. 27, 2019, 9:01 a.m. UTC | #5
> -----Original Message-----
> From: Roger Pau Monne <roger.pau@citrix.com>
> Sent: 27 September 2019 09:46
> To: Paul Durrant <Paul.Durrant@citrix.com>
> Cc: xen-devel@lists.xenproject.org; Ian Jackson <Ian.Jackson@citrix.com>; Wei Liu <wl@xen.org>; Andrew
> Cooper <Andrew.Cooper3@citrix.com>; George Dunlap <George.Dunlap@citrix.com>; Jan Beulich
> <jbeulich@suse.com>; Julien Grall <julien.grall@arm.com>; Konrad Rzeszutek Wilk
> <konrad.wilk@oracle.com>; Stefano Stabellini <sstabellini@kernel.org>; Tim (Xen.org) <tim@xen.org>
> Subject: Re: [PATCH v2 09/11] vpci: register as an internal ioreq server
> 
> On Fri, Sep 27, 2019 at 10:29:21AM +0200, Paul Durrant wrote:
> > > -----Original Message-----
> > > From: Roger Pau Monne <roger.pau@citrix.com>
> > > Sent: 26 September 2019 16:07
> > > To: Paul Durrant <Paul.Durrant@citrix.com>
> > > Cc: xen-devel@lists.xenproject.org; Ian Jackson <Ian.Jackson@citrix.com>; Wei Liu <wl@xen.org>;
> Andrew
> > > Cooper <Andrew.Cooper3@citrix.com>; George Dunlap <George.Dunlap@citrix.com>; Jan Beulich
> > > <jbeulich@suse.com>; Julien Grall <julien.grall@arm.com>; Konrad Rzeszutek Wilk
> > > <konrad.wilk@oracle.com>; Stefano Stabellini <sstabellini@kernel.org>; Tim (Xen.org) <tim@xen.org>
> > > Subject: Re: [PATCH v2 09/11] vpci: register as an internal ioreq server
> > >
> > > On Tue, Sep 10, 2019 at 03:49:41PM +0200, Paul Durrant wrote:
> > > > > -----Original Message-----
> > > > > From: Roger Pau Monne <roger.pau@citrix.com>
> > > > > Sent: 03 September 2019 17:14
> > > > > To: xen-devel@lists.xenproject.org
> > > > > Cc: Roger Pau Monne <roger.pau@citrix.com>; Ian Jackson <Ian.Jackson@citrix.com>; Wei Liu
> > > > > <wl@xen.org>; Andrew Cooper <Andrew.Cooper3@citrix.com>; George Dunlap
> <George.Dunlap@citrix.com>;
> > > Jan
> > > > > Beulich <jbeulich@suse.com>; Julien Grall <julien.grall@arm.com>; Konrad Rzeszutek Wilk
> > > > > <konrad.wilk@oracle.com>; Stefano Stabellini <sstabellini@kernel.org>; Tim (Xen.org)
> > > <tim@xen.org>;
> > > > > Paul Durrant <Paul.Durrant@citrix.com>
> > > > > Subject: [PATCH v2 09/11] vpci: register as an internal ioreq server
> > > > > @@ -478,6 +480,67 @@ void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
> > > > >      spin_unlock(&pdev->vpci->lock);
> > > > >  }
> > > > >
> > > > > +#ifdef __XEN__
> > > > > +static int ioreq_handler(struct vcpu *v, ioreq_t *req, void *data)
> > > > > +{
> > > > > +    pci_sbdf_t sbdf;
> > > > > +
> > > > > +    if ( req->type == IOREQ_TYPE_INVALIDATE )
> > > > > +        /*
> > > > > +         * Ignore invalidate requests, those can be received even without
> > > > > +         * having any memory ranges registered, see send_invalidate_req.
> > > > > +         */
> > > > > +        return X86EMUL_OKAY;
> > > >
> > > > In general, I wonder whether internal servers will ever need to deal with invalidate? The code
> only
> > > exists to get QEMU to drop its map cache after a decrease_reservation so that the page refs get
> > > dropped.
> > >
> > > I think the best solution here is to rename hvm_broadcast_ioreq to
> > > hvm_broadcast_ioreq_external and switch it's callers. Both
> > > send_timeoffset_req and send_invalidate_req seem only relevant to
> > > external ioreq servers.
> >
> > send_timeoffset_req() is relic which ought to be replaced with another mechanism IMO...
> >
> > When an HVM guest writes its RTC, a new 'timeoffset' value (offset of RTC from host time) is
> calculated (also applied to the PV wallclock) and advertised via this ioreq. In XenServer, this is
> picked up by QEMU, forwarded via QMP to XAPI and then written into the VM meta-data (which than causes
> it to be written into xenstore too). All this is so that that guest's RTC can be set correctly when it
> is rebooted... There has to be a better way (e.g. extracting RTC via hvm context and saving it before
> cleaning up the domain).
> >
> > send_invalidate_req() is relevant for any emulator maintaining a cache of guest->host memory
> mappings which, I guess, could include internal emulators even if this is not the case at the moment.
> 
> Maybe, but I would expect an internal emulator to get a reference on
> the gfn if it does need to keep it in some kind of cache, or else I
> don't think code in the hypervisor should be keeping such references.

Oh indeed, but that's not the issue. The issue is when to drop those refs... If the guest does a decrease_reservation on a gfn cached by the emulator then the emulator needs to drop its ref to allow the page to be freed.

  Paul

> IMO I would start by not forwarding invalidate requests to internal
> emulators. We can always change this in the future if we come up with
> a use-cases that needs it.
> 
> Thanks, Roger.
Roger Pau Monné Sept. 27, 2019, 10:46 a.m. UTC | #6
On Fri, Sep 27, 2019 at 11:01:39AM +0200, Paul Durrant wrote:
> > -----Original Message-----
> > From: Roger Pau Monne <roger.pau@citrix.com>
> > Sent: 27 September 2019 09:46
> > To: Paul Durrant <Paul.Durrant@citrix.com>
> > Cc: xen-devel@lists.xenproject.org; Ian Jackson <Ian.Jackson@citrix.com>; Wei Liu <wl@xen.org>; Andrew
> > Cooper <Andrew.Cooper3@citrix.com>; George Dunlap <George.Dunlap@citrix.com>; Jan Beulich
> > <jbeulich@suse.com>; Julien Grall <julien.grall@arm.com>; Konrad Rzeszutek Wilk
> > <konrad.wilk@oracle.com>; Stefano Stabellini <sstabellini@kernel.org>; Tim (Xen.org) <tim@xen.org>
> > Subject: Re: [PATCH v2 09/11] vpci: register as an internal ioreq server
> > 
> > On Fri, Sep 27, 2019 at 10:29:21AM +0200, Paul Durrant wrote:
> > > > -----Original Message-----
> > > > From: Roger Pau Monne <roger.pau@citrix.com>
> > > > Sent: 26 September 2019 16:07
> > > > To: Paul Durrant <Paul.Durrant@citrix.com>
> > > > Cc: xen-devel@lists.xenproject.org; Ian Jackson <Ian.Jackson@citrix.com>; Wei Liu <wl@xen.org>;
> > Andrew
> > > > Cooper <Andrew.Cooper3@citrix.com>; George Dunlap <George.Dunlap@citrix.com>; Jan Beulich
> > > > <jbeulich@suse.com>; Julien Grall <julien.grall@arm.com>; Konrad Rzeszutek Wilk
> > > > <konrad.wilk@oracle.com>; Stefano Stabellini <sstabellini@kernel.org>; Tim (Xen.org) <tim@xen.org>
> > > > Subject: Re: [PATCH v2 09/11] vpci: register as an internal ioreq server
> > > >
> > > > On Tue, Sep 10, 2019 at 03:49:41PM +0200, Paul Durrant wrote:
> > > > > > -----Original Message-----
> > > > > > From: Roger Pau Monne <roger.pau@citrix.com>
> > > > > > Sent: 03 September 2019 17:14
> > > > > > To: xen-devel@lists.xenproject.org
> > > > > > Cc: Roger Pau Monne <roger.pau@citrix.com>; Ian Jackson <Ian.Jackson@citrix.com>; Wei Liu
> > > > > > <wl@xen.org>; Andrew Cooper <Andrew.Cooper3@citrix.com>; George Dunlap
> > <George.Dunlap@citrix.com>;
> > > > Jan
> > > > > > Beulich <jbeulich@suse.com>; Julien Grall <julien.grall@arm.com>; Konrad Rzeszutek Wilk
> > > > > > <konrad.wilk@oracle.com>; Stefano Stabellini <sstabellini@kernel.org>; Tim (Xen.org)
> > > > <tim@xen.org>;
> > > > > > Paul Durrant <Paul.Durrant@citrix.com>
> > > > > > Subject: [PATCH v2 09/11] vpci: register as an internal ioreq server
> > > > > > @@ -478,6 +480,67 @@ void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
> > > > > >      spin_unlock(&pdev->vpci->lock);
> > > > > >  }
> > > > > >
> > > > > > +#ifdef __XEN__
> > > > > > +static int ioreq_handler(struct vcpu *v, ioreq_t *req, void *data)
> > > > > > +{
> > > > > > +    pci_sbdf_t sbdf;
> > > > > > +
> > > > > > +    if ( req->type == IOREQ_TYPE_INVALIDATE )
> > > > > > +        /*
> > > > > > +         * Ignore invalidate requests, those can be received even without
> > > > > > +         * having any memory ranges registered, see send_invalidate_req.
> > > > > > +         */
> > > > > > +        return X86EMUL_OKAY;
> > > > >
> > > > > In general, I wonder whether internal servers will ever need to deal with invalidate? The code
> > only
> > > > exists to get QEMU to drop its map cache after a decrease_reservation so that the page refs get
> > > > dropped.
> > > >
> > > > I think the best solution here is to rename hvm_broadcast_ioreq to
> > > > hvm_broadcast_ioreq_external and switch it's callers. Both
> > > > send_timeoffset_req and send_invalidate_req seem only relevant to
> > > > external ioreq servers.
> > >
> > > send_timeoffset_req() is relic which ought to be replaced with another mechanism IMO...
> > >
> > > When an HVM guest writes its RTC, a new 'timeoffset' value (offset of RTC from host time) is
> > calculated (also applied to the PV wallclock) and advertised via this ioreq. In XenServer, this is
> > picked up by QEMU, forwarded via QMP to XAPI and then written into the VM meta-data (which than causes
> > it to be written into xenstore too). All this is so that that guest's RTC can be set correctly when it
> > is rebooted... There has to be a better way (e.g. extracting RTC via hvm context and saving it before
> > cleaning up the domain).
> > >
> > > send_invalidate_req() is relevant for any emulator maintaining a cache of guest->host memory
> > mappings which, I guess, could include internal emulators even if this is not the case at the moment.
> > 
> > Maybe, but I would expect an internal emulator to get a reference on
> > the gfn if it does need to keep it in some kind of cache, or else I
> > don't think code in the hypervisor should be keeping such references.
> 
> Oh indeed, but that's not the issue. The issue is when to drop those refs... If the guest does a decrease_reservation on a gfn cached by the emulator then the emulator needs to drop its ref to allow the page to be freed.

Then I think this also could be used by internal servers.

Thanks, Roger.
diff mbox series

Patch

diff --git a/tools/tests/vpci/Makefile b/tools/tests/vpci/Makefile
index 5075bc2be2..c365c4522a 100644
--- a/tools/tests/vpci/Makefile
+++ b/tools/tests/vpci/Makefile
@@ -25,7 +25,10 @@  install:
 
 vpci.c: $(XEN_ROOT)/xen/drivers/vpci/vpci.c
 	# Remove includes and add the test harness header
-	sed -e '/#include/d' -e '1s/^/#include "emul.h"/' <$< >$@
+	sed -e '/#include/d' -e '1s/^/#include "emul.h"/' \
+	    -e 's/^static uint32_t read/uint32_t vpci_read/' \
+	    -e 's/^static void write/void vpci_write/' <$< >$@
+
 
 list.h: $(XEN_ROOT)/xen/include/xen/list.h
 vpci.h: $(XEN_ROOT)/xen/include/xen/vpci.h
diff --git a/tools/tests/vpci/emul.h b/tools/tests/vpci/emul.h
index 796797fdc2..790c4de601 100644
--- a/tools/tests/vpci/emul.h
+++ b/tools/tests/vpci/emul.h
@@ -125,6 +125,10 @@  typedef union {
         tx > ty ? tx : ty;              \
 })
 
+uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size);
+void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
+                uint32_t data);
+
 #endif
 
 /*
diff --git a/xen/arch/x86/hvm/dom0_build.c b/xen/arch/x86/hvm/dom0_build.c
index 1ddbd46b39..c022502bb8 100644
--- a/xen/arch/x86/hvm/dom0_build.c
+++ b/xen/arch/x86/hvm/dom0_build.c
@@ -29,6 +29,7 @@ 
 
 #include <asm/bzimage.h>
 #include <asm/dom0_build.h>
+#include <asm/hvm/ioreq.h>
 #include <asm/hvm/support.h>
 #include <asm/io_apic.h>
 #include <asm/p2m.h>
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index fec0073618..228c79643d 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -644,10 +644,13 @@  int hvm_domain_initialise(struct domain *d)
         d->arch.hvm.io_bitmap = hvm_io_bitmap;
 
     register_g2m_portio_handler(d);
-    register_vpci_portio_handler(d);
 
     hvm_ioreq_init(d);
 
+    rc = vpci_register_ioreq(d);
+    if ( rc )
+        goto fail1;
+
     hvm_init_guest_time(d);
 
     d->arch.hvm.params[HVM_PARAM_TRIPLE_FAULT_REASON] = SHUTDOWN_reboot;
diff --git a/xen/arch/x86/hvm/io.c b/xen/arch/x86/hvm/io.c
index 3334888136..4c72e68a5b 100644
--- a/xen/arch/x86/hvm/io.c
+++ b/xen/arch/x86/hvm/io.c
@@ -290,204 +290,6 @@  unsigned int hvm_mmcfg_decode_addr(const struct hvm_mmcfg *mmcfg,
     return addr & (PCI_CFG_SPACE_EXP_SIZE - 1);
 }
 
-
-/* Do some sanity checks. */
-static bool vpci_access_allowed(unsigned int reg, unsigned int len)
-{
-    /* Check access size. */
-    if ( len != 1 && len != 2 && len != 4 && len != 8 )
-        return false;
-
-    /* Check that access is size aligned. */
-    if ( (reg & (len - 1)) )
-        return false;
-
-    return true;
-}
-
-/* vPCI config space IO ports handlers (0xcf8/0xcfc). */
-static bool vpci_portio_accept(const struct hvm_io_handler *handler,
-                               const ioreq_t *p)
-{
-    return (p->addr == 0xcf8 && p->size == 4) || (p->addr & ~3) == 0xcfc;
-}
-
-static int vpci_portio_read(const struct hvm_io_handler *handler,
-                            uint64_t addr, uint32_t size, uint64_t *data)
-{
-    const struct domain *d = current->domain;
-    unsigned int reg;
-    pci_sbdf_t sbdf;
-    uint32_t cf8;
-
-    *data = ~(uint64_t)0;
-
-    if ( addr == 0xcf8 )
-    {
-        ASSERT(size == 4);
-        *data = d->arch.hvm.pci_cf8;
-        return X86EMUL_OKAY;
-    }
-
-    ASSERT((addr & ~3) == 0xcfc);
-    cf8 = ACCESS_ONCE(d->arch.hvm.pci_cf8);
-    if ( !CF8_ENABLED(cf8) )
-        return X86EMUL_UNHANDLEABLE;
-
-    reg = hvm_pci_decode_addr(cf8, addr, &sbdf);
-
-    if ( !vpci_access_allowed(reg, size) )
-        return X86EMUL_OKAY;
-
-    *data = vpci_read(sbdf, reg, size);
-
-    return X86EMUL_OKAY;
-}
-
-static int vpci_portio_write(const struct hvm_io_handler *handler,
-                             uint64_t addr, uint32_t size, uint64_t data)
-{
-    struct domain *d = current->domain;
-    unsigned int reg;
-    pci_sbdf_t sbdf;
-    uint32_t cf8;
-
-    if ( addr == 0xcf8 )
-    {
-        ASSERT(size == 4);
-        d->arch.hvm.pci_cf8 = data;
-        return X86EMUL_OKAY;
-    }
-
-    ASSERT((addr & ~3) == 0xcfc);
-    cf8 = ACCESS_ONCE(d->arch.hvm.pci_cf8);
-    if ( !CF8_ENABLED(cf8) )
-        return X86EMUL_UNHANDLEABLE;
-
-    reg = hvm_pci_decode_addr(cf8, addr, &sbdf);
-
-    if ( !vpci_access_allowed(reg, size) )
-        return X86EMUL_OKAY;
-
-    vpci_write(sbdf, reg, size, data);
-
-    return X86EMUL_OKAY;
-}
-
-static const struct hvm_io_ops vpci_portio_ops = {
-    .accept = vpci_portio_accept,
-    .read = vpci_portio_read,
-    .write = vpci_portio_write,
-};
-
-void register_vpci_portio_handler(struct domain *d)
-{
-    struct hvm_io_handler *handler;
-
-    if ( !has_vpci(d) )
-        return;
-
-    handler = hvm_next_io_handler(d);
-    if ( !handler )
-        return;
-
-    handler->type = IOREQ_TYPE_PIO;
-    handler->ops = &vpci_portio_ops;
-}
-
-/* Handlers to trap PCI MMCFG config accesses. */
-static int vpci_mmcfg_accept(struct vcpu *v, unsigned long addr)
-{
-    struct domain *d = v->domain;
-    bool found;
-
-    read_lock(&d->arch.hvm.mmcfg_lock);
-    found = hvm_is_mmcfg_address(d, addr);
-    read_unlock(&d->arch.hvm.mmcfg_lock);
-
-    return found;
-}
-
-static int vpci_mmcfg_read(struct vcpu *v, unsigned long addr,
-                           unsigned int len, unsigned long *data)
-{
-    struct domain *d = v->domain;
-    const struct hvm_mmcfg *mmcfg;
-    unsigned int reg;
-    pci_sbdf_t sbdf;
-
-    *data = ~0ul;
-
-    read_lock(&d->arch.hvm.mmcfg_lock);
-    mmcfg = hvm_mmcfg_find(d, addr);
-    if ( !mmcfg )
-    {
-        read_unlock(&d->arch.hvm.mmcfg_lock);
-        return X86EMUL_RETRY;
-    }
-
-    reg = hvm_mmcfg_decode_addr(mmcfg, addr, &sbdf);
-    read_unlock(&d->arch.hvm.mmcfg_lock);
-
-    if ( !vpci_access_allowed(reg, len) ||
-         (reg + len) > PCI_CFG_SPACE_EXP_SIZE )
-        return X86EMUL_OKAY;
-
-    /*
-     * According to the PCIe 3.1A specification:
-     *  - Configuration Reads and Writes must usually be DWORD or smaller
-     *    in size.
-     *  - Because Root Complex implementations are not required to support
-     *    accesses to a RCRB that cross DW boundaries [...] software
-     *    should take care not to cause the generation of such accesses
-     *    when accessing a RCRB unless the Root Complex will support the
-     *    access.
-     *  Xen however supports 8byte accesses by splitting them into two
-     *  4byte accesses.
-     */
-    *data = vpci_read(sbdf, reg, min(4u, len));
-    if ( len == 8 )
-        *data |= (uint64_t)vpci_read(sbdf, reg + 4, 4) << 32;
-
-    return X86EMUL_OKAY;
-}
-
-static int vpci_mmcfg_write(struct vcpu *v, unsigned long addr,
-                            unsigned int len, unsigned long data)
-{
-    struct domain *d = v->domain;
-    const struct hvm_mmcfg *mmcfg;
-    unsigned int reg;
-    pci_sbdf_t sbdf;
-
-    read_lock(&d->arch.hvm.mmcfg_lock);
-    mmcfg = hvm_mmcfg_find(d, addr);
-    if ( !mmcfg )
-    {
-        read_unlock(&d->arch.hvm.mmcfg_lock);
-        return X86EMUL_RETRY;
-    }
-
-    reg = hvm_mmcfg_decode_addr(mmcfg, addr, &sbdf);
-    read_unlock(&d->arch.hvm.mmcfg_lock);
-
-    if ( !vpci_access_allowed(reg, len) ||
-         (reg + len) > PCI_CFG_SPACE_EXP_SIZE )
-        return X86EMUL_OKAY;
-
-    vpci_write(sbdf, reg, min(4u, len), data);
-    if ( len == 8 )
-        vpci_write(sbdf, reg + 4, 4, data >> 32);
-
-    return X86EMUL_OKAY;
-}
-
-static const struct hvm_mmio_ops vpci_mmcfg_ops = {
-    .check = vpci_mmcfg_accept,
-    .read = vpci_mmcfg_read,
-    .write = vpci_mmcfg_write,
-};
-
 int hvm_register_mmcfg(struct domain *d, paddr_t addr,
                        unsigned int start_bus, unsigned int end_bus,
                        unsigned int seg)
@@ -525,9 +327,6 @@  int hvm_register_mmcfg(struct domain *d, paddr_t addr,
             return ret;
         }
 
-    if ( list_empty(&d->arch.hvm.mmcfg_regions) && has_vpci(d) )
-        register_mmio_handler(d, &vpci_mmcfg_ops);
-
     list_add(&new->next, &d->arch.hvm.mmcfg_regions);
     write_unlock(&d->arch.hvm.mmcfg_lock);
 
diff --git a/xen/arch/x86/physdev.c b/xen/arch/x86/physdev.c
index f61f66df5f..bf2c64a0a9 100644
--- a/xen/arch/x86/physdev.c
+++ b/xen/arch/x86/physdev.c
@@ -11,6 +11,7 @@ 
 #include <asm/current.h>
 #include <asm/io_apic.h>
 #include <asm/msi.h>
+#include <asm/hvm/ioreq.h>
 #include <asm/hvm/irq.h>
 #include <asm/hypercall.h>
 #include <public/xen.h>
diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
index cbd1bac7fc..5664020c2d 100644
--- a/xen/drivers/vpci/vpci.c
+++ b/xen/drivers/vpci/vpci.c
@@ -20,6 +20,8 @@ 
 #include <xen/sched.h>
 #include <xen/vpci.h>
 
+#include <asm/hvm/ioreq.h>
+
 /* Internal struct to store the emulated PCI registers. */
 struct vpci_register {
     vpci_read_t *read;
@@ -302,7 +304,7 @@  static uint32_t merge_result(uint32_t data, uint32_t new, unsigned int size,
     return (data & ~(mask << (offset * 8))) | ((new & mask) << (offset * 8));
 }
 
-uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size)
+static uint32_t read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size)
 {
     const struct domain *d = current->domain;
     const struct pci_dev *pdev;
@@ -404,8 +406,8 @@  static void vpci_write_helper(const struct pci_dev *pdev,
              r->private);
 }
 
-void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
-                uint32_t data)
+static void write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
+                  uint32_t data)
 {
     const struct domain *d = current->domain;
     const struct pci_dev *pdev;
@@ -478,6 +480,67 @@  void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
     spin_unlock(&pdev->vpci->lock);
 }
 
+#ifdef __XEN__
+static int ioreq_handler(struct vcpu *v, ioreq_t *req, void *data)
+{
+    pci_sbdf_t sbdf;
+
+    if ( req->type == IOREQ_TYPE_INVALIDATE )
+        /*
+         * Ignore invalidate requests, those can be received even without
+         * having any memory ranges registered, see send_invalidate_req.
+         */
+        return X86EMUL_OKAY;
+
+    if ( req->type != IOREQ_TYPE_PCI_CONFIG || req->data_is_ptr )
+    {
+        ASSERT_UNREACHABLE();
+        return X86EMUL_UNHANDLEABLE;
+    }
+
+    sbdf.sbdf = req->addr >> 32;
+
+    if ( req->dir )
+        req->data = read(sbdf, req->addr, req->size);
+    else
+        write(sbdf, req->addr, req->size, req->data);
+
+    return X86EMUL_OKAY;
+}
+
+int vpci_register_ioreq(struct domain *d)
+{
+    ioservid_t id;
+    int rc;
+
+    if ( !has_vpci(d) )
+        return 0;
+
+    rc = hvm_create_ioreq_server(d, HVM_IOREQSRV_BUFIOREQ_OFF, &id, true);
+    if ( rc )
+        return rc;
+
+    rc = hvm_add_ioreq_handler(d, id, ioreq_handler, NULL);
+    if ( rc )
+        return rc;
+
+    if ( is_hardware_domain(d) )
+    {
+        /* Handle all devices in vpci. */
+        rc = hvm_map_io_range_to_ioreq_server(d, id, XEN_DMOP_IO_RANGE_PCI,
+                                              0, ~(uint64_t)0);
+        if ( rc )
+            return rc;
+    }
+
+    rc = hvm_set_ioreq_server_state(d, id, true);
+    if ( rc )
+        return rc;
+
+    return rc;
+}
+#endif
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index 4cf233c779..36f435ed5b 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -23,6 +23,9 @@  typedef int vpci_register_init_t(struct pci_dev *dev);
   static vpci_register_init_t *const x##_entry  \
                __used_section(".data.vpci." p) = x
 
+/* Register vPCI handler with ioreq. */
+int vpci_register_ioreq(struct domain *d);
+
 /* Add vPCI handlers to device. */
 int __must_check vpci_add_handlers(struct pci_dev *dev);
 
@@ -38,11 +41,6 @@  int __must_check vpci_add_register(struct vpci *vpci,
 int __must_check vpci_remove_register(struct vpci *vpci, unsigned int offset,
                                       unsigned int size);
 
-/* Generic read/write handlers for the PCI config space. */
-uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size);
-void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
-                uint32_t data);
-
 /* Passthrough handlers. */
 uint32_t vpci_hw_read16(const struct pci_dev *pdev, unsigned int reg,
                         void *data);
@@ -221,20 +219,12 @@  static inline int vpci_add_handlers(struct pci_dev *pdev)
     return 0;
 }
 
-static inline void vpci_dump_msi(void) { }
-
-static inline uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg,
-                                 unsigned int size)
+static inline int vpci_register_ioreq(struct domain *d)
 {
-    ASSERT_UNREACHABLE();
-    return ~(uint32_t)0;
+    return 0;
 }
 
-static inline void vpci_write(pci_sbdf_t sbdf, unsigned int reg,
-                              unsigned int size, uint32_t data)
-{
-    ASSERT_UNREACHABLE();
-}
+static inline void vpci_dump_msi(void) { }
 
 static inline bool vpci_process_pending(struct vcpu *v)
 {