diff mbox series

[RFC,V1,01/12] hvm/ioreq: Make x86's IOREQ feature common

Message ID 1596478888-23030-2-git-send-email-olekstysh@gmail.com (mailing list archive)
State Superseded
Headers show
Series IOREQ feature (+ virtio-mmio) on Arm | expand

Commit Message

Oleksandr Tyshchenko Aug. 3, 2020, 6:21 p.m. UTC
From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>

As a lot of x86 code can be re-used on Arm later on, this patch
splits IOREQ support into common and arch-specific parts.

This support is going to be used on Arm to be able to run device
emulators outside of the Xen hypervisor.

Please note, this is a split/cleanup of Julien's PoC:
"Add support for Guest IO forwarding to a device emulator"

Signed-off-by: Julien Grall <julien.grall@arm.com>
Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
---
 xen/arch/x86/Kconfig            |    1 +
 xen/arch/x86/hvm/dm.c           |    2 +-
 xen/arch/x86/hvm/emulate.c      |    2 +-
 xen/arch/x86/hvm/hvm.c          |    2 +-
 xen/arch/x86/hvm/io.c           |    2 +-
 xen/arch/x86/hvm/ioreq.c        | 1431 +--------------------------------------
 xen/arch/x86/hvm/stdvga.c       |    2 +-
 xen/arch/x86/hvm/vmx/realmode.c |    1 +
 xen/arch/x86/hvm/vmx/vvmx.c     |    2 +-
 xen/arch/x86/mm.c               |    2 +-
 xen/arch/x86/mm/shadow/common.c |    2 +-
 xen/common/Kconfig              |    3 +
 xen/common/Makefile             |    1 +
 xen/common/hvm/Makefile         |    1 +
 xen/common/hvm/ioreq.c          | 1430 ++++++++++++++++++++++++++++++++++++++
 xen/include/asm-x86/hvm/ioreq.h |   45 +-
 xen/include/asm-x86/hvm/vcpu.h  |    7 -
 xen/include/xen/hvm/ioreq.h     |   89 +++
 18 files changed, 1575 insertions(+), 1450 deletions(-)
 create mode 100644 xen/common/hvm/Makefile
 create mode 100644 xen/common/hvm/ioreq.c
 create mode 100644 xen/include/xen/hvm/ioreq.h

Comments

Paul Durrant Aug. 4, 2020, 7:45 a.m. UTC | #1
> -----Original Message-----
> From: Oleksandr Tyshchenko <olekstysh@gmail.com>
> Sent: 03 August 2020 19:21
> To: xen-devel@lists.xenproject.org
> Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>; Jan Beulich <jbeulich@suse.com>; Andrew
> Cooper <andrew.cooper3@citrix.com>; Wei Liu <wl@xen.org>; Roger Pau Monné <roger.pau@citrix.com>;
> George Dunlap <george.dunlap@citrix.com>; Ian Jackson <ian.jackson@eu.citrix.com>; Julien Grall
> <julien@xen.org>; Stefano Stabellini <sstabellini@kernel.org>; Paul Durrant <paul@xen.org>; Jun
> Nakajima <jun.nakajima@intel.com>; Kevin Tian <kevin.tian@intel.com>; Tim Deegan <tim@xen.org>; Julien
> Grall <julien.grall@arm.com>
> Subject: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common
> 
> From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
> 
> As a lot of x86 code can be re-used on Arm later on, this patch
> splits IOREQ support into common and arch specific parts.
> 
> This support is going to be used on Arm to be able run device
> emulator outside of Xen hypervisor.
> 
> Please note, this is a split/cleanup of Julien's PoC:
> "Add support for Guest IO forwarding to a device emulator"
> 
> Signed-off-by: Julien Grall <julien.grall@arm.com>
> Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
> ---
>  xen/arch/x86/Kconfig            |    1 +
>  xen/arch/x86/hvm/dm.c           |    2 +-
>  xen/arch/x86/hvm/emulate.c      |    2 +-
>  xen/arch/x86/hvm/hvm.c          |    2 +-
>  xen/arch/x86/hvm/io.c           |    2 +-
>  xen/arch/x86/hvm/ioreq.c        | 1431 +--------------------------------------
>  xen/arch/x86/hvm/stdvga.c       |    2 +-
>  xen/arch/x86/hvm/vmx/realmode.c |    1 +
>  xen/arch/x86/hvm/vmx/vvmx.c     |    2 +-
>  xen/arch/x86/mm.c               |    2 +-
>  xen/arch/x86/mm/shadow/common.c |    2 +-
>  xen/common/Kconfig              |    3 +
>  xen/common/Makefile             |    1 +
>  xen/common/hvm/Makefile         |    1 +
>  xen/common/hvm/ioreq.c          | 1430 ++++++++++++++++++++++++++++++++++++++
>  xen/include/asm-x86/hvm/ioreq.h |   45 +-
>  xen/include/asm-x86/hvm/vcpu.h  |    7 -
>  xen/include/xen/hvm/ioreq.h     |   89 +++
>  18 files changed, 1575 insertions(+), 1450 deletions(-)
>  create mode 100644 xen/common/hvm/Makefile
>  create mode 100644 xen/common/hvm/ioreq.c
>  create mode 100644 xen/include/xen/hvm/ioreq.h

You need to adjust the MAINTAINERS file since there will now be common 'I/O EMULATION' code. Since I wrote most of ioreq.c, please retain me as a maintainer of the common code.
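For illustration, the adjustment could look like the sketch below; the section title and file list here are assumptions, not the entry the series eventually adopted:

    I/O EMULATION (IOREQ)
    M:  Paul Durrant <paul@xen.org>
    S:  Supported
    F:  xen/common/hvm/ioreq.c
    F:  xen/include/xen/hvm/ioreq.h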

[snip]
> @@ -1528,13 +143,19 @@ static int hvm_access_cf8(
>      return X86EMUL_UNHANDLEABLE;
>  }
> 
> -void hvm_ioreq_init(struct domain *d)
> +void arch_hvm_ioreq_init(struct domain *d)
>  {
>      spin_lock_init(&d->arch.hvm.ioreq_server.lock);
> 
>      register_portio_handler(d, 0xcf8, 4, hvm_access_cf8);
>  }
> 
> +void arch_hvm_ioreq_destroy(struct domain *d)
> +{
> +    if ( !relocate_portio_handler(d, 0xcf8, 0xcf8, 4) )
> +        return;

There's not really a lot of point in this. I think an empty function here would be ok.

> +}
> +
>  /*
>   * Local variables:
>   * mode: C
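For reference, the empty stub Paul suggests would simply be:

    void arch_hvm_ioreq_destroy(struct domain *d)
    {
    }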

[snip]
> +struct hvm_ioreq_server *hvm_select_ioreq_server(struct domain *d,
> +                                                 ioreq_t *p)
> +{
> +    struct hvm_ioreq_server *s;
> +    uint8_t type;
> +    uint64_t addr;
> +    unsigned int id;
> +
> +    if ( p->type != IOREQ_TYPE_COPY && p->type != IOREQ_TYPE_PIO )
> +        return NULL;
> +
> +    hvm_get_ioreq_server_range_type(d, p, &type, &addr);

Looking at this, I think it would make more sense to fold the check of p->type into hvm_get_ioreq_server_range_type() and have it return success/failure.

> +
> +    FOR_EACH_IOREQ_SERVER(d, id, s)
> +    {
> +        struct rangeset *r;
> +
> +        if ( !s->enabled )
> +            continue;
> +
> +        r = s->range[type];
> +
> +        switch ( type )
> +        {
> +            unsigned long start, end;
> +
> +        case XEN_DMOP_IO_RANGE_PORT:
> +            start = addr;
> +            end = start + p->size - 1;
> +            if ( rangeset_contains_range(r, start, end) )
> +                return s;
> +
> +            break;
> +
> +        case XEN_DMOP_IO_RANGE_MEMORY:
> +            start = hvm_mmio_first_byte(p);
> +            end = hvm_mmio_last_byte(p);
> +
> +            if ( rangeset_contains_range(r, start, end) )
> +                return s;
> +
> +            break;
> +
> +        case XEN_DMOP_IO_RANGE_PCI:
> +            if ( rangeset_contains_singleton(r, addr >> 32) )
> +            {
> +                p->type = IOREQ_TYPE_PCI_CONFIG;
> +                p->addr = addr;
> +                return s;
> +            }
> +
> +            break;
> +        }
> +    }
> +
> +    return NULL;
> +}
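A minimal sketch of the fold Paul suggests, assuming hvm_get_ioreq_server_range_type() is reworked to report success/failure (the 0xcfc/CF8 PCI config decode done by the existing x86 code is elided here):

    static bool hvm_get_ioreq_server_range_type(struct domain *d,
                                                ioreq_t *p,
                                                uint8_t *type,
                                                uint64_t *addr)
    {
        if ( p->type != IOREQ_TYPE_COPY && p->type != IOREQ_TYPE_PIO )
            return false;

        /*
         * PIO accesses hitting the 0xcfc data window would be decoded
         * to XEN_DMOP_IO_RANGE_PCI here, as in the current x86 code.
         */
        *type = (p->type == IOREQ_TYPE_PIO) ?
                 XEN_DMOP_IO_RANGE_PORT : XEN_DMOP_IO_RANGE_MEMORY;
        *addr = p->addr;

        return true;
    }

hvm_select_ioreq_server() would then open with:

    if ( !hvm_get_ioreq_server_range_type(d, p, &type, &addr) )
        return NULL;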

[snip]
> diff --git a/xen/include/xen/hvm/ioreq.h b/xen/include/xen/hvm/ioreq.h
> new file mode 100644
> index 0000000..40b7b5e
> --- /dev/null
> +++ b/xen/include/xen/hvm/ioreq.h
> @@ -0,0 +1,89 @@
> +/*
> + * hvm.h: Hardware virtual machine assist interface definitions.
> + *
> + * Copyright (c) 2016 Citrix Systems Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along with
> + * this program; If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef __HVM_IOREQ_H__
> +#define __HVM_IOREQ_H__
> +
> +#include <xen/sched.h>
> +
> +#include <asm/hvm/ioreq.h>
> +
> +#define GET_IOREQ_SERVER(d, id) \
> +    (d)->arch.hvm.ioreq_server.server[id]
> +
> +static inline struct hvm_ioreq_server *get_ioreq_server(const struct domain *d,
> +                                                        unsigned int id)
> +{
> +    if ( id >= MAX_NR_IOREQ_SERVERS )
> +        return NULL;
> +
> +    return GET_IOREQ_SERVER(d, id);
> +}
> +
> +static inline bool hvm_ioreq_needs_completion(const ioreq_t *ioreq)
> +{
> +    return ioreq->state == STATE_IOREQ_READY &&
> +           !ioreq->data_is_ptr &&
> +           (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir != IOREQ_WRITE);
> +}

I don't think having this in common code is correct. The short-cut of not completing PIO reads seems somewhat x86 specific. Does ARM even have the concept of PIO?

  Paul

> +
> +bool hvm_io_pending(struct vcpu *v);
> +bool handle_hvm_io_completion(struct vcpu *v);
> +bool is_ioreq_server_page(struct domain *d, const struct page_info *page);
> +
> +int hvm_create_ioreq_server(struct domain *d, int bufioreq_handling,
> +                            ioservid_t *id);
> +int hvm_destroy_ioreq_server(struct domain *d, ioservid_t id);
> +int hvm_get_ioreq_server_info(struct domain *d, ioservid_t id,
> +                              unsigned long *ioreq_gfn,
> +                              unsigned long *bufioreq_gfn,
> +                              evtchn_port_t *bufioreq_port);
> +int hvm_get_ioreq_server_frame(struct domain *d, ioservid_t id,
> +                               unsigned long idx, mfn_t *mfn);
> +int hvm_map_io_range_to_ioreq_server(struct domain *d, ioservid_t id,
> +                                     uint32_t type, uint64_t start,
> +                                     uint64_t end);
> +int hvm_unmap_io_range_from_ioreq_server(struct domain *d, ioservid_t id,
> +                                         uint32_t type, uint64_t start,
> +                                         uint64_t end);
> +int hvm_set_ioreq_server_state(struct domain *d, ioservid_t id,
> +                               bool enabled);
> +
> +int hvm_all_ioreq_servers_add_vcpu(struct domain *d, struct vcpu *v);
> +void hvm_all_ioreq_servers_remove_vcpu(struct domain *d, struct vcpu *v);
> +void hvm_destroy_all_ioreq_servers(struct domain *d);
> +
> +struct hvm_ioreq_server *hvm_select_ioreq_server(struct domain *d,
> +                                                 ioreq_t *p);
> +int hvm_send_ioreq(struct hvm_ioreq_server *s, ioreq_t *proto_p,
> +                   bool buffered);
> +unsigned int hvm_broadcast_ioreq(ioreq_t *p, bool buffered);
> +
> +void hvm_ioreq_init(struct domain *d);
> +
> +#endif /* __HVM_IOREQ_H__ */
> +
> +/*
> + * Local variables:
> + * mode: C
> + * c-file-style: "BSD"
> + * c-basic-offset: 4
> + * tab-width: 4
> + * indent-tabs-mode: nil
> + * End:
> + */
> --
> 2.7.4
Oleksandr Tyshchenko Aug. 4, 2020, 11:10 a.m. UTC | #2
On 04.08.20 10:45, Paul Durrant wrote:

Hi Paul

>> Subject: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common
>>
>> [snip]
> You need to adjust the MAINTAINERS file since there will now be common 'I/O EMULATION' code. Since I wrote most of ioreq.c, please retain me as a maintainer of the common code.

Oh, I completely forgot about the MAINTAINERS file. Sure, I will update 
the file and retain you.


>
> [snip]
>> @@ -1528,13 +143,19 @@ static int hvm_access_cf8(
>>       return X86EMUL_UNHANDLEABLE;
>>   }
>>
>> -void hvm_ioreq_init(struct domain *d)
>> +void arch_hvm_ioreq_init(struct domain *d)
>>   {
>>       spin_lock_init(&d->arch.hvm.ioreq_server.lock);
>>
>>       register_portio_handler(d, 0xcf8, 4, hvm_access_cf8);
>>   }
>>
>> +void arch_hvm_ioreq_destroy(struct domain *d)
>> +{
>> +    if ( !relocate_portio_handler(d, 0xcf8, 0xcf8, 4) )
>> +        return;
> There's not really a lot of point in this. I think an empty function here would be ok.

ok


>> +struct hvm_ioreq_server *hvm_select_ioreq_server(struct domain *d,
>> +                                                 ioreq_t *p)
>> +{
>> +    struct hvm_ioreq_server *s;
>> +    uint8_t type;
>> +    uint64_t addr;
>> +    unsigned int id;
>> +
>> +    if ( p->type != IOREQ_TYPE_COPY && p->type != IOREQ_TYPE_PIO )
>> +        return NULL;
>> +
>> +    hvm_get_ioreq_server_range_type(d, p, &type, &addr);
> Looking at this, I think it would make more sense to fold the check of p->type into hvm_get_ioreq_server_range_type() and have it return success/failure.

ok, will update.


> diff --git a/xen/include/xen/hvm/ioreq.h b/xen/include/xen/hvm/ioreq.h
>> [snip]
>> +static inline bool hvm_ioreq_needs_completion(const ioreq_t *ioreq)
>> +{
>> +    return ioreq->state == STATE_IOREQ_READY &&
>> +           !ioreq->data_is_ptr &&
>> +           (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir != IOREQ_WRITE);
>> +}
> I don't think having this in common code is correct. The short-cut of not completing PIO reads seems somewhat x86 specific. Does ARM even have the concept of PIO?

I am not 100% sure here, but it seems that it doesn't.

Shall I make hvm_ioreq_needs_completion() per arch? The Arm variant would 
have the same implementation, but without the "ioreq->type != 
IOREQ_TYPE_PIO" check...
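For concreteness, the Arm variant Oleksandr describes would be (a sketch only, not necessarily what a later revision adopts):

    static inline bool hvm_ioreq_needs_completion(const ioreq_t *ioreq)
    {
        return ioreq->state == STATE_IOREQ_READY &&
               !ioreq->data_is_ptr;
    }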
Paul Durrant Aug. 4, 2020, 11:23 a.m. UTC | #3
> -----Original Message-----
> From: Oleksandr <olekstysh@gmail.com>
> Sent: 04 August 2020 12:10
> To: paul@xen.org; xen-devel@lists.xenproject.org
> Cc: 'Oleksandr Tyshchenko' <oleksandr_tyshchenko@epam.com>; 'Jan Beulich' <jbeulich@suse.com>; 'Andrew
> Cooper' <andrew.cooper3@citrix.com>; 'Wei Liu' <wl@xen.org>; 'Roger Pau Monné' <roger.pau@citrix.com>;
> 'George Dunlap' <george.dunlap@citrix.com>; 'Ian Jackson' <ian.jackson@eu.citrix.com>; 'Julien Grall'
> <julien@xen.org>; 'Stefano Stabellini' <sstabellini@kernel.org>; 'Jun Nakajima'
> <jun.nakajima@intel.com>; 'Kevin Tian' <kevin.tian@intel.com>; 'Tim Deegan' <tim@xen.org>; 'Julien
> Grall' <julien.grall@arm.com>
> Subject: Re: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common
> 
> 
> On 04.08.20 10:45, Paul Durrant wrote:
> 
> Hi Paul
> 
> >> Subject: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common
> >>
> >> [snip]
> > You need to adjust the MAINTAINERS file since there will now be common 'I/O EMULATION' code. Since I
> > wrote most of ioreq.c, please retain me as a maintainer of the common code.
> 
> Oh, I completely forgot about MAINTAINERS file. Sure, I will update file
> and retain you.
> 
> 
> [snip]
> >> +static inline bool hvm_ioreq_needs_completion(const ioreq_t *ioreq)
> >> +{
> >> +    return ioreq->state == STATE_IOREQ_READY &&
> >> +           !ioreq->data_is_ptr &&
> >> +           (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir != IOREQ_WRITE);
> >> +}
> > I don't think having this in common code is correct. The short-cut of not completing PIO reads seems
> > somewhat x86 specific. Does ARM even have the concept of PIO?
> 
> I am not 100% sure here, but it seems that doesn't have.
> 
> Shall I make hvm_ioreq_needs_completion() per arch? Arm variant would
> have the same implementation, but without "ioreq->type !=
> IOREQ_TYPE_PIO" check...
> 

With your series applied, does any common code actually call hvm_ioreq_needs_completion()? I suspect it will remain x86 specific, without any need for an Arm variant.

  Paul

> --
> Regards,
> 
> Oleksandr Tyshchenko
Oleksandr Tyshchenko Aug. 4, 2020, 11:51 a.m. UTC | #4
On 04.08.20 14:23, Paul Durrant wrote:
>>
>>> diff --git a/xen/include/xen/hvm/ioreq.h b/xen/include/xen/hvm/ioreq.h
>>>> [snip]
>>>> +static inline bool hvm_ioreq_needs_completion(const ioreq_t *ioreq)
>>>> +{
>>>> +    return ioreq->state == STATE_IOREQ_READY &&
>>>> +           !ioreq->data_is_ptr &&
>>>> +           (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir != IOREQ_WRITE);
>>>> +}
>>> I don't think having this in common code is correct. The short-cut of not
>>> completing PIO reads seems somewhat x86 specific. Does ARM even have the concept of PIO?
>>
>> I am not 100% sure here, but it seems that doesn't have.
>>
>> Shall I make hvm_ioreq_needs_completion() per arch? Arm variant would
>> have the same implementation, but without "ioreq->type !=
>> IOREQ_TYPE_PIO" check...
>>
> With your series applied, does any common code actually call hvm_ioreq_needs_completion()? I suspect it will remain x86 specific, without any need for an Arm variant.
Yes, it does. Please see common usage in hvm_io_assist() and 
handle_hvm_io_completion() (current patch) and usage in Arm code 
(arch/arm/io.c: io_state try_fwd_ioserv) [1]


[1] 
https://lists.xenproject.org/archives/html/xen-devel/2020-08/msg00072.html
Paul Durrant Aug. 4, 2020, 1:18 p.m. UTC | #5
> -----Original Message-----
> From: Oleksandr <olekstysh@gmail.com>
> Sent: 04 August 2020 12:51
> To: paul@xen.org; xen-devel@lists.xenproject.org
> Cc: 'Oleksandr Tyshchenko' <oleksandr_tyshchenko@epam.com>; 'Jan Beulich' <jbeulich@suse.com>; 'Andrew
> Cooper' <andrew.cooper3@citrix.com>; 'Wei Liu' <wl@xen.org>; 'Roger Pau Monné' <roger.pau@citrix.com>;
> 'George Dunlap' <george.dunlap@citrix.com>; 'Ian Jackson' <ian.jackson@eu.citrix.com>; 'Julien Grall'
> <julien@xen.org>; 'Stefano Stabellini' <sstabellini@kernel.org>; 'Jun Nakajima'
> <jun.nakajima@intel.com>; 'Kevin Tian' <kevin.tian@intel.com>; 'Tim Deegan' <tim@xen.org>; 'Julien
> Grall' <julien.grall@arm.com>
> Subject: Re: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common
> 
> 
> On 04.08.20 14:23, Paul Durrant wrote:
> >>
> >>> diff --git a/xen/include/xen/hvm/ioreq.h b/xen/include/xen/hvm/ioreq.h
> >>>> [snip]
> >>>> +static inline bool hvm_ioreq_needs_completion(const ioreq_t *ioreq)
> >>>> +{
> >>>> +    return ioreq->state == STATE_IOREQ_READY &&
> >>>> +           !ioreq->data_is_ptr &&
> >>>> +           (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir != IOREQ_WRITE);
> >>>> +}
> >>> I don't think having this in common code is correct. The short-cut of not completing PIO reads seems
> >>> somewhat x86 specific. Does ARM even have the concept of PIO?
> >>
> >> I am not 100% sure here, but it seems that doesn't have.
> >>
> >> Shall I make hvm_ioreq_needs_completion() per arch? Arm variant would
> >> have the same implementation, but without "ioreq->type !=
> >> IOREQ_TYPE_PIO" check...
> >>
> > With your series applied, does any common code actually call hvm_ioreq_needs_completion()? I suspect
> it will remain x86 specific, without any need for an Arm variant.
> Yes, it does. Please see common usage in hvm_io_assist() and
> handle_hvm_io_completion() (current patch) and usage in Arm code
> (arch/arm/io.c: io_state try_fwd_ioserv) [1]
> 
> 
> [1]
> https://lists.xenproject.org/archives/html/xen-devel/2020-08/msg00072.html
> 

Yes, but that code is clearly not finished since, after setting io_completion it says:

/* XXX: Decide what to do */
if ( rc == IO_RETRY )
    rc = IO_HANDLED;

So, it's not clear what the eventual implementation will be and whether it will need to make that call.

  Paul

> 
> --
> Regards,
> 
> Oleksandr Tyshchenko
Julien Grall Aug. 4, 2020, 1:52 p.m. UTC | #6
Hi,

On 04/08/2020 12:10, Oleksandr wrote:
> On 04.08.20 10:45, Paul Durrant wrote:
>>> +static inline bool hvm_ioreq_needs_completion(const ioreq_t *ioreq)
>>> +{
>>> +    return ioreq->state == STATE_IOREQ_READY &&
>>> +           !ioreq->data_is_ptr &&
>>> +           (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir != 
>>> IOREQ_WRITE);
>>> +}
>> I don't think having this in common code is correct. The short-cut of 
>> not completing PIO reads seems somewhat x86 specific. 

Hmmm, looking at the code, I think it doesn't wait for PIO writes to 
complete (not reads). Did I miss anything?

> Does ARM even 
>> have the concept of PIO?
> 
> I am not 100% sure here, but it seems that doesn't have.

Technically, the PIOs exist on Arm, however they are accessed the same 
way as MMIO and will have a dedicated area defined by the HW.

AFAICT, on Arm64, they are only used for PCI IO Bar.

Now the question is whether we want to expose them to the Device 
Emulator as PIO or MMIO access. From a generic PoV, a DM shouldn't have 
to care about the architecture used. It should just be able to request a 
given IOport region.

So it may make sense to differentiate them in the common ioreq code as well.
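For illustration: a device emulator requests an I/O port range through the same libxendevicemodel call it uses for MMIO, with only the is_mmio flag differing, so the common code could keep the distinction without the DM having to care about the architecture. The wrapper below is hypothetical:

    #include <xendevicemodel.h>

    /* Claim legacy UART ports 0x3f8-0x3ff for an ioreq server. */
    static int claim_uart_ports(xendevicemodel_handle *xdm, domid_t domid,
                                ioservid_t id)
    {
        /* is_mmio == 0 selects the port I/O range set. */
        return xendevicemodel_map_io_range_to_ioreq_server(xdm, domid, id,
                                                           0, 0x3f8, 0x3ff);
    }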

I had a quick look at QEMU and wasn't able to tell whether the PIO and 
MMIO address spaces are different on Arm as well. Paul, Stefano, do you 
know what they are doing?

Cheers,
Jan Beulich Aug. 4, 2020, 3:41 p.m. UTC | #7
On 04.08.2020 15:52, Julien Grall wrote:
> On 04/08/2020 12:10, Oleksandr wrote:
>> On 04.08.20 10:45, Paul Durrant wrote:
>>>> +static inline bool hvm_ioreq_needs_completion(const ioreq_t *ioreq)
>>>> +{
>>>> +    return ioreq->state == STATE_IOREQ_READY &&
>>>> +           !ioreq->data_is_ptr &&
>>>> +           (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir != 
>>>> IOREQ_WRITE);
>>>> +}
>>> I don't think having this in common code is correct. The short-cut of 
>>> not completing PIO reads seems somewhat x86 specific. 
> 
> Hmmm, looking at the code, I think it doesn't wait for PIO writes to 
> complete (not read). Did I miss anything?

The point of the check isn't to determine whether to wait, but
what to do after having waited. Reads need a retry round through
the emulator (to store the result in the designated place),
while writes don't have such a requirement (and hence guest
execution can continue immediately in the general case).

Jan
Stefano Stabellini Aug. 4, 2020, 7:11 p.m. UTC | #8
On Tue, 4 Aug 2020, Julien Grall wrote:
> On 04/08/2020 12:10, Oleksandr wrote:
> > On 04.08.20 10:45, Paul Durrant wrote:
> > > > +static inline bool hvm_ioreq_needs_completion(const ioreq_t *ioreq)
> > > > +{
> > > > +    return ioreq->state == STATE_IOREQ_READY &&
> > > > +           !ioreq->data_is_ptr &&
> > > > +           (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir !=
> > > > IOREQ_WRITE);
> > > > +}
> > > I don't think having this in common code is correct. The short-cut of not
> > > completing PIO reads seems somewhat x86 specific. 
> 
> Hmmm, looking at the code, I think it doesn't wait for PIO writes to complete
> (not read). Did I miss anything?
> 
> > Does ARM even 
> > > have the concept of PIO?
> > 
> > I am not 100% sure here, but it seems that doesn't have.
> 
> Technically, the PIOs exist on Arm, however they are accessed the same way as
> MMIO and will have a dedicated area defined by the HW.
> 
> AFAICT, on Arm64, they are only used for PCI IO Bar.
> 
> Now the question is whether we want to expose them to the Device Emulator as
> PIO or MMIO access. From a generic PoV, a DM shouldn't have to care about the
> architecture used. It should just be able to request a given IOport region.
> 
> So it may make sense to differentiate them in the common ioreq code as well.
> 
> I had a quick look at QEMU and wasn't able to tell if PIOs and MMIOs address
> space are different on Arm as well. Paul, Stefano, do you know what they are
> doing?

On the QEMU side, it looks like PIO (address_space_io) is used in
connection with the emulation of the "in" or "out" instructions, see
ioport.c:cpu_inb for instance. Some parts of PCI on QEMU emulate PIO
space regardless of the architecture, such as
hw/pci/pci_bridge.c:pci_bridge_initfn.

However, because there is no "in" and "out" on ARM, I don't think
address_space_io can be accessed. Specifically, there is no equivalent
for target/i386/misc_helper.c:helper_inb on ARM.

So I think PIO is unused on ARM in QEMU.


FYI the ioreq type for PCI config space reads and writes is
IOREQ_TYPE_PCI_CONFIG (neither MMIO nor PIO), which QEMU implements
directly via pci_host_config_read_common/pci_host_config_write_common.


It looks like PIO-specific things could be kept x86-specific, without
loss of functionality on the ARM side.


> The point of the check isn't to determine whether to wait, but
> what to do after having waited. Reads need a retry round through
> the emulator (to store the result in the designated place),
> while writes don't have such a requirement (and hence guest
> execution can continue immediately in the general case).

The x86 code looks like this:

            rc = hvm_send_ioreq(s, &p, 0);
            if ( rc != X86EMUL_RETRY || currd->is_shutting_down )
                vio->io_req.state = STATE_IOREQ_NONE;
            else if ( !hvm_ioreq_needs_completion(&vio->io_req) )
                rc = X86EMUL_OKAY;

Basically hvm_send_ioreq is expected to return RETRY.
Then, if it is a PIO write operation only, it is turned into OKAY right
away. Otherwise, rc stays as RETRY.

So, normally, hvmemul_do_io is expected to return RETRY, because the
emulator is not done yet. Am I understanding the code correctly?

If so, who is handling RETRY on x86? I tried to follow the call chain
but ended up in the x86 emulator and got lost :-)


At some point later, after the emulator (QEMU) has completed the
request, handle_hvm_io_completion gets called, which ends up calling
handle_mmio() to finish the job on the Xen side too.


In other words:
RETRY ==> emulation in progress
OKAY  ==> emulation completed


Is that correct?
Jan Beulich Aug. 5, 2020, 7:01 a.m. UTC | #9
On 04.08.2020 21:11, Stefano Stabellini wrote:
>> The point of the check isn't to determine whether to wait, but
>> what to do after having waited. Reads need a retry round through
>> the emulator (to store the result in the designated place),
>> while writes don't have such a requirement (and hence guest
>> execution can continue immediately in the general case).
> 
> The x86 code looks like this:
> 
>             rc = hvm_send_ioreq(s, &p, 0);
>             if ( rc != X86EMUL_RETRY || currd->is_shutting_down )
>                 vio->io_req.state = STATE_IOREQ_NONE;
>             else if ( !hvm_ioreq_needs_completion(&vio->io_req) )
>                 rc = X86EMUL_OKAY;
> 
> Basically hvm_send_ioreq is expected to return RETRY.
> Then, if it is a PIO write operation only, it is turned into OKAY right
> away. Otherwise, rc stays as RETRY.
> 
> So, normally, hvmemul_do_io is expected to return RETRY, because the
> emulator is not done yet. Am I understanding the code correctly?

"The emulator" unfortunately is ambiguous here: Do you mean qemu
(or whichever else ioreq server) or the x86 emulator inside Xen?
There are various conditions leading to RETRY. As far as
hvm_send_ioreq() goes, it is expected to return RETRY whenever
some sort of response is to be expected (the most notable
exception being the hvm_send_buffered_ioreq() path), or when
submitting the request isn't possible in the first place.

> If so, who is handling RETRY on x86? It tried to follow the call chain
> but ended up in the x86 emulator and got lost :-)

Not sure I understand the question correctly, but I'll try an
answer nevertheless: hvm_send_ioreq() arranges for the vCPU to be
put to sleep (prepare_wait_on_xen_event_channel()). Once the event
channel got signaled (and vCPU unblocked), hvm_do_resume() ->
handle_hvm_io_completion() -> hvm_wait_for_io() then check whether
the wait reason has been satisfied (wait_on_xen_event_channel()),
and ...

> At some point later, after the emulator (QEMU) has completed the
> request, handle_hvm_io_completion gets called which ends up calling
> handle_mmio() finishing the job on the Xen side too.

..., as you say, handle_hvm_io_completion() invokes the retry of
the original operation (handle_mmio() or handle_pio() in
particular) if need be.

What's potentially confusing is that there's a second form of
retry, invoked by the x86 insn emulator itself when it needs to
split complex insns (the repeated string insns being the most
important example). This results in actually exiting back to guest
context without having advanced rIP, but after having updated
other register state suitably (to express the progress made so
far).

Jan
Julien Grall Aug. 5, 2020, 8:33 a.m. UTC | #10
Hi,

On 04/08/2020 20:11, Stefano Stabellini wrote:
> On Tue, 4 Aug 2020, Julien Grall wrote:
>> On 04/08/2020 12:10, Oleksandr wrote:
>>> On 04.08.20 10:45, Paul Durrant wrote:
>>>>> +static inline bool hvm_ioreq_needs_completion(const ioreq_t *ioreq)
>>>>> +{
>>>>> +    return ioreq->state == STATE_IOREQ_READY &&
>>>>> +           !ioreq->data_is_ptr &&
>>>>> +           (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir !=
>>>>> IOREQ_WRITE);
>>>>> +}
>>>> I don't think having this in common code is correct. The short-cut of not
>>>> completing PIO reads seems somewhat x86 specific.
>>
>> Hmmm, looking at the code, I think it doesn't wait for PIO writes to complete
>> (not read). Did I miss anything?
>>
>>> Does ARM even
>>>> have the concept of PIO?
>>>
>>> I am not 100% sure here, but it seems that doesn't have.
>>
>> Technically, the PIOs exist on Arm, however they are accessed the same way as
>> MMIO and will have a dedicated area defined by the HW.
>>
>> AFAICT, on Arm64, they are only used for PCI IO Bar.
>>
>> Now the question is whether we want to expose them to the Device Emulator as
>> PIO or MMIO access. From a generic PoV, a DM shouldn't have to care about the
>> architecture used. It should just be able to request a given IOport region.
>>
>> So it may make sense to differentiate them in the common ioreq code as well.
>>
>> I had a quick look at QEMU and wasn't able to tell if PIOs and MMIOs address
>> space are different on Arm as well. Paul, Stefano, do you know what they are
>> doing?
> 
> On the QEMU side, it looks like PIO (address_space_io) is used in
> connection with the emulation of the "in" or "out" instructions, see
> ioport.c:cpu_inb for instance. Some parts of PCI on QEMU emulate PIO
> space regardless of the architecture, such as
> hw/pci/pci_bridge.c:pci_bridge_initfn.
> 
> However, because there is no "in" and "out" on ARM, I don't think
> address_space_io can be accessed. Specifically, there is no equivalent
> for target/i386/misc_helper.c:helper_inb on ARM.

So how are PCI I/O BARs accessed? Surely, they could be used on Arm, right?

Cheers,
Julien Grall Aug. 5, 2020, 1:30 p.m. UTC | #11
Hi,

On 03/08/2020 19:21, Oleksandr Tyshchenko wrote:
> From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
> 
> As a lot of x86 code can be re-used on Arm later on, this patch
> splits IOREQ support into common and arch specific parts.
> 
> This support is going to be used on Arm to be able run device
> emulator outside of Xen hypervisor.
> 
> Please note, this is a split/cleanup of Julien's PoC:
> "Add support for Guest IO forwarding to a device emulator"
> 
> Signed-off-by: Julien Grall <julien.grall@arm.com>
> Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
> ---
>   xen/arch/x86/Kconfig            |    1 +
>   xen/arch/x86/hvm/dm.c           |    2 +-
>   xen/arch/x86/hvm/emulate.c      |    2 +-
>   xen/arch/x86/hvm/hvm.c          |    2 +-
>   xen/arch/x86/hvm/io.c           |    2 +-
>   xen/arch/x86/hvm/ioreq.c        | 1431 +--------------------------------------
>   xen/arch/x86/hvm/stdvga.c       |    2 +-
>   xen/arch/x86/hvm/vmx/realmode.c |    1 +
>   xen/arch/x86/hvm/vmx/vvmx.c     |    2 +-
>   xen/arch/x86/mm.c               |    2 +-
>   xen/arch/x86/mm/shadow/common.c |    2 +-
>   xen/common/Kconfig              |    3 +
>   xen/common/Makefile             |    1 +
>   xen/common/hvm/Makefile         |    1 +
>   xen/common/hvm/ioreq.c          | 1430 ++++++++++++++++++++++++++++++++++++++
>   xen/include/asm-x86/hvm/ioreq.h |   45 +-
>   xen/include/asm-x86/hvm/vcpu.h  |    7 -
>   xen/include/xen/hvm/ioreq.h     |   89 +++
>   18 files changed, 1575 insertions(+), 1450 deletions(-)

That's quite a lot of code moved in a single patch. How can we check 
that the moved code is still correct? Is it a verbatim copy?

>   create mode 100644 xen/common/hvm/Makefile
>   create mode 100644 xen/common/hvm/ioreq.c
>   create mode 100644 xen/include/xen/hvm/ioreq.h

[...]

> +static bool hvm_wait_for_io(struct hvm_ioreq_vcpu *sv, ioreq_t *p)
> +{
> +    unsigned int prev_state = STATE_IOREQ_NONE;
> +
> +    while ( sv->pending )
> +    {
> +        unsigned int state = p->state;
> +
> +        smp_rmb();
> +
> +    recheck:
> +        if ( unlikely(state == STATE_IOREQ_NONE) )
> +        {
> +            /*
> +             * The only reason we should see this case is when an
> +             * emulator is dying and it races with an I/O being
> +             * requested.
> +             */
> +            hvm_io_assist(sv, ~0ul);
> +            break;
> +        }
> +
> +        if ( unlikely(state < prev_state) )
> +        {
> +            gdprintk(XENLOG_ERR, "Weird HVM ioreq state transition %u -> %u\n",
> +                     prev_state, state);
> +            sv->pending = false;
> +            domain_crash(sv->vcpu->domain);
> +            return false; /* bail */
> +        }
> +
> +        switch ( prev_state = state )
> +        {
> +        case STATE_IORESP_READY: /* IORESP_READY -> NONE */
> +            p->state = STATE_IOREQ_NONE;
> +            hvm_io_assist(sv, p->data);
> +            break;
> +        case STATE_IOREQ_READY:  /* IOREQ_{READY,INPROCESS} -> IORESP_READY */
> +        case STATE_IOREQ_INPROCESS:
> +            wait_on_xen_event_channel(sv->ioreq_evtchn,
> +                                      ({ state = p->state;
> +                                         smp_rmb();
> +                                         state != prev_state; }));
> +            goto recheck;

I recall some discussion on security@ about this specific code. An IOREQ 
server can be destroyed at any time. When destroying an IOREQ server, all 
the vCPUs will be paused to avoid races.

On x86, this was considered to be safe because
wait_on_xen_event_channel() will never return if the vCPU is re-scheduled.

However, on Arm, this function will return even after rescheduling. In 
this case, sv and p may point to invalid memory.

IIRC, the suggestion was to harden hvm_wait_for_io(). I guess we could 
fetch the sv and p after wait_on_xen_event_channel.

Any opinions?

Cheers,
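
One possible shape of the hardening suggested above, as a sketch: re-derive the server/vCPU state on every loop iteration instead of holding sv and p across the wait. get_pending_vcpu() is an assumed helper and the backwards state transition check is omitted for brevity; this is illustrative, not the fix the series adopted:

    static bool hvm_wait_for_io(struct vcpu *v)
    {
        for ( ; ; )
        {
            struct hvm_ioreq_server *s;
            struct hvm_ioreq_vcpu *sv = get_pending_vcpu(v, &s);
            ioreq_t *p;

            if ( !sv )
                return true; /* nothing pending; the server may be gone */

            p = get_ioreq(s, v);

            if ( p->state == STATE_IORESP_READY )
            {
                p->state = STATE_IOREQ_NONE;
                hvm_io_assist(sv, p->data);
                return true;
            }

            wait_on_xen_event_channel(sv->ioreq_evtchn,
                                      p->state != STATE_IOREQ_READY &&
                                      p->state != STATE_IOREQ_INPROCESS);
            /* sv and p may be stale after sleeping; loop and re-derive. */
        }
    }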
Andrew Cooper Aug. 5, 2020, 4:15 p.m. UTC | #12
On 03/08/2020 19:21, Oleksandr Tyshchenko wrote:
> diff --git a/xen/common/Makefile b/xen/common/Makefile
> index 06881d0..f6fc3f8 100644
> --- a/xen/common/Makefile
> +++ b/xen/common/Makefile
> @@ -70,6 +70,7 @@ extra-y := symbols-dummy.o
>  
>  obj-$(CONFIG_COVERAGE) += coverage/
>  obj-y += sched/
> +obj-$(CONFIG_IOREQ_SERVER) += hvm/
>  obj-$(CONFIG_UBSAN) += ubsan/
>  
>  obj-$(CONFIG_NEEDS_LIBELF) += libelf/
> diff --git a/xen/common/hvm/Makefile b/xen/common/hvm/Makefile
> new file mode 100644
> index 0000000..326215d
> --- /dev/null
> +++ b/xen/common/hvm/Makefile
> @@ -0,0 +1 @@
> +obj-y += ioreq.o
> diff --git a/xen/common/hvm/ioreq.c b/xen/common/hvm/ioreq.c
> new file mode 100644
> index 0000000..7e1fa23
> --- /dev/null
> +++ b/xen/common/hvm/ioreq.c
> <snip>

HVM is an internal detail of arch-specific code.  It should not escape
into common code.

From x86's point of view, there is nothing conceptually wrong with
having an IOREQ server for PV guests, although it is very unlikely at
this point that adding support would be a good use of time.

Please make this into a proper top-level common set of functionality.

~Andrew
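
Concretely, Andrew's request amounts to dropping the 'hvm' nesting, along the lines of (naming assumed for illustration):

    xen/common/ioreq.c          # instead of xen/common/hvm/ioreq.c
    xen/include/xen/ioreq.h     # instead of xen/include/xen/hvm/ioreq.h

with the hook-up in xen/common/Makefile reduced to:

    obj-$(CONFIG_IOREQ_SERVER) += ioreq.o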
Stefano Stabellini Aug. 6, 2020, 12:37 a.m. UTC | #13
On Wed, 5 Aug 2020, Jan Beulich wrote:
> On 04.08.2020 21:11, Stefano Stabellini wrote:
> >> The point of the check isn't to determine whether to wait, but
> >> what to do after having waited. Reads need a retry round through
> >> the emulator (to store the result in the designated place),
> >> while writes don't have such a requirement (and hence guest
> >> execution can continue immediately in the general case).
> > 
> > The x86 code looks like this:
> > 
> >             rc = hvm_send_ioreq(s, &p, 0);
> >             if ( rc != X86EMUL_RETRY || currd->is_shutting_down )
> >                 vio->io_req.state = STATE_IOREQ_NONE;
> >             else if ( !hvm_ioreq_needs_completion(&vio->io_req) )
> >                 rc = X86EMUL_OKAY;
> > 
> > Basically hvm_send_ioreq is expected to return RETRY.
> > Then, if it is a PIO write operation only, it is turned into OKAY right
> > away. Otherwise, rc stays as RETRY.
> > 
> > So, normally, hvmemul_do_io is expected to return RETRY, because the
> > emulator is not done yet. Am I understanding the code correctly?
> 
> "The emulator" unfortunately is ambiguous here: Do you mean qemu
> (or whichever else ioreq server) or the x86 emulator inside Xen?

I meant QEMU. I'll use "QEMU" instead of "emulator" in this thread going
forward for clarity.


> There are various conditions leading to RETRY. As far as
> hvm_send_ioreq() goes, it is expected to return RETRY whenever
> some sort of response is to be expected (the most notable
> exception being the hvm_send_buffered_ioreq() path), or when
> submitting the request isn't possible in the first place.
> 
> > If so, who is handling RETRY on x86? It tried to follow the call chain
> > but ended up in the x86 emulator and got lost :-)
> 
> Not sure I understand the question correctly, but I'll try an
> answer nevertheless: hvm_send_ioreq() arranges for the vCPU to be
> put to sleep (prepare_wait_on_xen_event_channel()). Once the event
> channel got signaled (and vCPU unblocked), hvm_do_resume() ->
> handle_hvm_io_completion() -> hvm_wait_for_io() then check whether
> the wait reason has been satisfied (wait_on_xen_event_channel()),
> and ...
> 
> > At some point later, after the emulator (QEMU) has completed the
> > request, handle_hvm_io_completion gets called which ends up calling
> > handle_mmio() finishing the job on the Xen side too.
> 
> ..., as you say, handle_hvm_io_completion() invokes the retry of
> the original operation (handle_mmio() or handle_pio() in
> particular) if need be.

OK, thanks for the details. My interpretation seems to be correct.

In which case, it looks like xen/arch/arm/io.c:try_fwd_ioserv should
return IO_RETRY. Then, xen/arch/arm/traps.c:do_trap_stage2_abort_guest
also needs to handle try_handle_mmio returning IO_RETRY the first time
around, and IO_HANDLED after QEMU does its job.

What should do_trap_stage2_abort_guest do on IO_RETRY? Simply return
early and let the scheduler do its job? Something like:

            enum io_state state = try_handle_mmio(regs, hsr, gpa);

            switch ( state )
            {
            case IO_ABORT:
                goto inject_abt;
            case IO_HANDLED:
                advance_pc(regs, hsr);
                return;
            case IO_RETRY:
                /* finish later */
                return;
            case IO_UNHANDLED:
                /* IO unhandled, try another way to handle it. */
                break;
            default:
                ASSERT_UNREACHABLE();
            }

Then, xen/arch/arm/ioreq.c:handle_mmio() gets called by
handle_hvm_io_completion() after QEMU completes the emulation. Today,
handle_mmio just sets the user register with the read value.

But it would be better if it called the original function
do_trap_stage2_abort_guest again to actually retry the original
operation. This time do_trap_stage2_abort_guest calls try_handle_mmio()
and gets IO_HANDLED instead of IO_RETRY; thus, it will advance_pc (the
program counter), completing the handling of this instruction.

The user register with the read value could be set by try_handle_mmio if
try_fwd_ioserv returns IO_HANDLED instead of IO_RETRY.

Is that how the state machine is expected to work?
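
For illustration, the completion step just described could look like the sketch below. The helper name is invented, stashing the faulting HSR in v->arch.hsr across the wait is an assumption, and advance_pc() is currently local to traps.c:

    static void handle_ioreq_completion(struct vcpu *v, const ioreq_t *p)
    {
        struct cpu_user_regs *regs = guest_cpu_user_regs();
        /* Assumes the HSR was saved when the abort originally trapped. */
        const union hsr hsr = { .bits = v->arch.hsr };

        if ( p->dir == IOREQ_READ )
            set_user_reg(regs, hsr.dabt.reg, p->data);

        advance_pc(regs, hsr);
    }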


> What's potentially confusing is that there's a second form of
> retry, invoked by the x86 insn emulator itself when it needs to
> split complex insns (the repeated string insns being the most
> important example). This results in actually exiting back to guest
> context without having advanced rIP, but after having updated
> other register state suitably (to express the progress made so
> far).

Ah! And it seems to be exactly the same label: X86EMUL_RETRY. It would
be a good idea to differentiate between them.
Stefano Stabellini Aug. 6, 2020, 12:37 a.m. UTC | #14
On Wed, 5 Aug 2020, Julien Grall wrote:
> On 04/08/2020 20:11, Stefano Stabellini wrote:
> > On Tue, 4 Aug 2020, Julien Grall wrote:
> > > [snip]
> > 
> > On the QEMU side, it looks like PIO (address_space_io) is used in
> > connection with the emulation of the "in" or "out" instructions, see
> > ioport.c:cpu_inb for instance. Some parts of PCI on QEMU emulate PIO
> > space regardless of the architecture, such as
> > hw/pci/pci_bridge.c:pci_bridge_initfn.
> > 
> > However, because there is no "in" and "out" on ARM, I don't think
> > address_space_io can be accessed. Specifically, there is no equivalent
> > for target/i386/misc_helper.c:helper_inb on ARM.
> 
> So how PCI I/O BAR are accessed? Surely, they could be used on Arm, right?

PIO is also memory mapped on ARM and it seems to have its own MMIO
address window.
Jan Beulich Aug. 6, 2020, 6:59 a.m. UTC | #15
On 06.08.2020 02:37, Stefano Stabellini wrote:
> What should do_trap_stage2_abort_guest do on IO_RETRY? Simply return
> early and let the scheduler do its job? Something like:
> 
>             enum io_state state = try_handle_mmio(regs, hsr, gpa);
> 
>             switch ( state )
>             {
>             case IO_ABORT:
>                 goto inject_abt;
>             case IO_HANDLED:
>                 advance_pc(regs, hsr);
>                 return;
>             case IO_RETRY:
>                 /* finish later */
>                 return;
>             case IO_UNHANDLED:
>                 /* IO unhandled, try another way to handle it. */
>                 break;
>             default:
>                 ASSERT_UNREACHABLE();
>             }
> 
> Then, xen/arch/arm/ioreq.c:handle_mmio() gets called by
> handle_hvm_io_completion() after QEMU completes the emulation. Today,
> handle_mmio just sets the user register with the read value.
> 
> But it would be better if it called again the original function
> do_trap_stage2_abort_guest to actually retry the original operation.
> This time do_trap_stage2_abort_guest calls try_handle_mmio() and gets
> IO_HANDLED instead of IO_RETRY, thus, it will advance_pc (the program
> counter) completing the handling of this instruction.
> 
> The user register with the read value could be set by try_handle_mmio if
> try_fwd_ioserv returns IO_HANDLED instead of IO_RETRY.
> 
> Is that how the state machine is expected to work?

I think so. One caveat, given that it has taken us quite some time
(years) on the x86 side to get reasonably close to how hardware would
behave (I think we're still not fully there): the re-execution path
needs to make sure it observes exactly the same machine state as the
original path did. In particular, changes to memory (by another vCPU)
must not be observed.

Jan
Oleksandr Tyshchenko Aug. 6, 2020, 8:20 a.m. UTC | #16
On 05.08.20 19:15, Andrew Cooper wrote:

Hi Andrew

> On 03/08/2020 19:21, Oleksandr Tyshchenko wrote:
>> [snip]
> HVM is an internal detail of arch specific code.  It should not escape
> into common code.
>
>  From x86's point of view, there is nothing conceptually wrong with
> having an IOREQ server for PV guests, although it is very unlikely at
> this point that adding support would be a good use of time.

Got it.


> Please make this into a proper top-level common set of functionality.

ok.
Julien Grall Aug. 6, 2020, 9:45 a.m. UTC | #17
Hi,

On 06/08/2020 01:37, Stefano Stabellini wrote:
> On Wed, 5 Aug 2020, Julien Grall wrote:
>> On 04/08/2020 20:11, Stefano Stabellini wrote:
>>> On Tue, 4 Aug 2020, Julien Grall wrote:
>>>> On 04/08/2020 12:10, Oleksandr wrote:
>>>>> On 04.08.20 10:45, Paul Durrant wrote:
>>>>>>> +static inline bool hvm_ioreq_needs_completion(const ioreq_t *ioreq)
>>>>>>> +{
>>>>>>> +    return ioreq->state == STATE_IOREQ_READY &&
>>>>>>> +           !ioreq->data_is_ptr &&
>>>>>>> +           (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir !=
>>>>>>> IOREQ_WRITE);
>>>>>>> +}
>>>>>> I don't think having this in common code is correct. The short-cut of
>>>>>> not
>>>>>> completing PIO reads seems somewhat x86 specific.
>>>>
>>>> Hmmm, looking at the code, I think it doesn't wait for PIO writes to
>>>> complete
>>>> (not read). Did I miss anything?
>>>>
>>>>> Does ARM even
>>>>>> have the concept of PIO?
>>>>>
>>>>> I am not 100% sure here, but it seems that doesn't have.
>>>>
>>>> Technically, the PIOs exist on Arm, however they are accessed the same way
>>>> as
>>>> MMIO and will have a dedicated area defined by the HW.
>>>>
>>>> AFAICT, on Arm64, they are only used for PCI IO Bar.
>>>>
>>>> Now the question is whether we want to expose them to the Device Emulator
>>>> as
>>>> PIO or MMIO access. From a generic PoV, a DM shouldn't have to care about
>>>> the
>>>> architecture used. It should just be able to request a given IOport
>>>> region.
>>>>
>>>> So it may make sense to differentiate them in the common ioreq code as
>>>> well.
>>>>
>>>> I had a quick look at QEMU and wasn't able to tell if PIOs and MMIOs
>>>> address
>>>> space are different on Arm as well. Paul, Stefano, do you know what they
>>>> are
>>>> doing?
>>>
>>> On the QEMU side, it looks like PIO (address_space_io) is used in
>>> connection with the emulation of the "in" or "out" instructions, see
>>> ioport.c:cpu_inb for instance. Some parts of PCI on QEMU emulate PIO
>>> space regardless of the architecture, such as
>>> hw/pci/pci_bridge.c:pci_bridge_initfn.
>>>
>>> However, because there is no "in" and "out" on ARM, I don't think
>>> address_space_io can be accessed. Specifically, there is no equivalent
>>> for target/i386/misc_helper.c:helper_inb on ARM.
>>
>> So how PCI I/O BAR are accessed? Surely, they could be used on Arm, right?
> 
> PIO is also memory mapped on ARM and it seems to have its own MMIO
> address window.
This part is already well-understood :). However, this only tells us how
an OS is accessing a PIO.

What I am trying to figure out is how the hardware (or QEMU) is meant to 
work.

From my understanding, the MMIO access will be received by the
hostbridge and then forwarded to the appropriate PCI device. The two
questions I am trying to answer are: how are the I/O BARs configured?
Will they contain an MMIO address or an offset?

If the answer is the latter, then we will need PIO because a DM will 
never see the MMIO address (the hostbridge will be emulated in Xen).

I am still navigating through the code and haven't managed to find an
answer so far.

Cheers,
Oleksandr Tyshchenko Aug. 6, 2020, 11:37 a.m. UTC | #18
On 05.08.20 16:30, Julien Grall wrote:
> Hi,

Hi Julien


>
> On 03/08/2020 19:21, Oleksandr Tyshchenko wrote:
>> From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>>
>> As a lot of x86 code can be re-used on Arm later on, this patch
>> splits IOREQ support into common and arch specific parts.
>>
>> This support is going to be used on Arm to be able run device
>> emulator outside of Xen hypervisor.
>>
>> Please note, this is a split/cleanup of Julien's PoC:
>> "Add support for Guest IO forwarding to a device emulator"
>>
>> Signed-off-by: Julien Grall <julien.grall@arm.com>
>> Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>> ---
>>   xen/arch/x86/Kconfig            |    1 +
>>   xen/arch/x86/hvm/dm.c           |    2 +-
>>   xen/arch/x86/hvm/emulate.c      |    2 +-
>>   xen/arch/x86/hvm/hvm.c          |    2 +-
>>   xen/arch/x86/hvm/io.c           |    2 +-
>>   xen/arch/x86/hvm/ioreq.c        | 1431 
>> +--------------------------------------
>>   xen/arch/x86/hvm/stdvga.c       |    2 +-
>>   xen/arch/x86/hvm/vmx/realmode.c |    1 +
>>   xen/arch/x86/hvm/vmx/vvmx.c     |    2 +-
>>   xen/arch/x86/mm.c               |    2 +-
>>   xen/arch/x86/mm/shadow/common.c |    2 +-
>>   xen/common/Kconfig              |    3 +
>>   xen/common/Makefile             |    1 +
>>   xen/common/hvm/Makefile         |    1 +
>>   xen/common/hvm/ioreq.c          | 1430 
>> ++++++++++++++++++++++++++++++++++++++
>>   xen/include/asm-x86/hvm/ioreq.h |   45 +-
>>   xen/include/asm-x86/hvm/vcpu.h  |    7 -
>>   xen/include/xen/hvm/ioreq.h     |   89 +++
>>   18 files changed, 1575 insertions(+), 1450 deletions(-)
>
> That's quite a lot of code moved in a single patch. How can we check 
> the code moved is still correct? Is it a verbatim copy?
In this patch I mostly tried to separate out the common IOREQ part, which
also resulted in updating the x86 sources to include the new header. Also,
I moved hvm_ioreq_needs_completion() to the common header (which should
probably have been a separate patch). It was a verbatim copy initially
(w/o hvm_map_mem_type_to_ioreq_server) and was then updated to deal with
the arch-specific parts.
In which way do you want me to split this patch?

I could think of the following:

1. Copy of x86's ioreq.c/ioreq.h to common code
2. Update common ioreq.c/ioreq.h
3. Update x86's parts to be able to deal with common code
4. Move hvm_ioreq_needs_completion() to common code
Stefano Stabellini Aug. 6, 2020, 8:32 p.m. UTC | #19
On Thu, 6 Aug 2020, Jan Beulich wrote:
> On 06.08.2020 02:37, Stefano Stabellini wrote:
> > What should do_trap_stage2_abort_guest do on IO_RETRY? Simply return
> > early and let the scheduler do its job? Something like:
> > 
> >             enum io_state state = try_handle_mmio(regs, hsr, gpa);
> > 
> >             switch ( state )
> >             {
> >             case IO_ABORT:
> >                 goto inject_abt;
> >             case IO_HANDLED:
> >                 advance_pc(regs, hsr);
> >                 return;
> >             case IO_RETRY:
> >                 /* finish later */
> >                 return;
> >             case IO_UNHANDLED:
> >                 /* IO unhandled, try another way to handle it. */
> >                 break;
> >             default:
> >                 ASSERT_UNREACHABLE();
> >             }
> > 
> > Then, xen/arch/arm/ioreq.c:handle_mmio() gets called by
> > handle_hvm_io_completion() after QEMU completes the emulation. Today,
> > handle_mmio just sets the user register with the read value.
> > 
> > But it would be better if it called again the original function
> > do_trap_stage2_abort_guest to actually retry the original operation.
> > This time do_trap_stage2_abort_guest calls try_handle_mmio() and gets
> > IO_HANDLED instead of IO_RETRY, thus, it will advance_pc (the program
> > counter) completing the handling of this instruction.
> > 
> > The user register with the read value could be set by try_handle_mmio if
> > try_fwd_ioserv returns IO_HANDLED instead of IO_RETRY.
> > 
> > Is that how the state machine is expected to work?
> 
> I think so. Just because it has taken us quite some time (years) on
> the x86 side to get reasonably close to how hardware would behave
> (I think we're still not fully there): The re-execution path needs
> to make sure it observes exactly the same machine state as the
> original path did. In particular changes to memory (by another vCPU)
> must not be observed.

Thanks for the heads up. I think I understand how it is supposed to work
now. I hope Oleksandr is on the same page.
Stefano Stabellini Aug. 6, 2020, 11:48 p.m. UTC | #20
On Thu, 6 Aug 2020, Julien Grall wrote:
> On 06/08/2020 01:37, Stefano Stabellini wrote:
> > On Wed, 5 Aug 2020, Julien Grall wrote:
> > > On 04/08/2020 20:11, Stefano Stabellini wrote:
> > > > On Tue, 4 Aug 2020, Julien Grall wrote:
> > > > > On 04/08/2020 12:10, Oleksandr wrote:
> > > > > > On 04.08.20 10:45, Paul Durrant wrote:
> > > > > > > > +static inline bool hvm_ioreq_needs_completion(const ioreq_t
> > > > > > > > *ioreq)
> > > > > > > > +{
> > > > > > > > +    return ioreq->state == STATE_IOREQ_READY &&
> > > > > > > > +           !ioreq->data_is_ptr &&
> > > > > > > > +           (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir !=
> > > > > > > > IOREQ_WRITE);
> > > > > > > > +}
> > > > > > > I don't think having this in common code is correct. The short-cut
> > > > > > > of
> > > > > > > not
> > > > > > > completing PIO reads seems somewhat x86 specific.
> > > > > 
> > > > > Hmmm, looking at the code, I think it doesn't wait for PIO writes to
> > > > > complete
> > > > > (not read). Did I miss anything?
> > > > > 
> > > > > > Does ARM even
> > > > > > > have the concept of PIO?
> > > > > > 
> > > > > > I am not 100% sure here, but it seems that doesn't have.
> > > > > 
> > > > > Technically, the PIOs exist on Arm, however they are accessed the same
> > > > > way
> > > > > as
> > > > > MMIO and will have a dedicated area defined by the HW.
> > > > > 
> > > > > AFAICT, on Arm64, they are only used for PCI IO Bar.
> > > > > 
> > > > > Now the question is whether we want to expose them to the Device
> > > > > Emulator
> > > > > as
> > > > > PIO or MMIO access. From a generic PoV, a DM shouldn't have to care
> > > > > about
> > > > > the
> > > > > architecture used. It should just be able to request a given IOport
> > > > > region.
> > > > > 
> > > > > So it may make sense to differentiate them in the common ioreq code as
> > > > > well.
> > > > > 
> > > > > I had a quick look at QEMU and wasn't able to tell if PIOs and MMIOs
> > > > > address
> > > > > space are different on Arm as well. Paul, Stefano, do you know what
> > > > > they
> > > > > are
> > > > > doing?
> > > > 
> > > > On the QEMU side, it looks like PIO (address_space_io) is used in
> > > > connection with the emulation of the "in" or "out" instructions, see
> > > > ioport.c:cpu_inb for instance. Some parts of PCI on QEMU emulate PIO
> > > > space regardless of the architecture, such as
> > > > hw/pci/pci_bridge.c:pci_bridge_initfn.
> > > > 
> > > > However, because there is no "in" and "out" on ARM, I don't think
> > > > address_space_io can be accessed. Specifically, there is no equivalent
> > > > for target/i386/misc_helper.c:helper_inb on ARM.
> > > 
> > > So how PCI I/O BAR are accessed? Surely, they could be used on Arm, right?
> > 
> > PIO is also memory mapped on ARM and it seems to have its own MMIO
> > address window.
> This part is already well-understood :). However, this only tell us how an OS
> is accessing a PIO.
> 
> What I am trying to figure out is how the hardware (or QEMU) is meant to work.
> 
> From my understanding, the MMIO access will be received by the hostbridge and
> then forwarded to the appropriate PCI device. The two questions I am trying to
> answer is: How the I/O BARs are configured? Will it contain an MMIO address or
> an offset?
> 
> If the answer is the latter, then we will need PIO because a DM will never see
> the MMIO address (the hostbridge will be emulated in Xen).

Now I understand the question :-)

This is the way I understand it works. Let's say that the PIO aperture
is 0x1000-0x2000 which is aliased to 0x3eff0000-0x3eff1000.
0x1000-0x2000 are addresses that cannot be accessed directly.
0x3eff0000-0x3eff1000 is the range that works.

A PCI device PIO BAR will have an address in the 0x1000-0x2000 range,
for instance 0x1100.

However, when the operating system accesses 0x1100, it will issue a read
to 0x3eff0100.

Xen will trap the read to 0x3eff0100 and send it to QEMU.

QEMU has to know that 0x3eff0000-0x3eff1000 is the alias to the PIO
aperture and that 0x3eff0100 corresponds to PCI device foobar. Similarly,
QEMU also has to know the address range of the MMIO aperture and its
remappings, if any (it is possible to have address remapping for MMIO
addresses too).

I think today this information is built into QEMU, not configurable. It
works fine because *I think* the PCI aperture is pretty much the same on
x86 boards, at least on the ones supported by QEMU for Xen.

On ARM, I think we should explicitly declare the PCI MMIO aperture and
its alias/address-remapping. When we do that, we can also declare the
PIO aperture and its alias/address-remapping.
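
To make the translation concrete, here is a minimal sketch (not Xen or
QEMU code) using the example numbers above; the macro and function names
are invented for illustration:

#include <stdbool.h>
#include <stdint.h>

#define PIO_APERTURE_BASE  0x1000UL      /* what a PIO BAR contains      */
#define PIO_APERTURE_SIZE  0x1000UL
#define PIO_ALIAS_BASE     0x3eff0000UL  /* window the OS actually uses  */

/*
 * Translate a trapped guest-physical address inside the alias window
 * back into the PIO-space address held by the PCI device's I/O BAR.
 * Returns false if the address falls outside the alias window.
 */
static bool alias_to_pio(uint64_t gpa, uint64_t *pio_addr)
{
    if ( gpa < PIO_ALIAS_BASE || gpa >= PIO_ALIAS_BASE + PIO_APERTURE_SIZE )
        return false;

    *pio_addr = PIO_APERTURE_BASE + (gpa - PIO_ALIAS_BASE);
    return true;
}

With the numbers above, the trapped read of 0x3eff0100 translates back
to 0x1100, the address held by the example device's PIO BAR.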
Oleksandr Tyshchenko Aug. 7, 2020, 1:19 p.m. UTC | #21
On 06.08.20 23:32, Stefano Stabellini wrote:

Hi Stefano

> On Thu, 6 Aug 2020, Jan Beulich wrote:
>> On 06.08.2020 02:37, Stefano Stabellini wrote:
>>> What should do_trap_stage2_abort_guest do on IO_RETRY? Simply return
>>> early and let the scheduler do its job? Something like:
>>>
>>>              enum io_state state = try_handle_mmio(regs, hsr, gpa);
>>>
>>>              switch ( state )
>>>              {
>>>              case IO_ABORT:
>>>                  goto inject_abt;
>>>              case IO_HANDLED:
>>>                  advance_pc(regs, hsr);
>>>                  return;
>>>              case IO_RETRY:
>>>                  /* finish later */
>>>                  return;
>>>              case IO_UNHANDLED:
>>>                  /* IO unhandled, try another way to handle it. */
>>>                  break;
>>>              default:
>>>                  ASSERT_UNREACHABLE();
>>>              }
>>>
>>> Then, xen/arch/arm/ioreq.c:handle_mmio() gets called by
>>> handle_hvm_io_completion() after QEMU completes the emulation. Today,
>>> handle_mmio just sets the user register with the read value.
>>>
>>> But it would be better if it called again the original function
>>> do_trap_stage2_abort_guest to actually retry the original operation.
>>> This time do_trap_stage2_abort_guest calls try_handle_mmio() and gets
>>> IO_HANDLED instead of IO_RETRY, thus, it will advance_pc (the program
>>> counter) completing the handling of this instruction.
>>>
>>> The user register with the read value could be set by try_handle_mmio if
>>> try_fwd_ioserv returns IO_HANDLED instead of IO_RETRY.
>>>
>>> Is that how the state machine is expected to work?
>> I think so. Just because it has taken us quite some time (years) on
>> the x86 side to get reasonably close to how hardware would behave
>> (I think we're still not fully there): The re-execution path needs
>> to make sure it observes exactly the same machine state as the
>> original path did. In particular changes to memory (by another vCPU)
>> must not be observed.
> Thanks for the heads up. I think I understand how it is supposed to work
> now. I hope Oleksandr is on the same page.

Not completely. I am still going through the discussion and navigating
the code, trying to understand the bits that are still missing for me.

Thanks for trying to clarify how it is supposed to work.
Oleksandr Tyshchenko Aug. 7, 2020, 4:45 p.m. UTC | #22
On 06.08.20 03:37, Stefano Stabellini wrote:

Hi Stefano

Trying to simulate the IO_RETRY handling mechanism (according to the
model below), I continuously get IO_RETRY from try_fwd_ioserv() ...

> OK, thanks for the details. My interpretation seems to be correct.
>
> In which case, it looks like xen/arch/arm/io.c:try_fwd_ioserv should
> return IO_RETRY. Then, xen/arch/arm/traps.c:do_trap_stage2_abort_guest
> also needs to handle try_handle_mmio returning IO_RETRY the first
> around, and IO_HANDLED when after QEMU does its job.
>
> What should do_trap_stage2_abort_guest do on IO_RETRY? Simply return
> early and let the scheduler do its job? Something like:
>
>              enum io_state state = try_handle_mmio(regs, hsr, gpa);
>
>              switch ( state )
>              {
>              case IO_ABORT:
>                  goto inject_abt;
>              case IO_HANDLED:
>                  advance_pc(regs, hsr);
>                  return;
>              case IO_RETRY:
>                  /* finish later */
>                  return;
>              case IO_UNHANDLED:
>                  /* IO unhandled, try another way to handle it. */
>                  break;
>              default:
>                  ASSERT_UNREACHABLE();
>              }
>
> Then, xen/arch/arm/ioreq.c:handle_mmio() gets called by
> handle_hvm_io_completion() after QEMU completes the emulation. Today,
> handle_mmio just sets the user register with the read value.
>
> But it would be better if it called again the original function
> do_trap_stage2_abort_guest to actually retry the original operation.
> This time do_trap_stage2_abort_guest calls try_handle_mmio() and gets
> IO_HANDLED instead of IO_RETRY,
I may be missing some important point, but I fail to see why
try_handle_mmio (try_fwd_ioserv) would return IO_HANDLED instead of
IO_RETRY at this stage. Or does the current try_fwd_ioserv() logic need
rework?


> thus, it will advance_pc (the program
> counter) completing the handling of this instruction.
>
> The user register with the read value could be set by try_handle_mmio if
> try_fwd_ioserv returns IO_HANDLED instead of IO_RETRY.
>
> Is that how the state machine is expected to work?
Stefano Stabellini Aug. 7, 2020, 9:50 p.m. UTC | #23
On Fri, 7 Aug 2020, Oleksandr wrote:
> On 06.08.20 03:37, Stefano Stabellini wrote:
> 
> Hi Stefano
> 
> Trying to simulate IO_RETRY handling mechanism (according to model below) I
> continuously get IO_RETRY from try_fwd_ioserv() ...
> 
> > OK, thanks for the details. My interpretation seems to be correct.
> > 
> > In which case, it looks like xen/arch/arm/io.c:try_fwd_ioserv should
> > return IO_RETRY. Then, xen/arch/arm/traps.c:do_trap_stage2_abort_guest
> > also needs to handle try_handle_mmio returning IO_RETRY the first
> > around, and IO_HANDLED when after QEMU does its job.
> > 
> > What should do_trap_stage2_abort_guest do on IO_RETRY? Simply return
> > early and let the scheduler do its job? Something like:
> > 
> >              enum io_state state = try_handle_mmio(regs, hsr, gpa);
> > 
> >              switch ( state )
> >              {
> >              case IO_ABORT:
> >                  goto inject_abt;
> >              case IO_HANDLED:
> >                  advance_pc(regs, hsr);
> >                  return;
> >              case IO_RETRY:
> >                  /* finish later */
> >                  return;
> >              case IO_UNHANDLED:
> >                  /* IO unhandled, try another way to handle it. */
> >                  break;
> >              default:
> >                  ASSERT_UNREACHABLE();
> >              }
> > 
> > Then, xen/arch/arm/ioreq.c:handle_mmio() gets called by
> > handle_hvm_io_completion() after QEMU completes the emulation. Today,
> > handle_mmio just sets the user register with the read value.
> > 
> > But it would be better if it called again the original function
> > do_trap_stage2_abort_guest to actually retry the original operation.
> > This time do_trap_stage2_abort_guest calls try_handle_mmio() and gets
> > IO_HANDLED instead of IO_RETRY,
> I may miss some important point, but I failed to see why try_handle_mmio
> (try_fwd_ioserv) will return IO_HANDLED instead of IO_RETRY at this stage.
> Or current try_fwd_ioserv() logic needs rework?

I think you should check ioreq->state in try_fwd_ioserv(): if the
result is ready, then ioreq->state should be STATE_IORESP_READY, and you
can return IO_HANDLED.

That is assuming that you are looking at the live version of the ioreq
shared with QEMU instead of a private copy of it, which I am not sure
about. Looking at try_fwd_ioserv(), it would seem that vio->io_req is
just a copy? The live version is the one returned by get_ioreq()?

Even in handle_hvm_io_completion, instead of setting vio->io_req.state
to STATE_IORESP_READY by hand, it would be better to look at the live
version of the ioreq because QEMU will have already set ioreq->state
to STATE_IORESP_READY (hw/i386/xen/xen-hvm.c:cpu_handle_ioreq).

 
> > thus, it will advance_pc (the program
> > counter) completing the handling of this instruction.
> > 
> > The user register with the read value could be set by try_handle_mmio if
> > try_fwd_ioserv returns IO_HANDLED instead of IO_RETRY.
> > 
> > Is that how the state machine is expected to work?
Oleksandr Tyshchenko Aug. 7, 2020, 10:19 p.m. UTC | #24
On 08.08.20 00:50, Stefano Stabellini wrote:

Hi Stefano

> On Fri, 7 Aug 2020, Oleksandr wrote:
>> On 06.08.20 03:37, Stefano Stabellini wrote:
>>
>> Hi Stefano
>>
>> Trying to simulate IO_RETRY handling mechanism (according to model below) I
>> continuously get IO_RETRY from try_fwd_ioserv() ...
>>
>>> OK, thanks for the details. My interpretation seems to be correct.
>>>
>>> In which case, it looks like xen/arch/arm/io.c:try_fwd_ioserv should
>>> return IO_RETRY. Then, xen/arch/arm/traps.c:do_trap_stage2_abort_guest
>>> also needs to handle try_handle_mmio returning IO_RETRY the first
>>> around, and IO_HANDLED when after QEMU does its job.
>>>
>>> What should do_trap_stage2_abort_guest do on IO_RETRY? Simply return
>>> early and let the scheduler do its job? Something like:
>>>
>>>               enum io_state state = try_handle_mmio(regs, hsr, gpa);
>>>
>>>               switch ( state )
>>>               {
>>>               case IO_ABORT:
>>>                   goto inject_abt;
>>>               case IO_HANDLED:
>>>                   advance_pc(regs, hsr);
>>>                   return;
>>>               case IO_RETRY:
>>>                   /* finish later */
>>>                   return;
>>>               case IO_UNHANDLED:
>>>                   /* IO unhandled, try another way to handle it. */
>>>                   break;
>>>               default:
>>>                   ASSERT_UNREACHABLE();
>>>               }
>>>
>>> Then, xen/arch/arm/ioreq.c:handle_mmio() gets called by
>>> handle_hvm_io_completion() after QEMU completes the emulation. Today,
>>> handle_mmio just sets the user register with the read value.
>>>
>>> But it would be better if it called again the original function
>>> do_trap_stage2_abort_guest to actually retry the original operation.
>>> This time do_trap_stage2_abort_guest calls try_handle_mmio() and gets
>>> IO_HANDLED instead of IO_RETRY,
>> I may miss some important point, but I failed to see why try_handle_mmio
>> (try_fwd_ioserv) will return IO_HANDLED instead of IO_RETRY at this stage.
>> Or current try_fwd_ioserv() logic needs rework?
> I think you should check the ioreq->state in try_fwd_ioserv(), if the
> result is ready, then ioreq->state should be STATE_IORESP_READY, and you
> can return IO_HANDLED.

Indeed! I had just come to the same conclusion when I saw your answer.

This is a dirty test patch:


---
  xen/arch/arm/io.c               | 12 ++++++++++++
  xen/arch/arm/ioreq.c            | 12 ++++++++++++
  xen/arch/arm/traps.c            |  6 ++++--
  xen/include/asm-arm/hvm/ioreq.h |  2 ++
  xen/include/asm-arm/traps.h     |  3 +++
  5 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/xen/arch/arm/io.c b/xen/arch/arm/io.c
index 436f669..65a08f8 100644
--- a/xen/arch/arm/io.c
+++ b/xen/arch/arm/io.c
@@ -130,6 +130,10 @@ static enum io_state try_fwd_ioserv(struct cpu_user_regs *regs,
      {
      case STATE_IOREQ_NONE:
          break;
+
+    case STATE_IORESP_READY:
+        return IO_HANDLED;
+
      default:
          printk("d%u wrong state %u\n", v->domain->domain_id,
                 vio->io_req.state);
@@ -156,9 +160,11 @@ static enum io_state try_fwd_ioserv(struct cpu_user_regs *regs,
      else
          vio->io_completion = HVMIO_mmio_completion;

+#if 0
      /* XXX: Decide what to do */
      if ( rc == IO_RETRY )
          rc = IO_HANDLED;
+#endif

      return rc;
  }
@@ -185,6 +191,12 @@ enum io_state try_handle_mmio(struct cpu_user_regs *regs,

  #ifdef CONFIG_IOREQ_SERVER
          rc = try_fwd_ioserv(regs, v, &info);
+        if ( rc == IO_HANDLED )
+        {
+            printk("HANDLED %s[%d]\n", __func__, __LINE__);
+            handle_mmio_finish();
+        } else
+            printk("RETRY %s[%d]\n", __func__, __LINE__);
  #endif

          return rc;
diff --git a/xen/arch/arm/ioreq.c b/xen/arch/arm/ioreq.c
index 8f60c41..c8ed454 100644
--- a/xen/arch/arm/ioreq.c
+++ b/xen/arch/arm/ioreq.c
@@ -33,8 +33,20 @@
  #include <public/hvm/dm_op.h>
  #include <public/hvm/ioreq.h>

+#include <asm/traps.h>
+
  bool handle_mmio(void)
  {
+    struct cpu_user_regs *regs = guest_cpu_user_regs();
+    const union hsr hsr = { .bits = regs->hsr };
+
+    do_trap_stage2_abort_guest(regs, hsr);
+
+    return true;
+}
+
+bool handle_mmio_finish(void)
+{
      struct vcpu *v = current;
      struct cpu_user_regs *regs = guest_cpu_user_regs();
      const union hsr hsr = { .bits = regs->hsr };
diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c
index ea472d1..3493d77 100644
--- a/xen/arch/arm/traps.c
+++ b/xen/arch/arm/traps.c
@@ -1882,7 +1882,7 @@ static bool try_map_mmio(gfn_t gfn)
      return !map_regions_p2mt(d, gfn, 1, mfn, p2m_mmio_direct_c);
  }

-static void do_trap_stage2_abort_guest(struct cpu_user_regs *regs,
+void do_trap_stage2_abort_guest(struct cpu_user_regs *regs,
                                         const union hsr hsr)
  {
      /*
@@ -1965,11 +1965,13 @@ static void do_trap_stage2_abort_guest(struct cpu_user_regs *regs,
              case IO_HANDLED:
                  advance_pc(regs, hsr);
                  return;
+            case IO_RETRY:
+                /* finish later */
+                return;
              case IO_UNHANDLED:
                  /* IO unhandled, try another way to handle it. */
                  break;
              default:
-                /* XXX: Handle IO_RETRY */
                  ASSERT_UNREACHABLE();
              }
          }
diff --git a/xen/include/asm-arm/hvm/ioreq.h b/xen/include/asm-arm/hvm/ioreq.h
index 392ce64..fb4684d 100644
--- a/xen/include/asm-arm/hvm/ioreq.h
+++ b/xen/include/asm-arm/hvm/ioreq.h
@@ -27,6 +27,8 @@

  bool handle_mmio(void);

+bool handle_mmio_finish(void);
+
  static inline bool handle_pio(uint16_t port, unsigned int size, int dir)
  {
      /* XXX */
diff --git a/xen/include/asm-arm/traps.h b/xen/include/asm-arm/traps.h
index 997c378..392fdb1 100644
--- a/xen/include/asm-arm/traps.h
+++ b/xen/include/asm-arm/traps.h
@@ -40,6 +40,9 @@ void advance_pc(struct cpu_user_regs *regs, const union hsr hsr);

  void inject_undef_exception(struct cpu_user_regs *regs, const union hsr hsr);

+void do_trap_stage2_abort_guest(struct cpu_user_regs *regs,
+                                        const union hsr hsr);
+
  /* read as zero and write ignore */
  void handle_raz_wi(struct cpu_user_regs *regs, int regidx, bool read,
                     const union hsr hsr, int min_el);
Oleksandr Tyshchenko Aug. 7, 2020, 11:45 p.m. UTC | #25
On 08.08.20 00:50, Stefano Stabellini wrote:

Hi

> On Fri, 7 Aug 2020, Oleksandr wrote:
>> On 06.08.20 03:37, Stefano Stabellini wrote:
>>
>> Hi Stefano
>>
>> Trying to simulate IO_RETRY handling mechanism (according to model below) I
>> continuously get IO_RETRY from try_fwd_ioserv() ...
>>
>>> OK, thanks for the details. My interpretation seems to be correct.
>>>
>>> In which case, it looks like xen/arch/arm/io.c:try_fwd_ioserv should
>>> return IO_RETRY. Then, xen/arch/arm/traps.c:do_trap_stage2_abort_guest
>>> also needs to handle try_handle_mmio returning IO_RETRY the first
>>> around, and IO_HANDLED when after QEMU does its job.
>>>
>>> What should do_trap_stage2_abort_guest do on IO_RETRY? Simply return
>>> early and let the scheduler do its job? Something like:
>>>
>>>               enum io_state state = try_handle_mmio(regs, hsr, gpa);
>>>
>>>               switch ( state )
>>>               {
>>>               case IO_ABORT:
>>>                   goto inject_abt;
>>>               case IO_HANDLED:
>>>                   advance_pc(regs, hsr);
>>>                   return;
>>>               case IO_RETRY:
>>>                   /* finish later */
>>>                   return;
>>>               case IO_UNHANDLED:
>>>                   /* IO unhandled, try another way to handle it. */
>>>                   break;
>>>               default:
>>>                   ASSERT_UNREACHABLE();
>>>               }
>>>
>>> Then, xen/arch/arm/ioreq.c:handle_mmio() gets called by
>>> handle_hvm_io_completion() after QEMU completes the emulation. Today,
>>> handle_mmio just sets the user register with the read value.
>>>
>>> But it would be better if it called again the original function
>>> do_trap_stage2_abort_guest to actually retry the original operation.
>>> This time do_trap_stage2_abort_guest calls try_handle_mmio() and gets
>>> IO_HANDLED instead of IO_RETRY,
>> I may miss some important point, but I failed to see why try_handle_mmio
>> (try_fwd_ioserv) will return IO_HANDLED instead of IO_RETRY at this stage.
>> Or current try_fwd_ioserv() logic needs rework?
> I think you should check the ioreq->state in try_fwd_ioserv(), if the
> result is ready, then ioreq->state should be STATE_IORESP_READY, and you
> can return IO_HANDLED.
>
> That is assuming that you are looking at the live version of the ioreq
> shared with QEMU instead of a private copy of it, which I am not sure.
> Looking at try_fwd_ioserv() it would seem that vio->io_req is just a
> copy? The live version is returned by get_ioreq() ?

If I understand the code correctly, indeed, get_ioreq() returns the live
version shared with the emulator.
The desired state change (STATE_IORESP_READY), which is what
hvm_wait_for_io() is actually waiting for, is set here (in my case):
https://xenbits.xen.org/gitweb/?p=people/pauldu/demu.git;a=blob;f=demu.c;h=f785b394d0cf141dffa05bdddecf338214358aea;hb=refs/heads/master#l698 


> Even in handle_hvm_io_completion, instead of setting vio->io_req.state
> to STATE_IORESP_READY by hand, it would be better to look at the live
> version of the ioreq because QEMU will have already set ioreq->state
> to STATE_IORESP_READY (hw/i386/xen/xen-hvm.c:cpu_handle_ioreq).
It seems that after detecting STATE_IORESP_READY in hvm_wait_for_io()
the state of the live version is set to STATE_IOREQ_NONE immediately, so
looking at the live version further down in handle_hvm_io_completion()
or in try_fwd_ioserv() shows us nothing, I am afraid.
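
For reference, the state values below come from
xen/include/public/hvm/ioreq.h, annotated with who performs each
transition as described in this thread (a sketch of the protocol, not a
normative spec):

#define STATE_IOREQ_NONE        0  /* slot is free                          */
#define STATE_IOREQ_READY       1  /* Xen has posted a request              */
#define STATE_IOREQ_INPROCESS   2  /* the emulator has picked it up         */
#define STATE_IORESP_READY      3  /* the emulator has written the response */

/*
 * Lifecycle of the live (shared) ioreq:
 *
 *   NONE -> IOREQ_READY       : Xen posts the request and kicks the DM
 *   IOREQ_READY -> INPROCESS  : the DM starts emulating
 *   INPROCESS -> IORESP_READY : the DM stores the result (see demu.c above)
 *   IORESP_READY -> NONE      : hvm_wait_for_io() consumes the response
 *
 * Because the last transition happens immediately, only the private copy
 * (vio->io_req.state) still shows STATE_IORESP_READY by the time
 * handle_hvm_io_completion()/try_fwd_ioserv() get to look at it.
 */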
Oleksandr Tyshchenko Aug. 10, 2020, 1:41 p.m. UTC | #26
Hi


On 08.08.20 01:19, Oleksandr wrote:
>
> On 08.08.20 00:50, Stefano Stabellini wrote:
>
> Hi Stefano
>
>> On Fri, 7 Aug 2020, Oleksandr wrote:
>>> On 06.08.20 03:37, Stefano Stabellini wrote:
>>>
>>> Hi Stefano
>>>
>>> Trying to simulate IO_RETRY handling mechanism (according to model 
>>> below) I
>>> continuously get IO_RETRY from try_fwd_ioserv() ...
>>>
>>>> OK, thanks for the details. My interpretation seems to be correct.
>>>>
>>>> In which case, it looks like xen/arch/arm/io.c:try_fwd_ioserv should
>>>> return IO_RETRY. Then, xen/arch/arm/traps.c:do_trap_stage2_abort_guest
>>>> also needs to handle try_handle_mmio returning IO_RETRY the first
>>>> around, and IO_HANDLED when after QEMU does its job.
>>>>
>>>> What should do_trap_stage2_abort_guest do on IO_RETRY? Simply return
>>>> early and let the scheduler do its job? Something like:
>>>>
>>>>               enum io_state state = try_handle_mmio(regs, hsr, gpa);
>>>>
>>>>               switch ( state )
>>>>               {
>>>>               case IO_ABORT:
>>>>                   goto inject_abt;
>>>>               case IO_HANDLED:
>>>>                   advance_pc(regs, hsr);
>>>>                   return;
>>>>               case IO_RETRY:
>>>>                   /* finish later */
>>>>                   return;
>>>>               case IO_UNHANDLED:
>>>>                   /* IO unhandled, try another way to handle it. */
>>>>                   break;
>>>>               default:
>>>>                   ASSERT_UNREACHABLE();
>>>>               }
>>>>
>>>> Then, xen/arch/arm/ioreq.c:handle_mmio() gets called by
>>>> handle_hvm_io_completion() after QEMU completes the emulation. Today,
>>>> handle_mmio just sets the user register with the read value.
>>>>
>>>> But it would be better if it called again the original function
>>>> do_trap_stage2_abort_guest to actually retry the original operation.
>>>> This time do_trap_stage2_abort_guest calls try_handle_mmio() and gets
>>>> IO_HANDLED instead of IO_RETRY,
>>> I may miss some important point, but I failed to see why 
>>> try_handle_mmio
>>> (try_fwd_ioserv) will return IO_HANDLED instead of IO_RETRY at this 
>>> stage.
>>> Or current try_fwd_ioserv() logic needs rework?
>> I think you should check the ioreq->state in try_fwd_ioserv(), if the
>> result is ready, then ioreq->state should be STATE_IORESP_READY, and you
>> can return IO_HANDLED.
>

I optimized the test patch a bit (now it looks much simpler). I didn't
face any issues during a quick test.
Julien, Stefano, do you think it does the proper things to address the
TODO? Or did I miss something?


---
  xen/arch/arm/io.c    | 4 ----
  xen/arch/arm/ioreq.c | 7 ++++++-
  xen/arch/arm/traps.c | 4 +++-
  3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/xen/arch/arm/io.c b/xen/arch/arm/io.c
index 436f669..3063577 100644
--- a/xen/arch/arm/io.c
+++ b/xen/arch/arm/io.c
@@ -156,10 +156,6 @@ static enum io_state try_fwd_ioserv(struct cpu_user_regs *regs,
      else
          vio->io_completion = HVMIO_mmio_completion;

-    /* XXX: Decide what to do */
-    if ( rc == IO_RETRY )
-        rc = IO_HANDLED;
-
      return rc;
  }
  #endif
diff --git a/xen/arch/arm/ioreq.c b/xen/arch/arm/ioreq.c
index 8f60c41..e5235c6 100644
--- a/xen/arch/arm/ioreq.c
+++ b/xen/arch/arm/ioreq.c
@@ -33,6 +33,8 @@
  #include <public/hvm/dm_op.h>
  #include <public/hvm/ioreq.h>

+#include <asm/traps.h>
+
  bool handle_mmio(void)
  {
      struct vcpu *v = current;
@@ -52,7 +54,7 @@ bool handle_mmio(void)

      /* XXX: Do we need to take care of write here ? */
      if ( dabt.write )
-        return true;
+        goto done;

      /*
       * Sign extend if required.
@@ -72,6 +74,9 @@ bool handle_mmio(void)

      set_user_reg(regs, dabt.reg, r);

+done:
+    advance_pc(regs, hsr);
+
      return true;
  }

diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c
index ea472d1..974c744 100644
--- a/xen/arch/arm/traps.c
+++ b/xen/arch/arm/traps.c
@@ -1965,11 +1965,13 @@ static void do_trap_stage2_abort_guest(struct cpu_user_regs *regs,
              case IO_HANDLED:
                  advance_pc(regs, hsr);
                  return;
+            case IO_RETRY:
+                /* finish later */
+                return;
              case IO_UNHANDLED:
                  /* IO unhandled, try another way to handle it. */
                  break;
              default:
-                /* XXX: Handle IO_RETRY */
                  ASSERT_UNREACHABLE();
              }
          }
Julien Grall Aug. 10, 2020, 4:29 p.m. UTC | #27
Hi,

On 06/08/2020 12:37, Oleksandr wrote:
> 
> On 05.08.20 16:30, Julien Grall wrote:
>> Hi,
> 
> Hi Julien
> 
> 
>>
>> On 03/08/2020 19:21, Oleksandr Tyshchenko wrote:
>>> From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>>>
>>> As a lot of x86 code can be re-used on Arm later on, this patch
>>> splits IOREQ support into common and arch specific parts.
>>>
>>> This support is going to be used on Arm to be able run device
>>> emulator outside of Xen hypervisor.
>>>
>>> Please note, this is a split/cleanup of Julien's PoC:
>>> "Add support for Guest IO forwarding to a device emulator"
>>>
>>> Signed-off-by: Julien Grall <julien.grall@arm.com>
>>> Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>>> ---
>>>   xen/arch/x86/Kconfig            |    1 +
>>>   xen/arch/x86/hvm/dm.c           |    2 +-
>>>   xen/arch/x86/hvm/emulate.c      |    2 +-
>>>   xen/arch/x86/hvm/hvm.c          |    2 +-
>>>   xen/arch/x86/hvm/io.c           |    2 +-
>>>   xen/arch/x86/hvm/ioreq.c        | 1431 
>>> +--------------------------------------
>>>   xen/arch/x86/hvm/stdvga.c       |    2 +-
>>>   xen/arch/x86/hvm/vmx/realmode.c |    1 +
>>>   xen/arch/x86/hvm/vmx/vvmx.c     |    2 +-
>>>   xen/arch/x86/mm.c               |    2 +-
>>>   xen/arch/x86/mm/shadow/common.c |    2 +-
>>>   xen/common/Kconfig              |    3 +
>>>   xen/common/Makefile             |    1 +
>>>   xen/common/hvm/Makefile         |    1 +
>>>   xen/common/hvm/ioreq.c          | 1430 
>>> ++++++++++++++++++++++++++++++++++++++
>>>   xen/include/asm-x86/hvm/ioreq.h |   45 +-
>>>   xen/include/asm-x86/hvm/vcpu.h  |    7 -
>>>   xen/include/xen/hvm/ioreq.h     |   89 +++
>>>   18 files changed, 1575 insertions(+), 1450 deletions(-)
>>
>> That's quite a lot of code moved in a single patch. How can we check 
>> the code moved is still correct? Is it a verbatim copy?
> In this patch I mostly tried to separate a common IOREQ part which also 
> resulted in updating x86 sources to include new header.  Also I moved 
> hvm_ioreq_needs_completion() to common header (which probably wanted to 
> be in a separate patch). It was a verbatim copy initially (w/o 
> hvm_map_mem_type_to_ioreq_server) and then updated to deal with arch 
> specific parts.

I would prefer if the two parts were done separately. IOW, the code
movement should be nearly a verbatim copy.

> In which way do you want me to split this patch?
> 
> I could think of the following:
> 
> 1. Copy of x86's ioreq.c/ioreq.h to common code
> 2. Update common ioreq.c/ioreq.h
> 3. Update x86's parts to be able to deal with common code
> 4. Move hvm_ioreq_needs_completion() to common code
> 

Ideally the code movement should be done in the same patch. This helps
to check that the patch is only moving code and also avoids mistakes on
rebase.

So my preference would be to first modify the x86 code (e.g. renaming) 
to make it common and then have one patch that will move the code.

Cheers,
Oleksandr Tyshchenko Aug. 10, 2020, 5:28 p.m. UTC | #28
On 10.08.20 19:29, Julien Grall wrote:
> Hi,

Hi Julien


>
> On 06/08/2020 12:37, Oleksandr wrote:
>>
>> On 05.08.20 16:30, Julien Grall wrote:
>>> Hi,
>>
>> Hi Julien
>>
>>
>>>
>>> On 03/08/2020 19:21, Oleksandr Tyshchenko wrote:
>>>> From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>>>>
>>>> As a lot of x86 code can be re-used on Arm later on, this patch
>>>> splits IOREQ support into common and arch specific parts.
>>>>
>>>> This support is going to be used on Arm to be able run device
>>>> emulator outside of Xen hypervisor.
>>>>
>>>> Please note, this is a split/cleanup of Julien's PoC:
>>>> "Add support for Guest IO forwarding to a device emulator"
>>>>
>>>> Signed-off-by: Julien Grall <julien.grall@arm.com>
>>>> Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>>>> ---
>>>>   xen/arch/x86/Kconfig            |    1 +
>>>>   xen/arch/x86/hvm/dm.c           |    2 +-
>>>>   xen/arch/x86/hvm/emulate.c      |    2 +-
>>>>   xen/arch/x86/hvm/hvm.c          |    2 +-
>>>>   xen/arch/x86/hvm/io.c           |    2 +-
>>>>   xen/arch/x86/hvm/ioreq.c        | 1431 
>>>> +--------------------------------------
>>>>   xen/arch/x86/hvm/stdvga.c       |    2 +-
>>>>   xen/arch/x86/hvm/vmx/realmode.c |    1 +
>>>>   xen/arch/x86/hvm/vmx/vvmx.c     |    2 +-
>>>>   xen/arch/x86/mm.c               |    2 +-
>>>>   xen/arch/x86/mm/shadow/common.c |    2 +-
>>>>   xen/common/Kconfig              |    3 +
>>>>   xen/common/Makefile             |    1 +
>>>>   xen/common/hvm/Makefile         |    1 +
>>>>   xen/common/hvm/ioreq.c          | 1430 
>>>> ++++++++++++++++++++++++++++++++++++++
>>>>   xen/include/asm-x86/hvm/ioreq.h |   45 +-
>>>>   xen/include/asm-x86/hvm/vcpu.h  |    7 -
>>>>   xen/include/xen/hvm/ioreq.h     |   89 +++
>>>>   18 files changed, 1575 insertions(+), 1450 deletions(-)
>>>
>>> That's quite a lot of code moved in a single patch. How can we check 
>>> the code moved is still correct? Is it a verbatim copy?
>> In this patch I mostly tried to separate a common IOREQ part which 
>> also resulted in updating x86 sources to include new header.  Also I 
>> moved hvm_ioreq_needs_completion() to common header (which probably 
>> wanted to be in a separate patch). It was a verbatim copy initially 
>> (w/o hvm_map_mem_type_to_ioreq_server) and then updated to deal with 
>> arch specific parts.
>
> I would prefer if the two parts are done separately. IOW, the code 
> movement be nearly a verbatim copy.
>
>> In which way do you want me to split this patch?
>>
>> I could think of the following:
>>
>> 1. Copy of x86's ioreq.c/ioreq.h to common code
>> 2. Update common ioreq.c/ioreq.h
>> 3. Update x86's parts to be able to deal with common code
>> 4. Move hvm_ioreq_needs_completion() to common code
>>
>
> Ideally the code movement should be done in the same patch. This helps 
> to check the patch is only moving code and also avoids mistakes on 
> rebase.
>
> So my preference would be to first modify the x86 code (e.g. renaming) 
> to make it common and then have one patch that will move the code.

ok, will try to split accordingly. Thank you
Julien Grall Aug. 10, 2020, 7:20 p.m. UTC | #29
On 07/08/2020 00:48, Stefano Stabellini wrote:
> On Thu, 6 Aug 2020, Julien Grall wrote:
>> On 06/08/2020 01:37, Stefano Stabellini wrote:
>>> On Wed, 5 Aug 2020, Julien Grall wrote:
>>>> On 04/08/2020 20:11, Stefano Stabellini wrote:
>>>>> On Tue, 4 Aug 2020, Julien Grall wrote:
>>>>>> On 04/08/2020 12:10, Oleksandr wrote:
>>>>>>> On 04.08.20 10:45, Paul Durrant wrote:
>>>>>>>>> +static inline bool hvm_ioreq_needs_completion(const ioreq_t
>>>>>>>>> *ioreq)
>>>>>>>>> +{
>>>>>>>>> +    return ioreq->state == STATE_IOREQ_READY &&
>>>>>>>>> +           !ioreq->data_is_ptr &&
>>>>>>>>> +           (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir !=
>>>>>>>>> IOREQ_WRITE);
>>>>>>>>> +}
>>>>>>>> I don't think having this in common code is correct. The short-cut
>>>>>>>> of
>>>>>>>> not
>>>>>>>> completing PIO reads seems somewhat x86 specific.
>>>>>>
>>>>>> Hmmm, looking at the code, I think it doesn't wait for PIO writes to
>>>>>> complete
>>>>>> (not read). Did I miss anything?
>>>>>>
>>>>>>> Does ARM even
>>>>>>>> have the concept of PIO?
>>>>>>>
>>>>>>> I am not 100% sure here, but it seems that doesn't have.
>>>>>>
>>>>>> Technically, the PIOs exist on Arm, however they are accessed the same
>>>>>> way
>>>>>> as
>>>>>> MMIO and will have a dedicated area defined by the HW.
>>>>>>
>>>>>> AFAICT, on Arm64, they are only used for PCI IO Bar.
>>>>>>
>>>>>> Now the question is whether we want to expose them to the Device
>>>>>> Emulator
>>>>>> as
>>>>>> PIO or MMIO access. From a generic PoV, a DM shouldn't have to care
>>>>>> about
>>>>>> the
>>>>>> architecture used. It should just be able to request a given IOport
>>>>>> region.
>>>>>>
>>>>>> So it may make sense to differentiate them in the common ioreq code as
>>>>>> well.
>>>>>>
>>>>>> I had a quick look at QEMU and wasn't able to tell if PIOs and MMIOs
>>>>>> address
>>>>>> space are different on Arm as well. Paul, Stefano, do you know what
>>>>>> they
>>>>>> are
>>>>>> doing?
>>>>>
>>>>> On the QEMU side, it looks like PIO (address_space_io) is used in
>>>>> connection with the emulation of the "in" or "out" instructions, see
>>>>> ioport.c:cpu_inb for instance. Some parts of PCI on QEMU emulate PIO
>>>>> space regardless of the architecture, such as
>>>>> hw/pci/pci_bridge.c:pci_bridge_initfn.
>>>>>
>>>>> However, because there is no "in" and "out" on ARM, I don't think
>>>>> address_space_io can be accessed. Specifically, there is no equivalent
>>>>> for target/i386/misc_helper.c:helper_inb on ARM.
>>>>
>>>> So how PCI I/O BAR are accessed? Surely, they could be used on Arm, right?
>>>
>>> PIO is also memory mapped on ARM and it seems to have its own MMIO
>>> address window.
>> This part is already well-understood :). However, this only tell us how an OS
>> is accessing a PIO.
>>
>> What I am trying to figure out is how the hardware (or QEMU) is meant to work.
>>
>>  From my understanding, the MMIO access will be received by the hostbridge and
>> then forwarded to the appropriate PCI device. The two questions I am trying to
>> answer is: How the I/O BARs are configured? Will it contain an MMIO address or
>> an offset?
>>
>> If the answer is the latter, then we will need PIO because a DM will never see
>> the MMIO address (the hostbridge will be emulated in Xen).
> 
> Now I understand the question :-)
> 
> This is the way I understand it works. Let's say that the PIO aperture
> is 0x1000-0x2000 which is aliased to 0x3eff0000-0x3eff1000.
> 0x1000-0x2000 are addresses that cannot be accessed directly.
> 0x3eff0000-0x3eff1000 is the range that works.
> 
> A PCI device PIO BAR will have an address in the 0x1000-0x2000 range,
> for instance 0x1100.
> 
> However, when the operating system access 0x1100, it will issue a read
> to 0x3eff0100.
> 
> Xen will trap the read to 0x3eff0100 and send it to QEMU.
> 
> QEMU has to know that 0x3eff0000-0x3eff1000 is the alias to the PIO
> aperture and that 0x3eff0100 correspond to PCI device foobar. Similarly,
> QEMU has also to know the address range of the MMIO aperture and its
> remappings, if any (it is possible to have address remapping for MMIO
> addresses too.)
> 
> I think today this information is "built-in" QEMU, not configurable. It
> works fine because *I think* the PCI aperture is pretty much the same on
> x86 boards, at least the one supported by QEMU for Xen.

Well, on x86 the OS will access PIO using inb/outb. So the address
received by Xen is in the 0x1000-0x2000 range and is then forwarded to
the DM using the PIO type.

> 
> On ARM, I think we should explicitly declare the PCI MMIO aperture and
> its alias/address-remapping. When we do that, we can also declare the
> PIO aperture and its alias/address-remapping.

Well yes, we need to define the PCI MMIO and PCI I/O regions because the
guest OS needs to know them.

However, I am unsure how this would help us solve the question of
whether accesses to the PCI I/O aperture should be sent as PIO or MMIO.

Per what you wrote, the PCI I/O BAR would be configured with the range
0x1000-0x2000. So a device emulator (which may not be QEMU and may
emulate only one PCI device!) will only see that range.

How does the device emulator then know that it needs to watch the region
0x3eff0000-0x3eff1000?

It feels to me that it would be easier/make more sense if the DM only
said "I want to watch the PIO range 0x1000-0x2000". Xen would then be in
charge of doing the translation between the OS view and the DM view.

This also means a DM would be completely arch-agnostic. This would
mirror the HW, where you can plug your PCI card into any machine.

Cheers,
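
A rough sketch (not a tested implementation) of what such a translation
in Xen could look like, reusing the example numbers from earlier in the
thread. The PCI_IO_* constants are invented; the ioreq_t field names and
IOREQ_* constants match xen/include/public/hvm/ioreq.h:

#include <xen/types.h>
#include <public/hvm/ioreq.h>

#define PCI_IO_ALIAS_BASE  0x3eff0000UL  /* guest-visible MMIO window    */
#define PCI_IO_PORT_BASE   0x1000UL      /* PIO range the DM asked for   */

/*
 * Rewrite a trapped access to the alias window into the PIO-space
 * address the DM registered for, and forward it with the PIO type so
 * the DM never has to know about the arch-specific alias.
 */
static void fwd_pio_access(ioreq_t *p, uint64_t gpa,
                           unsigned int size, bool is_read)
{
    p->type        = IOREQ_TYPE_PIO;     /* not IOREQ_TYPE_COPY (MMIO)   */
    p->addr        = PCI_IO_PORT_BASE + (gpa - PCI_IO_ALIAS_BASE);
    p->size        = size;
    p->dir         = is_read ? IOREQ_READ : IOREQ_WRITE;
    p->count       = 1;
    p->df          = 0;
    p->data_is_ptr = 0;
    p->state       = STATE_IOREQ_READY;  /* hand it off to the DM        */
}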
Stefano Stabellini Aug. 10, 2020, 11:34 p.m. UTC | #30
On Sat, 8 Aug 2020, Oleksandr wrote:
> On 08.08.20 00:50, Stefano Stabellini wrote:
> > On Fri, 7 Aug 2020, Oleksandr wrote:
> > > On 06.08.20 03:37, Stefano Stabellini wrote:
> > > 
> > > Hi Stefano
> > > 
> > > Trying to simulate IO_RETRY handling mechanism (according to model below)
> > > I
> > > continuously get IO_RETRY from try_fwd_ioserv() ...
> > > 
> > > > OK, thanks for the details. My interpretation seems to be correct.
> > > > 
> > > > In which case, it looks like xen/arch/arm/io.c:try_fwd_ioserv should
> > > > return IO_RETRY. Then, xen/arch/arm/traps.c:do_trap_stage2_abort_guest
> > > > also needs to handle try_handle_mmio returning IO_RETRY the first
> > > > around, and IO_HANDLED when after QEMU does its job.
> > > > 
> > > > What should do_trap_stage2_abort_guest do on IO_RETRY? Simply return
> > > > early and let the scheduler do its job? Something like:
> > > > 
> > > >               enum io_state state = try_handle_mmio(regs, hsr, gpa);
> > > > 
> > > >               switch ( state )
> > > >               {
> > > >               case IO_ABORT:
> > > >                   goto inject_abt;
> > > >               case IO_HANDLED:
> > > >                   advance_pc(regs, hsr);
> > > >                   return;
> > > >               case IO_RETRY:
> > > >                   /* finish later */
> > > >                   return;
> > > >               case IO_UNHANDLED:
> > > >                   /* IO unhandled, try another way to handle it. */
> > > >                   break;
> > > >               default:
> > > >                   ASSERT_UNREACHABLE();
> > > >               }
> > > > 
> > > > Then, xen/arch/arm/ioreq.c:handle_mmio() gets called by
> > > > handle_hvm_io_completion() after QEMU completes the emulation. Today,
> > > > handle_mmio just sets the user register with the read value.
> > > > 
> > > > But it would be better if it called again the original function
> > > > do_trap_stage2_abort_guest to actually retry the original operation.
> > > > This time do_trap_stage2_abort_guest calls try_handle_mmio() and gets
> > > > IO_HANDLED instead of IO_RETRY,
> > > I may miss some important point, but I failed to see why try_handle_mmio
> > > (try_fwd_ioserv) will return IO_HANDLED instead of IO_RETRY at this stage.
> > > Or current try_fwd_ioserv() logic needs rework?
> > I think you should check the ioreq->state in try_fwd_ioserv(), if the
> > result is ready, then ioreq->state should be STATE_IORESP_READY, and you
> > can return IO_HANDLED.
> > 
> > That is assuming that you are looking at the live version of the ioreq
> > shared with QEMU instead of a private copy of it, which I am not sure.
> > Looking at try_fwd_ioserv() it would seem that vio->io_req is just a
> > copy? The live version is returned by get_ioreq() ?
> 
> If I understand the code correctly, indeed, get_ioreq() returns live version
> shared with emulator.
> Desired state change (STATE_IORESP_READY) what actually the hvm_wait_for_io()
> is waiting for is set here (in my case):
> https://xenbits.xen.org/gitweb/?p=people/pauldu/demu.git;a=blob;f=demu.c;h=f785b394d0cf141dffa05bdddecf338214358aea;hb=refs/heads/master#l698 
> 
> > Even in handle_hvm_io_completion, instead of setting vio->io_req.state
> > to STATE_IORESP_READY by hand, it would be better to look at the live
> > version of the ioreq because QEMU will have already set ioreq->state
> > to STATE_IORESP_READY (hw/i386/xen/xen-hvm.c:cpu_handle_ioreq).
> It seems that after detecting STATE_IORESP_READY in hvm_wait_for_io() the
> state of live version is set to STATE_IOREQ_NONE immediately, so looking at
> the live version down the handle_hvm_io_completion()
> or in try_fwd_ioserv() shows us nothing I am afraid.

I see. That is because we want to "free" the ioreq as soon as possible,
which is good. handle_hvm_io_completion also sets vio->io_req.state to
STATE_IORESP_READY, so our private copy is still set to
STATE_IORESP_READY. Thus, try_fwd_ioserv should do the right thing
simply by reading vio->io_req.state: try_fwd_ioserv should be able to
return IO_HANDLED when the "state" is STATE_IORESP_READY, right?
Stefano Stabellini Aug. 10, 2020, 11:34 p.m. UTC | #31
On Mon, 10 Aug 2020, Oleksandr wrote:
> On 08.08.20 01:19, Oleksandr wrote:
> > On 08.08.20 00:50, Stefano Stabellini wrote:
> > > On Fri, 7 Aug 2020, Oleksandr wrote:
> > > > On 06.08.20 03:37, Stefano Stabellini wrote:
> > > > 
> > > > Hi Stefano
> > > > 
> > > > Trying to simulate IO_RETRY handling mechanism (according to model
> > > > below) I
> > > > continuously get IO_RETRY from try_fwd_ioserv() ...
> > > > 
> > > > > OK, thanks for the details. My interpretation seems to be correct.
> > > > > 
> > > > > In which case, it looks like xen/arch/arm/io.c:try_fwd_ioserv should
> > > > > return IO_RETRY. Then, xen/arch/arm/traps.c:do_trap_stage2_abort_guest
> > > > > also needs to handle try_handle_mmio returning IO_RETRY the first
> > > > > around, and IO_HANDLED when after QEMU does its job.
> > > > > 
> > > > > What should do_trap_stage2_abort_guest do on IO_RETRY? Simply return
> > > > > early and let the scheduler do its job? Something like:
> > > > > 
> > > > >               enum io_state state = try_handle_mmio(regs, hsr, gpa);
> > > > > 
> > > > >               switch ( state )
> > > > >               {
> > > > >               case IO_ABORT:
> > > > >                   goto inject_abt;
> > > > >               case IO_HANDLED:
> > > > >                   advance_pc(regs, hsr);
> > > > >                   return;
> > > > >               case IO_RETRY:
> > > > >                   /* finish later */
> > > > >                   return;
> > > > >               case IO_UNHANDLED:
> > > > >                   /* IO unhandled, try another way to handle it. */
> > > > >                   break;
> > > > >               default:
> > > > >                   ASSERT_UNREACHABLE();
> > > > >               }
> > > > > 
> > > > > Then, xen/arch/arm/ioreq.c:handle_mmio() gets called by
> > > > > handle_hvm_io_completion() after QEMU completes the emulation. Today,
> > > > > handle_mmio just sets the user register with the read value.
> > > > > 
> > > > > But it would be better if it called again the original function
> > > > > do_trap_stage2_abort_guest to actually retry the original operation.
> > > > > This time do_trap_stage2_abort_guest calls try_handle_mmio() and gets
> > > > > IO_HANDLED instead of IO_RETRY,
> > > > I may miss some important point, but I failed to see why try_handle_mmio
> > > > (try_fwd_ioserv) will return IO_HANDLED instead of IO_RETRY at this
> > > > stage.
> > > > Or current try_fwd_ioserv() logic needs rework?
> > > I think you should check the ioreq->state in try_fwd_ioserv(), if the
> > > result is ready, then ioreq->state should be STATE_IORESP_READY, and you
> > > can return IO_HANDLED.
> > 
> 
> I optimized test patch a bit (now it looks much simpler). I didn't face any
> issues during a quick test.

Both patches get much closer to following the proper state machine,
great! I think this patch is certainly a good improvement, and the
other, slightly larger patch you sent earlier is even better. It makes
the following additional changes that would be good to have:

- try_fwd_ioserv returns IO_HANDLED on state == STATE_IORESP_READY
- handle_mmio simply calls do_trap_stage2_abort_guest

I would also remove "handle_mmio_finish" and do the guest register
setting as well as setting vio->io_req.state to STATE_IOREQ_NONE
directly in try_fwd_ioserv.
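
Something along these lines could be a rough, untested sketch of that
consolidation inside try_fwd_ioserv() (the sign-extension handling
currently done in handle_mmio() is elided here, and treating an
unexpected state as an abort is an assumption, not taken from the
patches above):

    switch ( vio->io_req.state )
    {
    case STATE_IOREQ_NONE:
        break;

    case STATE_IORESP_READY:
        /*
         * Emulation already completed: for a read, put the returned
         * value into the destination register (sign extension elided),
         * then free the request and report the access as handled.
         */
        if ( !dabt.write )
            set_user_reg(regs, dabt.reg, vio->io_req.data);
        vio->io_req.state = STATE_IOREQ_NONE;
        return IO_HANDLED;

    default:
        printk("d%u wrong state %u\n", v->domain->domain_id,
               vio->io_req.state);
        return IO_ABORT;
    }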

 
> ---
>  xen/arch/arm/io.c    | 4 ----
>  xen/arch/arm/ioreq.c | 7 ++++++-
>  xen/arch/arm/traps.c | 4 +++-
>  3 files changed, 9 insertions(+), 6 deletions(-)
> 
> diff --git a/xen/arch/arm/io.c b/xen/arch/arm/io.c
> index 436f669..3063577 100644
> --- a/xen/arch/arm/io.c
> +++ b/xen/arch/arm/io.c
> @@ -156,10 +156,6 @@ static enum io_state try_fwd_ioserv(struct cpu_user_regs *regs,
>      else
>          vio->io_completion = HVMIO_mmio_completion;
> 
> -    /* XXX: Decide what to do */
> -    if ( rc == IO_RETRY )
> -        rc = IO_HANDLED;
> -
>      return rc;
>  }
>  #endif
> diff --git a/xen/arch/arm/ioreq.c b/xen/arch/arm/ioreq.c
> index 8f60c41..e5235c6 100644
> --- a/xen/arch/arm/ioreq.c
> +++ b/xen/arch/arm/ioreq.c
> @@ -33,6 +33,8 @@
>  #include <public/hvm/dm_op.h>
>  #include <public/hvm/ioreq.h>
> 
> +#include <asm/traps.h>
> +
>  bool handle_mmio(void)
>  {
>      struct vcpu *v = current;
> @@ -52,7 +54,7 @@ bool handle_mmio(void)
> 
>      /* XXX: Do we need to take care of write here ? */
>      if ( dabt.write )
> -        return true;
> +        goto done;
> 
>      /*
>       * Sign extend if required.
> @@ -72,6 +74,9 @@ bool handle_mmio(void)
> 
>      set_user_reg(regs, dabt.reg, r);
> 
> +done:
> +    advance_pc(regs, hsr);
> +
>      return true;
>  }
> 
> diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c
> index ea472d1..974c744 100644
> --- a/xen/arch/arm/traps.c
> +++ b/xen/arch/arm/traps.c
> @@ -1965,11 +1965,13 @@ static void do_trap_stage2_abort_guest(struct cpu_user_regs *regs,
>              case IO_HANDLED:
>                  advance_pc(regs, hsr);
>                  return;
> +            case IO_RETRY:
> +                /* finish later */
> +                return;
>              case IO_UNHANDLED:
>                  /* IO unhandled, try another way to handle it. */
>                  break;
>              default:
> -                /* XXX: Handle IO_RETRY */
>                  ASSERT_UNREACHABLE();
>              }
>          }
Stefano Stabellini Aug. 10, 2020, 11:34 p.m. UTC | #32
On Mon, 10 Aug 2020, Julien Grall wrote:
> On 07/08/2020 00:48, Stefano Stabellini wrote:
> > On Thu, 6 Aug 2020, Julien Grall wrote:
> > > On 06/08/2020 01:37, Stefano Stabellini wrote:
> > > > On Wed, 5 Aug 2020, Julien Grall wrote:
> > > > > On 04/08/2020 20:11, Stefano Stabellini wrote:
> > > > > > On Tue, 4 Aug 2020, Julien Grall wrote:
> > > > > > > On 04/08/2020 12:10, Oleksandr wrote:
> > > > > > > > On 04.08.20 10:45, Paul Durrant wrote:
> > > > > > > > > > +static inline bool hvm_ioreq_needs_completion(const ioreq_t
> > > > > > > > > > *ioreq)
> > > > > > > > > > +{
> > > > > > > > > > +    return ioreq->state == STATE_IOREQ_READY &&
> > > > > > > > > > +           !ioreq->data_is_ptr &&
> > > > > > > > > > +           (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir !=
> > > > > > > > > > IOREQ_WRITE);
> > > > > > > > > > +}
> > > > > > > > > I don't think having this in common code is correct. The
> > > > > > > > > short-cut
> > > > > > > > > of
> > > > > > > > > not
> > > > > > > > > completing PIO reads seems somewhat x86 specific.
> > > > > > > 
> > > > > > > Hmmm, looking at the code, I think it doesn't wait for PIO writes
> > > > > > > to
> > > > > > > complete
> > > > > > > (not read). Did I miss anything?
> > > > > > > 
> > > > > > > > Does ARM even
> > > > > > > > > have the concept of PIO?
> > > > > > > > 
> > > > > > > > I am not 100% sure here, but it seems that doesn't have.
> > > > > > > 
> > > > > > > Technically, the PIOs exist on Arm, however they are accessed the
> > > > > > > same
> > > > > > > way
> > > > > > > as
> > > > > > > MMIO and will have a dedicated area defined by the HW.
> > > > > > > 
> > > > > > > AFAICT, on Arm64, they are only used for PCI IO Bar.
> > > > > > > 
> > > > > > > Now the question is whether we want to expose them to the Device
> > > > > > > Emulator
> > > > > > > as
> > > > > > > PIO or MMIO access. From a generic PoV, a DM shouldn't have to
> > > > > > > care
> > > > > > > about
> > > > > > > the
> > > > > > > architecture used. It should just be able to request a given
> > > > > > > IOport
> > > > > > > region.
> > > > > > > 
> > > > > > > So it may make sense to differentiate them in the common ioreq
> > > > > > > code as
> > > > > > > well.
> > > > > > > 
> > > > > > > I had a quick look at QEMU and wasn't able to tell if PIOs and
> > > > > > > MMIOs
> > > > > > > address
> > > > > > > space are different on Arm as well. Paul, Stefano, do you know
> > > > > > > what
> > > > > > > they
> > > > > > > are
> > > > > > > doing?
> > > > > > 
> > > > > > On the QEMU side, it looks like PIO (address_space_io) is used in
> > > > > > connection with the emulation of the "in" or "out" instructions, see
> > > > > > ioport.c:cpu_inb for instance. Some parts of PCI on QEMU emulate PIO
> > > > > > space regardless of the architecture, such as
> > > > > > hw/pci/pci_bridge.c:pci_bridge_initfn.
> > > > > > 
> > > > > > However, because there is no "in" and "out" on ARM, I don't think
> > > > > > address_space_io can be accessed. Specifically, there is no
> > > > > > equivalent
> > > > > > for target/i386/misc_helper.c:helper_inb on ARM.
> > > > > 
> > > > > So how PCI I/O BAR are accessed? Surely, they could be used on Arm,
> > > > > right?
> > > > 
> > > > PIO is also memory mapped on ARM and it seems to have its own MMIO
> > > > address window.
> > > This part is already well-understood :). However, this only tell us how an
> > > OS
> > > is accessing a PIO.
> > > 
> > > What I am trying to figure out is how the hardware (or QEMU) is meant to
> > > work.
> > > 
> > >  From my understanding, the MMIO access will be received by the hostbridge
> > > and
> > > then forwarded to the appropriate PCI device. The two questions I am
> > > trying to
> > > answer is: How the I/O BARs are configured? Will it contain an MMIO
> > > address or
> > > an offset?
> > > 
> > > If the answer is the latter, then we will need PIO because a DM will never
> > > see
> > > the MMIO address (the hostbridge will be emulated in Xen).
> > 
> > Now I understand the question :-)
> > 
> > This is the way I understand it works. Let's say that the PIO aperture
> > is 0x1000-0x2000 which is aliased to 0x3eff0000-0x3eff1000.
> > 0x1000-0x2000 are addresses that cannot be accessed directly.
> > 0x3eff0000-0x3eff1000 is the range that works.
> > 
> > A PCI device PIO BAR will have an address in the 0x1000-0x2000 range,
> > for instance 0x1100.
> > 
> > However, when the operating system access 0x1100, it will issue a read
> > to 0x3eff0100.
> > 
> > Xen will trap the read to 0x3eff0100 and send it to QEMU.
> > 
> > QEMU has to know that 0x3eff0000-0x3eff1000 is the alias to the PIO
> > aperture and that 0x3eff0100 correspond to PCI device foobar. Similarly,
> > QEMU has also to know the address range of the MMIO aperture and its
> > remappings, if any (it is possible to have address remapping for MMIO
> > addresses too.)
> > 
> > I think today this information is "built-in" QEMU, not configurable. It
> > works fine because *I think* the PCI aperture is pretty much the same on
> > x86 boards, at least the one supported by QEMU for Xen.
> 
> Well on x86, the OS will access PIO using inb/outb. So the address received by
> Xen is 0x1000-0x2000 and then forwarded to the DM using the PIO type.
> 
> > On ARM, I think we should explicitly declare the PCI MMIO aperture and
> > its alias/address-remapping. When we do that, we can also declare the
> > PIO aperture and its alias/address-remapping.
> 
> Well yes, we need to define PCI MMIO and PCI I/O region because the guest OS
> needs to know them.

[1]
(see below)


> However, I am unsure how this would help us to solve the question whether
> access to the PCI I/O aperture should be sent as a PIO or MMIO.
> 
> Per what you wrote, the PCI I/O Bar would be configured with the range
> 0x1000-0x2000. So a device emulator (this may not be QEMU and only emulate one
> PCI device!!) will only see that range.
> 
> How does the device-emulator then know that it needs to watch the region
> 0x3eff0000-0x3eff1000?

It would know because the PCI PIO aperture, together with its alias, is
specified [1].
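
As a minimal sketch of what the DM would do with that information,
using the example numbers from this thread (the window layout here is
purely illustrative):

    #include <inttypes.h>
    #include <stdio.h>

    /* PIO aperture 0x1000-0x2000, aliased at 0x3eff0000-0x3eff1000. */
    #define ALIAS_BASE 0x3eff0000ULL
    #define ALIAS_SIZE 0x1000ULL
    #define PIO_BASE   0x1000ULL

    /* Map a trapped MMIO address in the alias window back to the PIO
     * port the guest actually targeted; returns 0 if out of range. */
    static int mmio_to_pio(uint64_t mmio, uint64_t *port)
    {
        if ( mmio < ALIAS_BASE || mmio >= ALIAS_BASE + ALIAS_SIZE )
            return 0;
        *port = PIO_BASE + (mmio - ALIAS_BASE);
        return 1;
    }

    int main(void)
    {
        uint64_t port;

        if ( mmio_to_pio(0x3eff0100, &port) )
            printf("PIO port 0x%" PRIx64 "\n", port); /* 0x1100 */
        return 0;
    }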


> It feels to me that it would be easier/make more sense if the DM only say "I
> want to watch the PIO range 0x1000-0x2000". So Xen would be in charge to do
> the translation between the OS view and the DM view.
> 
> This also means a DM would be completely arch-agnostic. This would follow the
> HW where you can plug your PCI card on any HW.

As you know, PIO access is actually not modelled by QEMU for ARM
targets. I worry about its long-term stability, given that it is
untested: qemu-system-aarch64 could have a broken PIO emulation and
nobody would find out except us, when we send ioreqs to it.

Thinking of the Xen/emulator interface on ARM, is it wise to rely on an
access type that doesn't exist on the architecture?
Julien Grall Aug. 11, 2020, 9:19 a.m. UTC | #33
Hi Stefano,

On 11/08/2020 00:34, Stefano Stabellini wrote:
> On Mon, 10 Aug 2020, Oleksandr wrote:
>> On 08.08.20 01:19, Oleksandr wrote:
>>> On 08.08.20 00:50, Stefano Stabellini wrote:
>>>> On Fri, 7 Aug 2020, Oleksandr wrote:
>>>>> On 06.08.20 03:37, Stefano Stabellini wrote:
>>>>>
>>>>> Hi Stefano
>>>>>
>>>>> Trying to simulate IO_RETRY handling mechanism (according to model
>>>>> below) I
>>>>> continuously get IO_RETRY from try_fwd_ioserv() ...
>>>>>
>>>>>> OK, thanks for the details. My interpretation seems to be correct.
>>>>>>
>>>>>> In which case, it looks like xen/arch/arm/io.c:try_fwd_ioserv should
>>>>>> return IO_RETRY. Then, xen/arch/arm/traps.c:do_trap_stage2_abort_guest
>>>>>> also needs to handle try_handle_mmio returning IO_RETRY the first
>>>>>> around, and IO_HANDLED when after QEMU does its job.
>>>>>>
>>>>>> What should do_trap_stage2_abort_guest do on IO_RETRY? Simply return
>>>>>> early and let the scheduler do its job? Something like:
>>>>>>
>>>>>>                enum io_state state = try_handle_mmio(regs, hsr, gpa);
>>>>>>
>>>>>>                switch ( state )
>>>>>>                {
>>>>>>                case IO_ABORT:
>>>>>>                    goto inject_abt;
>>>>>>                case IO_HANDLED:
>>>>>>                    advance_pc(regs, hsr);
>>>>>>                    return;
>>>>>>                case IO_RETRY:
>>>>>>                    /* finish later */
>>>>>>                    return;
>>>>>>                case IO_UNHANDLED:
>>>>>>                    /* IO unhandled, try another way to handle it. */
>>>>>>                    break;
>>>>>>                default:
>>>>>>                    ASSERT_UNREACHABLE();
>>>>>>                }
>>>>>>
>>>>>> Then, xen/arch/arm/ioreq.c:handle_mmio() gets called by
>>>>>> handle_hvm_io_completion() after QEMU completes the emulation. Today,
>>>>>> handle_mmio just sets the user register with the read value.
>>>>>>
>>>>>> But it would be better if it called again the original function
>>>>>> do_trap_stage2_abort_guest to actually retry the original operation.
>>>>>> This time do_trap_stage2_abort_guest calls try_handle_mmio() and gets
>>>>>> IO_HANDLED instead of IO_RETRY,
>>>>> I may miss some important point, but I failed to see why try_handle_mmio
>>>>> (try_fwd_ioserv) will return IO_HANDLED instead of IO_RETRY at this
>>>>> stage.
>>>>> Or current try_fwd_ioserv() logic needs rework?
>>>> I think you should check the ioreq->state in try_fwd_ioserv(), if the
>>>> result is ready, then ioreq->state should be STATE_IORESP_READY, and you
>>>> can return IO_HANDLED.
>>>
>>
>> I optimized test patch a bit (now it looks much simpler). I didn't face any
>> issues during a quick test.
> 
> Both patches get much closer to following the proper state machine,
> great! I think this patch is certainly a good improvement. I think the
> other patch you sent earlier, slightly larger, is even better. It makes
> the following additional changes that would be good to have:
> 
> - try_fwd_ioserv returns IO_HANDLED on state == STATE_IORESP_READY
> - handle_mmio simply calls do_trap_stage2_abort_guest

I don't think we should call do_trap_stage2_abort_guest() as part of the 
completion because:
     * The function do_trap_stage2_abort_guest() uses registers that 
are not context-switched (such as FAR_EL2). I/O handling is split in 
two, likely with a context switch in the middle. The second part is the 
completion (i.e. the call to handle_mmio()), so the system registers 
will be incorrect by then.
     * A big chunk of do_trap_stage2_abort_guest() is not necessary for 
the completion. For instance, there is no need to translate the guest 
virtual address to a guest physical address again.

Therefore the version below is probably the best approach.
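
To make the first hazard concrete, here is a toy model of it (the
global below stands in for the real, unsaved register; this is a
simulation, not hypervisor code):

    #include <stdio.h>
    #include <stdint.h>

    /* FAR_EL2 is per-pCPU state that is not saved on vCPU context
     * switch, so the completion path must not re-read it. */
    static uint64_t far_el2;

    static void trap(uint64_t fault_addr) { far_el2 = fault_addr; }

    int main(void)
    {
        trap(0x3eff0100); /* vCPU A faults; I/O forwarded to the DM */
        trap(0xdeadbeef); /* context switch: vCPU B faults meanwhile */

        /* vCPU A's completion runs now; re-deriving the address from
         * FAR_EL2 would pick up vCPU B's fault address instead: */
        printf("stale FAR_EL2 = 0x%lx\n", (unsigned long)far_el2);
        return 0;
    }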

Cheers,
Oleksandr Tyshchenko Aug. 11, 2020, 10:10 a.m. UTC | #34
On 11.08.20 12:19, Julien Grall wrote:

Hi Julien, Stefano

> Hi Stefano,
>
> On 11/08/2020 00:34, Stefano Stabellini wrote:
>> On Mon, 10 Aug 2020, Oleksandr wrote:
>>> On 08.08.20 01:19, Oleksandr wrote:
>>>> On 08.08.20 00:50, Stefano Stabellini wrote:
>>>>> On Fri, 7 Aug 2020, Oleksandr wrote:
>>>>>> On 06.08.20 03:37, Stefano Stabellini wrote:
>>>>>>
>>>>>> Hi Stefano
>>>>>>
>>>>>> Trying to simulate IO_RETRY handling mechanism (according to model
>>>>>> below) I
>>>>>> continuously get IO_RETRY from try_fwd_ioserv() ...
>>>>>>
>>>>>>> OK, thanks for the details. My interpretation seems to be correct.
>>>>>>>
>>>>>>> In which case, it looks like xen/arch/arm/io.c:try_fwd_ioserv 
>>>>>>> should
>>>>>>> return IO_RETRY. Then, 
>>>>>>> xen/arch/arm/traps.c:do_trap_stage2_abort_guest
>>>>>>> also needs to handle try_handle_mmio returning IO_RETRY the first
>>>>>>> around, and IO_HANDLED when after QEMU does its job.
>>>>>>>
>>>>>>> What should do_trap_stage2_abort_guest do on IO_RETRY? Simply 
>>>>>>> return
>>>>>>> early and let the scheduler do its job? Something like:
>>>>>>>
>>>>>>>                enum io_state state = try_handle_mmio(regs, hsr, 
>>>>>>> gpa);
>>>>>>>
>>>>>>>                switch ( state )
>>>>>>>                {
>>>>>>>                case IO_ABORT:
>>>>>>>                    goto inject_abt;
>>>>>>>                case IO_HANDLED:
>>>>>>>                    advance_pc(regs, hsr);
>>>>>>>                    return;
>>>>>>>                case IO_RETRY:
>>>>>>>                    /* finish later */
>>>>>>>                    return;
>>>>>>>                case IO_UNHANDLED:
>>>>>>>                    /* IO unhandled, try another way to handle 
>>>>>>> it. */
>>>>>>>                    break;
>>>>>>>                default:
>>>>>>>                    ASSERT_UNREACHABLE();
>>>>>>>                }
>>>>>>>
>>>>>>> Then, xen/arch/arm/ioreq.c:handle_mmio() gets called by
>>>>>>> handle_hvm_io_completion() after QEMU completes the emulation. 
>>>>>>> Today,
>>>>>>> handle_mmio just sets the user register with the read value.
>>>>>>>
>>>>>>> But it would be better if it called again the original function
>>>>>>> do_trap_stage2_abort_guest to actually retry the original 
>>>>>>> operation.
>>>>>>> This time do_trap_stage2_abort_guest calls try_handle_mmio() and 
>>>>>>> gets
>>>>>>> IO_HANDLED instead of IO_RETRY,
>>>>>> I may miss some important point, but I failed to see why 
>>>>>> try_handle_mmio
>>>>>> (try_fwd_ioserv) will return IO_HANDLED instead of IO_RETRY at this
>>>>>> stage.
>>>>>> Or current try_fwd_ioserv() logic needs rework?
>>>>> I think you should check the ioreq->state in try_fwd_ioserv(), if the
>>>>> result is ready, then ioreq->state should be STATE_IORESP_READY, 
>>>>> and you
>>>>> can return IO_HANDLED.
>>>>
>>>
>>> I optimized test patch a bit (now it looks much simpler). I didn't 
>>> face any
>>> issues during a quick test.
>>
>> Both patches get much closer to following the proper state machine,
>> great! I think this patch is certainly a good improvement. I think the
>> other patch you sent earlier, slightly larger, is even better. It makes
>> the following additional changes that would be good to have:
>>
>> - try_fwd_ioserv returns IO_HANDLED on state == STATE_IORESP_READY
>> - handle_mmio simply calls do_trap_stage2_abort_guest
>
> I don't think we should call do_trap_stage2_abort_guest() as part of 
> the completion because:
>     * The function do_trap_stage2_abort_guest() is using registers 
> that are not context switched (such as FAR_EL2). I/O handling is split 
> in two with likely a context switch in the middle. The second part is 
> the completion (i.e call to handle_mmio()). So the system registers 
> will be incorrect.
>     * A big chunk of do_trap_stage2_abort_guest() is not necessary for 
> the completion. For instance, there is no need to try to translate the 
> guest virtual address to a guest physical address.
>
> Therefore the version below is probably the best approach.


Indeed, the first version (calling do_trap_stage2_abort_guest() for the 
completion) is racy. When testing it more heavily I occasionally hit an 
issue that resulted in the DomU getting stuck completely:

(XEN) d2v1: vGICD: bad read width 0 r11 offset 0x000f00

I didn't investigate the issue in detail, but I assumed the code in 
do_trap_stage2_abort_guest() caused it. This was the main reason I 
decided to simplify the initial patch (keeping only advance_pc).
Reading Julien's answer, I now understand what could happen.
Julien Grall Aug. 11, 2020, 11:28 a.m. UTC | #35
On 11/08/2020 00:34, Stefano Stabellini wrote:
> On Mon, 10 Aug 2020, Julien Grall wrote:
>> On 07/08/2020 00:48, Stefano Stabellini wrote:
>>> On Thu, 6 Aug 2020, Julien Grall wrote:
>>>> On 06/08/2020 01:37, Stefano Stabellini wrote:
>>>>> On Wed, 5 Aug 2020, Julien Grall wrote:
>>>>>> On 04/08/2020 20:11, Stefano Stabellini wrote:
>>>>>>> On Tue, 4 Aug 2020, Julien Grall wrote:
>>>>>>>> On 04/08/2020 12:10, Oleksandr wrote:
>>>>>>>>> On 04.08.20 10:45, Paul Durrant wrote:
>>>>>>>>>>> +static inline bool hvm_ioreq_needs_completion(const ioreq_t
>>>>>>>>>>> *ioreq)
>>>>>>>>>>> +{
>>>>>>>>>>> +    return ioreq->state == STATE_IOREQ_READY &&
>>>>>>>>>>> +           !ioreq->data_is_ptr &&
>>>>>>>>>>> +           (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir !=
>>>>>>>>>>> IOREQ_WRITE);
>>>>>>>>>>> +}
>>>>>>>>>> I don't think having this in common code is correct. The
>>>>>>>>>> short-cut
>>>>>>>>>> of
>>>>>>>>>> not
>>>>>>>>>> completing PIO reads seems somewhat x86 specific.
>>>>>>>>
>>>>>>>> Hmmm, looking at the code, I think it doesn't wait for PIO writes
>>>>>>>> to
>>>>>>>> complete
>>>>>>>> (not read). Did I miss anything?
>>>>>>>>
>>>>>>>>> Does ARM even
>>>>>>>>>> have the concept of PIO?
>>>>>>>>>
>>>>>>>>> I am not 100% sure here, but it seems that doesn't have.
>>>>>>>>
>>>>>>>> Technically, the PIOs exist on Arm, however they are accessed the
>>>>>>>> same
>>>>>>>> way
>>>>>>>> as
>>>>>>>> MMIO and will have a dedicated area defined by the HW.
>>>>>>>>
>>>>>>>> AFAICT, on Arm64, they are only used for PCI IO Bar.
>>>>>>>>
>>>>>>>> Now the question is whether we want to expose them to the Device
>>>>>>>> Emulator
>>>>>>>> as
>>>>>>>> PIO or MMIO access. From a generic PoV, a DM shouldn't have to
>>>>>>>> care
>>>>>>>> about
>>>>>>>> the
>>>>>>>> architecture used. It should just be able to request a given
>>>>>>>> IOport
>>>>>>>> region.
>>>>>>>>
>>>>>>>> So it may make sense to differentiate them in the common ioreq
>>>>>>>> code as
>>>>>>>> well.
>>>>>>>>
>>>>>>>> I had a quick look at QEMU and wasn't able to tell if PIOs and
>>>>>>>> MMIOs
>>>>>>>> address
>>>>>>>> space are different on Arm as well. Paul, Stefano, do you know
>>>>>>>> what
>>>>>>>> they
>>>>>>>> are
>>>>>>>> doing?
>>>>>>>
>>>>>>> On the QEMU side, it looks like PIO (address_space_io) is used in
>>>>>>> connection with the emulation of the "in" or "out" instructions, see
>>>>>>> ioport.c:cpu_inb for instance. Some parts of PCI on QEMU emulate PIO
>>>>>>> space regardless of the architecture, such as
>>>>>>> hw/pci/pci_bridge.c:pci_bridge_initfn.
>>>>>>>
>>>>>>> However, because there is no "in" and "out" on ARM, I don't think
>>>>>>> address_space_io can be accessed. Specifically, there is no
>>>>>>> equivalent
>>>>>>> for target/i386/misc_helper.c:helper_inb on ARM.
>>>>>>
>>>>>> So how PCI I/O BAR are accessed? Surely, they could be used on Arm,
>>>>>> right?
>>>>>
>>>>> PIO is also memory mapped on ARM and it seems to have its own MMIO
>>>>> address window.
>>>> This part is already well-understood :). However, this only tell us how an
>>>> OS
>>>> is accessing a PIO.
>>>>
>>>> What I am trying to figure out is how the hardware (or QEMU) is meant to
>>>> work.
>>>>
>>>>   From my understanding, the MMIO access will be received by the hostbridge
>>>> and
>>>> then forwarded to the appropriate PCI device. The two questions I am
>>>> trying to
>>>> answer is: How the I/O BARs are configured? Will it contain an MMIO
>>>> address or
>>>> an offset?
>>>>
>>>> If the answer is the latter, then we will need PIO because a DM will never
>>>> see
>>>> the MMIO address (the hostbridge will be emulated in Xen).
>>>
>>> Now I understand the question :-)
>>>
>>> This is the way I understand it works. Let's say that the PIO aperture
>>> is 0x1000-0x2000 which is aliased to 0x3eff0000-0x3eff1000.
>>> 0x1000-0x2000 are addresses that cannot be accessed directly.
>>> 0x3eff0000-0x3eff1000 is the range that works.
>>>
>>> A PCI device PIO BAR will have an address in the 0x1000-0x2000 range,
>>> for instance 0x1100.

Are you sure about this?

>>>
>>> However, when the operating system access 0x1100, it will issue a read
>>> to 0x3eff0100.
>>>
>>> Xen will trap the read to 0x3eff0100 and send it to QEMU.
>>>
>>> QEMU has to know that 0x3eff0000-0x3eff1000 is the alias to the PIO
>>> aperture and that 0x3eff0100 correspond to PCI device foobar. Similarly,
>>> QEMU has also to know the address range of the MMIO aperture and its
>>> remappings, if any (it is possible to have address remapping for MMIO
>>> addresses too.)
>>>
>>> I think today this information is "built-in" QEMU, not configurable. It
>>> works fine because *I think* the PCI aperture is pretty much the same on
>>> x86 boards, at least the one supported by QEMU for Xen.
>>
>> Well on x86, the OS will access PIO using inb/outb. So the address received by
>> Xen is 0x1000-0x2000 and then forwarded to the DM using the PIO type.
>>
>>> On ARM, I think we should explicitly declare the PCI MMIO aperture and
>>> its alias/address-remapping. When we do that, we can also declare the
>>> PIO aperture and its alias/address-remapping.
>>
>> Well yes, we need to define PCI MMIO and PCI I/O region because the guest OS
>> needs to know them.
> 
> [1]
> (see below)
> 
> 
>> However, I am unsure how this would help us to solve the question whether
>> access to the PCI I/O aperture should be sent as a PIO or MMIO.
>>
>> Per what you wrote, the PCI I/O Bar would be configured with the range
>> 0x1000-0x2000. So a device emulator (this may not be QEMU and only emulate one
>> PCI device!!) will only see that range.
>>
>> How does the device-emulator then know that it needs to watch the region
>> 0x3eff0000-0x3eff1000?
> 
> It would know because the PCI PIO aperture, together with the alias, are
> specified [1].

Are you suggesting fixing it in the ABI, or passing it as runtime 
information to the Device Emulator?

> 
> 
>> It feels to me that it would be easier/make more sense if the DM only say "I
>> want to watch the PIO range 0x1000-0x2000". So Xen would be in charge to do
>> the translation between the OS view and the DM view.
>>
>> This also means a DM would be completely arch-agnostic. This would follow the
>> HW where you can plug your PCI card on any HW.
> 
> As you know, PIO access is actually not modelled by QEMU for ARM
> targets. I worry about the long term stability of it, given that it is
> untested.  I.e. qemu-system-aarch64 could have a broken PIO emulation
> and nobody would find out except for us when we send ioreqs to it.

There are multiple references to PIO in QEMU for Arm (see 
hw/arm/virt.c). So what do you mean by not modelled?

> Thinking from a Xen/Emulator interface on ARM, is it wise to rely on an
> access-type that doesn't exist on the architecture?

The architecture doesn't define an instruction to access PIO; however, 
this doesn't mean such accesses don't exist on the platform.

For instance, a PCI device may have an I/O BAR. On Arm64, the hostbridge 
is responsible for translating the MMIO access into a PIO access for 
the PCI device.
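
A sketch of what that split could look like from Xen's side: the
hypervisor owns the alias layout and forwards the access as an
arch-agnostic PIO request, so the DM only ever sees ports. The struct
below merely mirrors the ioreq_t fields quoted earlier in this thread
(the real definition lives in public/hvm/ioreq.h, and the constants
here are illustrative, not the real values):

    #include <inttypes.h>
    #include <stdio.h>

    typedef struct {
        uint64_t addr;  /* port number, in the DM's view */
        uint64_t data;
        uint32_t size;
        uint8_t  type;  /* PIO vs MMIO request */
        uint8_t  dir;   /* 0 = read, 1 = write, per IOREQ_READ/WRITE */
    } ioreq_stub_t;

    enum { TYPE_PIO_STUB = 0 }; /* stands in for IOREQ_TYPE_PIO */

    /* Translate a trapped access in the alias window (OS view) into a
     * PIO request (DM view) before forwarding it. */
    static ioreq_stub_t fwd_as_pio(uint64_t trapped_mmio, int is_write)
    {
        ioreq_stub_t req = { 0 };

        req.type = TYPE_PIO_STUB;
        req.addr = 0x1000 + (trapped_mmio - 0x3eff0000);
        req.dir  = is_write;
        req.size = 4;
        return req;
    }

    int main(void)
    {
        ioreq_stub_t req = fwd_as_pio(0x3eff0100, 0);

        printf("DM sees port 0x%" PRIx64 "\n", req.addr); /* 0x1100 */
        return 0;
    }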

I have the impression that we disagree on what the Device Emulator is 
meant to do. IMHO, the goal of the device emulator is to emulate a 
device in an arch-agnostic way.

Cheers,
Stefano Stabellini Aug. 11, 2020, 10:47 p.m. UTC | #36
On Tue, 11 Aug 2020, Oleksandr wrote:
> On 11.08.20 12:19, Julien Grall wrote:
> > On 11/08/2020 00:34, Stefano Stabellini wrote:
> > > On Mon, 10 Aug 2020, Oleksandr wrote:
> > > > On 08.08.20 01:19, Oleksandr wrote:
> > > > > On 08.08.20 00:50, Stefano Stabellini wrote:
> > > > > > On Fri, 7 Aug 2020, Oleksandr wrote:
> > > > > > > On 06.08.20 03:37, Stefano Stabellini wrote:
> > > > > > > 
> > > > > > > Hi Stefano
> > > > > > > 
> > > > > > > Trying to simulate IO_RETRY handling mechanism (according to model
> > > > > > > below) I
> > > > > > > continuously get IO_RETRY from try_fwd_ioserv() ...
> > > > > > > 
> > > > > > > > OK, thanks for the details. My interpretation seems to be
> > > > > > > > correct.
> > > > > > > > 
> > > > > > > > In which case, it looks like xen/arch/arm/io.c:try_fwd_ioserv
> > > > > > > > should
> > > > > > > > return IO_RETRY. Then,
> > > > > > > > xen/arch/arm/traps.c:do_trap_stage2_abort_guest
> > > > > > > > also needs to handle try_handle_mmio returning IO_RETRY the
> > > > > > > > first
> > > > > > > > around, and IO_HANDLED when after QEMU does its job.
> > > > > > > > 
> > > > > > > > What should do_trap_stage2_abort_guest do on IO_RETRY? Simply
> > > > > > > > return
> > > > > > > > early and let the scheduler do its job? Something like:
> > > > > > > > 
> > > > > > > >                enum io_state state = try_handle_mmio(regs, hsr,
> > > > > > > > gpa);
> > > > > > > > 
> > > > > > > >                switch ( state )
> > > > > > > >                {
> > > > > > > >                case IO_ABORT:
> > > > > > > >                    goto inject_abt;
> > > > > > > >                case IO_HANDLED:
> > > > > > > >                    advance_pc(regs, hsr);
> > > > > > > >                    return;
> > > > > > > >                case IO_RETRY:
> > > > > > > >                    /* finish later */
> > > > > > > >                    return;
> > > > > > > >                case IO_UNHANDLED:
> > > > > > > >                    /* IO unhandled, try another way to handle
> > > > > > > > it. */
> > > > > > > >                    break;
> > > > > > > >                default:
> > > > > > > >                    ASSERT_UNREACHABLE();
> > > > > > > >                }
> > > > > > > > 
> > > > > > > > Then, xen/arch/arm/ioreq.c:handle_mmio() gets called by
> > > > > > > > handle_hvm_io_completion() after QEMU completes the emulation.
> > > > > > > > Today,
> > > > > > > > handle_mmio just sets the user register with the read value.
> > > > > > > > 
> > > > > > > > But it would be better if it called again the original function
> > > > > > > > do_trap_stage2_abort_guest to actually retry the original
> > > > > > > > operation.
> > > > > > > > This time do_trap_stage2_abort_guest calls try_handle_mmio() and
> > > > > > > > gets
> > > > > > > > IO_HANDLED instead of IO_RETRY,
> > > > > > > I may miss some important point, but I failed to see why
> > > > > > > try_handle_mmio
> > > > > > > (try_fwd_ioserv) will return IO_HANDLED instead of IO_RETRY at
> > > > > > > this
> > > > > > > stage.
> > > > > > > Or current try_fwd_ioserv() logic needs rework?
> > > > > > I think you should check the ioreq->state in try_fwd_ioserv(), if
> > > > > > the
> > > > > > result is ready, then ioreq->state should be STATE_IORESP_READY, and
> > > > > > you
> > > > > > can return IO_HANDLED.
> > > > > 
> > > > 
> > > > I optimized test patch a bit (now it looks much simpler). I didn't face
> > > > any
> > > > issues during a quick test.
> > > 
> > > Both patches get much closer to following the proper state machine,
> > > great! I think this patch is certainly a good improvement. I think the
> > > other patch you sent earlier, slightly larger, is even better. It makes
> > > the following additional changes that would be good to have:
> > > 
> > > - try_fwd_ioserv returns IO_HANDLED on state == STATE_IORESP_READY
> > > - handle_mmio simply calls do_trap_stage2_abort_guest
> > 
> > I don't think we should call do_trap_stage2_abort_guest() as part of the
> > completion because:
> >     * The function do_trap_stage2_abort_guest() is using registers that are
> > not context switched (such as FAR_EL2). I/O handling is split in two with
> > likely a context switch in the middle. The second part is the completion
> > (i.e call to handle_mmio()). So the system registers will be incorrect.
> >     * A big chunk of do_trap_stage2_abort_guest() is not necessary for the
> > completion. For instance, there is no need to try to translate the guest
> > virtual address to a guest physical address.
> > 
> > Therefore the version below is probably the best approach.
> 
> 
> Indeed, the first version (with calling do_trap_stage2_abort_guest() for a
> completion) is a racy. When testing it more heavily I faced an issue
> (sometimes) which resulted in DomU got stuck completely.
> 
> (XEN) d2v1: vGICD: bad read width 0 r11 offset 0x000f00
> 
> I didn't investigate an issue in detail, but I assumed that code in
> do_trap_stage2_abort_guest() caused that. This was the main reason why I
> decided to optimize an initial patch (and took only advance_pc).
> Reading Julien's answer I understand now what could happen.

From your and Julien's feedback it is clear that calling
do_trap_stage2_abort_guest() is not possible and not a good idea.


The reason for my suggestion was to complete the implementation of the
state machine so that "RETRY" actually means "let's retry the
emulation", with the second attempt returning "HANDLED".

Looking at this again, we could achieve the same goal in a better way by
moving the register setting from "handle_mmio" to "try_handle_mmio" and
also calling "try_handle_mmio" from "handle_mmio". Note that handle_mmio
would become almost empty like on x86.

1) do_trap_stage2_abort_guest ->
       try_handle_mmio ->
            try_fwd_ioserv ->
                IO_RETRY

2) handle_hvm_io_completion ->
       handle_mmio ->
           try_handle_mmio ->
               try_fwd_ioserv ->
                   IO_HANDLED


It is very similar to your second patch, with a small change: calling
try_handle_mmio from handle_mmio and setting the register there. Do you
think that would work?
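
A standalone simulation of that two-pass flow, to spell out how the
same entry point returns IO_RETRY first and IO_HANDLED on completion
(stub state only, not the Xen implementation):

    #include <stdio.h>

    enum io_state  { IO_ABORT, IO_HANDLED, IO_RETRY, IO_UNHANDLED };
    enum req_state { STATE_IOREQ_NONE, STATE_IOREQ_READY,
                     STATE_IORESP_READY };

    static enum req_state ioreq_state = STATE_IOREQ_NONE;

    /* Pass 1: no response yet, so forward to the device model and ask
     * the caller to come back later. Pass 2: the DM has responded. */
    static enum io_state try_fwd_ioserv_stub(void)
    {
        switch ( ioreq_state )
        {
        case STATE_IOREQ_NONE:
            ioreq_state = STATE_IOREQ_READY; /* hand off to the DM */
            return IO_RETRY;
        case STATE_IORESP_READY:
            ioreq_state = STATE_IOREQ_NONE;  /* consume the response */
            return IO_HANDLED;
        default:
            return IO_UNHANDLED;
        }
    }

    int main(void)
    {
        /* 1) do_trap_stage2_abort_guest path */
        printf("pass 1 retry:   %d\n",
               try_fwd_ioserv_stub() == IO_RETRY);

        ioreq_state = STATE_IORESP_READY; /* the DM completes the req */

        /* 2) handle_hvm_io_completion -> handle_mmio path */
        printf("pass 2 handled: %d\n",
               try_fwd_ioserv_stub() == IO_HANDLED);
        return 0;
    }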
Stefano Stabellini Aug. 11, 2020, 10:48 p.m. UTC | #37
On Tue, 11 Aug 2020, Julien Grall wrote:
> On 11/08/2020 00:34, Stefano Stabellini wrote:
> > On Mon, 10 Aug 2020, Julien Grall wrote:
> > > On 07/08/2020 00:48, Stefano Stabellini wrote:
> > > > On Thu, 6 Aug 2020, Julien Grall wrote:
> > > > > On 06/08/2020 01:37, Stefano Stabellini wrote:
> > > > > > On Wed, 5 Aug 2020, Julien Grall wrote:
> > > > > > > On 04/08/2020 20:11, Stefano Stabellini wrote:
> > > > > > > > On Tue, 4 Aug 2020, Julien Grall wrote:
> > > > > > > > > On 04/08/2020 12:10, Oleksandr wrote:
> > > > > > > > > > On 04.08.20 10:45, Paul Durrant wrote:
> > > > > > > > > > > > +static inline bool hvm_ioreq_needs_completion(const
> > > > > > > > > > > > ioreq_t
> > > > > > > > > > > > *ioreq)
> > > > > > > > > > > > +{
> > > > > > > > > > > > +    return ioreq->state == STATE_IOREQ_READY &&
> > > > > > > > > > > > +           !ioreq->data_is_ptr &&
> > > > > > > > > > > > +           (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir
> > > > > > > > > > > > !=
> > > > > > > > > > > > IOREQ_WRITE);
> > > > > > > > > > > > +}
> > > > > > > > > > > I don't think having this in common code is correct. The
> > > > > > > > > > > short-cut
> > > > > > > > > > > of
> > > > > > > > > > > not
> > > > > > > > > > > completing PIO reads seems somewhat x86 specific.
> > > > > > > > > 
> > > > > > > > > Hmmm, looking at the code, I think it doesn't wait for PIO
> > > > > > > > > writes
> > > > > > > > > to
> > > > > > > > > complete
> > > > > > > > > (not read). Did I miss anything?
> > > > > > > > > 
> > > > > > > > > > Does ARM even
> > > > > > > > > > > have the concept of PIO?
> > > > > > > > > > 
> > > > > > > > > > I am not 100% sure here, but it seems that doesn't have.
> > > > > > > > > 
> > > > > > > > > Technically, the PIOs exist on Arm, however they are accessed
> > > > > > > > > the
> > > > > > > > > same
> > > > > > > > > way
> > > > > > > > > as
> > > > > > > > > MMIO and will have a dedicated area defined by the HW.
> > > > > > > > > 
> > > > > > > > > AFAICT, on Arm64, they are only used for PCI IO Bar.
> > > > > > > > > 
> > > > > > > > > Now the question is whether we want to expose them to the
> > > > > > > > > Device
> > > > > > > > > Emulator
> > > > > > > > > as
> > > > > > > > > PIO or MMIO access. From a generic PoV, a DM shouldn't have to
> > > > > > > > > care
> > > > > > > > > about
> > > > > > > > > the
> > > > > > > > > architecture used. It should just be able to request a given
> > > > > > > > > IOport
> > > > > > > > > region.
> > > > > > > > > 
> > > > > > > > > So it may make sense to differentiate them in the common ioreq
> > > > > > > > > code as
> > > > > > > > > well.
> > > > > > > > > 
> > > > > > > > > I had a quick look at QEMU and wasn't able to tell if PIOs and
> > > > > > > > > MMIOs
> > > > > > > > > address
> > > > > > > > > space are different on Arm as well. Paul, Stefano, do you know
> > > > > > > > > what
> > > > > > > > > they
> > > > > > > > > are
> > > > > > > > > doing?
> > > > > > > > 
> > > > > > > > On the QEMU side, it looks like PIO (address_space_io) is used
> > > > > > > > in
> > > > > > > > connection with the emulation of the "in" or "out" instructions,
> > > > > > > > see
> > > > > > > > ioport.c:cpu_inb for instance. Some parts of PCI on QEMU emulate
> > > > > > > > PIO
> > > > > > > > space regardless of the architecture, such as
> > > > > > > > hw/pci/pci_bridge.c:pci_bridge_initfn.
> > > > > > > > 
> > > > > > > > However, because there is no "in" and "out" on ARM, I don't
> > > > > > > > think
> > > > > > > > address_space_io can be accessed. Specifically, there is no
> > > > > > > > equivalent
> > > > > > > > for target/i386/misc_helper.c:helper_inb on ARM.
> > > > > > > 
> > > > > > > So how PCI I/O BAR are accessed? Surely, they could be used on
> > > > > > > Arm,
> > > > > > > right?
> > > > > > 
> > > > > > PIO is also memory mapped on ARM and it seems to have its own MMIO
> > > > > > address window.
> > > > > This part is already well-understood :). However, this only tell us
> > > > > how an
> > > > > OS
> > > > > is accessing a PIO.
> > > > > 
> > > > > What I am trying to figure out is how the hardware (or QEMU) is meant
> > > > > to
> > > > > work.
> > > > > 
> > > > >   From my understanding, the MMIO access will be received by the
> > > > > hostbridge
> > > > > and
> > > > > then forwarded to the appropriate PCI device. The two questions I am
> > > > > trying to
> > > > > answer is: How the I/O BARs are configured? Will it contain an MMIO
> > > > > address or
> > > > > an offset?
> > > > > 
> > > > > If the answer is the latter, then we will need PIO because a DM will
> > > > > never
> > > > > see
> > > > > the MMIO address (the hostbridge will be emulated in Xen).
> > > > 
> > > > Now I understand the question :-)
> > > > 
> > > > This is the way I understand it works. Let's say that the PIO aperture
> > > > is 0x1000-0x2000 which is aliased to 0x3eff0000-0x3eff1000.
> > > > 0x1000-0x2000 are addresses that cannot be accessed directly.
> > > > 0x3eff0000-0x3eff1000 is the range that works.
> > > > 
> > > > A PCI device PIO BAR will have an address in the 0x1000-0x2000 range,
> > > > for instance 0x1100.
> 
> Are you sure about this?

I am pretty sure, but only from reading the code. It would be great if
somebody ran QEMU and actually tested it. This is important because it
could make the whole discussion moot :-)


> > > > However, when the operating system access 0x1100, it will issue a read
> > > > to 0x3eff0100.
> > > > 
> > > > Xen will trap the read to 0x3eff0100 and send it to QEMU.
> > > > 
> > > > QEMU has to know that 0x3eff0000-0x3eff1000 is the alias to the PIO
> > > > aperture and that 0x3eff0100 correspond to PCI device foobar. Similarly,
> > > > QEMU has also to know the address range of the MMIO aperture and its
> > > > remappings, if any (it is possible to have address remapping for MMIO
> > > > addresses too.)
> > > > 
> > > > I think today this information is "built-in" QEMU, not configurable. It
> > > > works fine because *I think* the PCI aperture is pretty much the same on
> > > > x86 boards, at least the one supported by QEMU for Xen.
> > > 
> > > Well on x86, the OS will access PIO using inb/outb. So the address
> > > received by
> > > Xen is 0x1000-0x2000 and then forwarded to the DM using the PIO type.
> > > 
> > > > On ARM, I think we should explicitly declare the PCI MMIO aperture and
> > > > its alias/address-remapping. When we do that, we can also declare the
> > > > PIO aperture and its alias/address-remapping.
> > > 
> > > Well yes, we need to define PCI MMIO and PCI I/O region because the guest
> > > OS
> > > needs to know them.
> > 
> > [1]
> > (see below)
> > 
> > 
> > > However, I am unsure how this would help us to solve the question whether
> > > access to the PCI I/O aperture should be sent as a PIO or MMIO.
> > > 
> > > Per what you wrote, the PCI I/O Bar would be configured with the range
> > > 0x1000-0x2000. So a device emulator (this may not be QEMU and only emulate
> > > one
> > > PCI device!!) will only see that range.
> > > 
> > > How does the device-emulator then know that it needs to watch the region
> > > 0x3eff0000-0x3eff1000?
> > 
> > It would know because the PCI PIO aperture, together with the alias, are
> > specified [1].
> 
> Are you suggesting fix it in the ABI or pass it as runtime information to the
> Device Emulator?

I am suggesting of "fixing" it in the ABI. Whether we pass it at
runtime or not is less important I think.
 
 
> > > It feels to me that it would be easier/make more sense if the DM only say
> > > "I
> > > want to watch the PIO range 0x1000-0x2000". So Xen would be in charge to
> > > do
> > > the translation between the OS view and the DM view.
> > > 
> > > This also means a DM would be completely arch-agnostic. This would follow
> > > the
> > > HW where you can plug your PCI card on any HW.
> > 
> > As you know, PIO access is actually not modelled by QEMU for ARM
> > targets. I worry about the long term stability of it, given that it is
> > untested.  I.e. qemu-system-aarch64 could have a broken PIO emulation
> > and nobody would find out except for us when we send ioreqs to it.
> 
> There are multiple references of PIO in the QEMU for Arm (see hw/arm/virt.c).
> So what do you mean by not modelled?

I mean that PIO is only emulated as an MMIO region, not as port-mapped I/O.


> > Thinking from a Xen/Emulator interface on ARM, is it wise to rely on an
> > access-type that doesn't exist on the architecture?
> 
> The architecture doesn't define an instruction to access PIO, however this
> doesn't mean such access doesn't exist on the platform.
> 
> For instance, PCI device may have I/O BAR. On Arm64, the hostbridge will be
> responsible to do the translation between the MMIO access to a PIO access for
> the PCI device.

As far as I understand, the host bridge is responsible for any
translations between the host address space and the address space of
the devices; even MMIO addresses get translated. The hostbridge doesn't
do any port-mapped I/O to communicate with the device: there is a
different protocol for those communications. Port-mapped I/O is just
how the hostbridge exposes the device to the host.

If we wanted to emulate the hostbridge/device interface properly we
would end up with something pretty different.


> I have the impression that we disagree in what the Device Emulator is meant to
> do. IHMO, the goal of the device emulator is to emulate a device in an
> arch-agnostic way.

That would be great in theory but I am not sure it is achievable: if we
use an existing emulator like QEMU, even a single device has to fit
into QEMU's view of the world, which makes assumptions about host
bridges and apertures. It is impossible today to build QEMU in an
arch-agnostic way; it has to be tied to an architecture.

I realize we are not building this interface for QEMU specifically, but
even if we try to make the interface arch-agnostic, in reality the
emulators won't be arch-agnostic. If we send a port-mapped I/O request
to qemu-system-aarch64, who knows what is going to happen: it is a code
path that is not explicitly tested.
Julien Grall Aug. 12, 2020, 8:19 a.m. UTC | #38
Hi,

On 11/08/2020 23:48, Stefano Stabellini wrote:
>> I have the impression that we disagree on what the Device Emulator is meant to
>> do. IMHO, the goal of the device emulator is to emulate a device in an
>> arch-agnostic way.
> 
> That would be great in theory but I am not sure it is achievable: if we
> use an existing emulator like QEMU, even a single device has to fit
> into QEMU's view of the world, which makes assumptions about host
> bridges and apertures. It is impossible today to build QEMU in an
> arch-agnostic way, it has to be tied to an architecture.

AFAICT, the only reason QEMU cannot be built in an arch-agnostic way is 
TCG. If that weren't built in, then you could easily write a machine 
that doesn't depend on the instruction set.

The proof is that, today, we are using QEMU x86 to serve Arm64 guests, 
although this is only for PV drivers.

> 
> I realize we are not building this interface for QEMU specifically, but
> even if we try to make the interface arch-agnostic, in reality the
> emulators won't be arch-agnostic.

This depends on your goal. If your goal is to write a standalone 
emulator for a single device, then it is entirely possible to make it 
arch-agnostic.

Per above, this would even be possible if you were emulating a set of 
devices.

What I want to avoid is requiring all the emulators to contain 
arch-specific code just because it is easier to get QEMU working on Xen 
on Arm.

> If we send a port-mapped I/O request
> to qemu-system-aarch64 who knows what is going to happen: it is a code
> path that it is not explicitly tested.

Maybe, maybe not. To me these are mostly software issues that can easily 
be mitigated if we do proper testing...

Cheers,
Oleksandr Tyshchenko Aug. 12, 2020, 2:35 p.m. UTC | #39
On 12.08.20 01:47, Stefano Stabellini wrote:

Hi Stefano

> On Tue, 11 Aug 2020, Oleksandr wrote:
>> On 11.08.20 12:19, Julien Grall wrote:
>>> On 11/08/2020 00:34, Stefano Stabellini wrote:
>>>> On Mon, 10 Aug 2020, Oleksandr wrote:
>>>>> On 08.08.20 01:19, Oleksandr wrote:
>>>>>> On 08.08.20 00:50, Stefano Stabellini wrote:
>>>>>>> On Fri, 7 Aug 2020, Oleksandr wrote:
>>>>>>>> On 06.08.20 03:37, Stefano Stabellini wrote:
>>>>>>>>
>>>>>>>> Hi Stefano
>>>>>>>>
>>>>>>>> Trying to simulate IO_RETRY handling mechanism (according to model
>>>>>>>> below) I
>>>>>>>> continuously get IO_RETRY from try_fwd_ioserv() ...
>>>>>>>>
>>>>>>>>> OK, thanks for the details. My interpretation seems to be
>>>>>>>>> correct.
>>>>>>>>>
>>>>>>>>> In which case, it looks like xen/arch/arm/io.c:try_fwd_ioserv
>>>>>>>>> should
>>>>>>>>> return IO_RETRY. Then,
>>>>>>>>> xen/arch/arm/traps.c:do_trap_stage2_abort_guest
>>>>>>>>> also needs to handle try_handle_mmio returning IO_RETRY the
>>>>>>>>> first
>>>>>>>>> around, and IO_HANDLED when after QEMU does its job.
>>>>>>>>>
>>>>>>>>> What should do_trap_stage2_abort_guest do on IO_RETRY? Simply
>>>>>>>>> return
>>>>>>>>> early and let the scheduler do its job? Something like:
>>>>>>>>>
>>>>>>>>>                 enum io_state state = try_handle_mmio(regs, hsr,
>>>>>>>>> gpa);
>>>>>>>>>
>>>>>>>>>                 switch ( state )
>>>>>>>>>                 {
>>>>>>>>>                 case IO_ABORT:
>>>>>>>>>                     goto inject_abt;
>>>>>>>>>                 case IO_HANDLED:
>>>>>>>>>                     advance_pc(regs, hsr);
>>>>>>>>>                     return;
>>>>>>>>>                 case IO_RETRY:
>>>>>>>>>                     /* finish later */
>>>>>>>>>                     return;
>>>>>>>>>                 case IO_UNHANDLED:
>>>>>>>>>                     /* IO unhandled, try another way to handle
>>>>>>>>> it. */
>>>>>>>>>                     break;
>>>>>>>>>                 default:
>>>>>>>>>                     ASSERT_UNREACHABLE();
>>>>>>>>>                 }
>>>>>>>>>
>>>>>>>>> Then, xen/arch/arm/ioreq.c:handle_mmio() gets called by
>>>>>>>>> handle_hvm_io_completion() after QEMU completes the emulation.
>>>>>>>>> Today,
>>>>>>>>> handle_mmio just sets the user register with the read value.
>>>>>>>>>
>>>>>>>>> But it would be better if it called again the original function
>>>>>>>>> do_trap_stage2_abort_guest to actually retry the original
>>>>>>>>> operation.
>>>>>>>>> This time do_trap_stage2_abort_guest calls try_handle_mmio() and
>>>>>>>>> gets
>>>>>>>>> IO_HANDLED instead of IO_RETRY,
>>>>>>>> I may miss some important point, but I failed to see why
>>>>>>>> try_handle_mmio
>>>>>>>> (try_fwd_ioserv) will return IO_HANDLED instead of IO_RETRY at
>>>>>>>> this
>>>>>>>> stage.
>>>>>>>> Or current try_fwd_ioserv() logic needs rework?
>>>>>>> I think you should check the ioreq->state in try_fwd_ioserv(), if
>>>>>>> the
>>>>>>> result is ready, then ioreq->state should be STATE_IORESP_READY, and
>>>>>>> you
>>>>>>> can return IO_HANDLED.
>>>>> I optimized test patch a bit (now it looks much simpler). I didn't face
>>>>> any
>>>>> issues during a quick test.
>>>> Both patches get much closer to following the proper state machine,
>>>> great! I think this patch is certainly a good improvement. I think the
>>>> other patch you sent earlier, slightly larger, is even better. It makes
>>>> the following additional changes that would be good to have:
>>>>
>>>> - try_fwd_ioserv returns IO_HANDLED on state == STATE_IORESP_READY
>>>> - handle_mmio simply calls do_trap_stage2_abort_guest
>>> I don't think we should call do_trap_stage2_abort_guest() as part of the
>>> completion because:
>>>      * The function do_trap_stage2_abort_guest() is using registers that are
>>> not context switched (such as FAR_EL2). I/O handling is split in two with
>>> likely a context switch in the middle. The second part is the completion
>>> (i.e call to handle_mmio()). So the system registers will be incorrect.
>>>      * A big chunk of do_trap_stage2_abort_guest() is not necessary for the
>>> completion. For instance, there is no need to try to translate the guest
>>> virtual address to a guest physical address.
>>>
>>> Therefore the version below is probably the best approach.
>>
>> Indeed, the first version (with calling do_trap_stage2_abort_guest() for a
>> completion) is a racy. When testing it more heavily I faced an issue
>> (sometimes) which resulted in DomU got stuck completely.
>>
>> (XEN) d2v1: vGICD: bad read width 0 r11 offset 0x000f00
>>
>> I didn't investigate an issue in detail, but I assumed that code in
>> do_trap_stage2_abort_guest() caused that. This was the main reason why I
>> decided to optimize an initial patch (and took only advance_pc).
>> Reading Julien's answer I understand now what could happen.
>  From your and Julien's feedback it is clear that calling
> do_trap_stage2_abort_guest() is not possible and not a good idea.
>
>
> The reason for my suggestion was to complete the implementation of the
> state machine so that "RETRY" actually means "let's try again the
> emulation" but the second time it will return "HANDLED".
>
> Looking at this again, we could achieve the same goal in a better way by
> moving the register setting from "handle_mmio" to "try_handle_mmio" and
> also calling "try_handle_mmio" from "handle_mmio". Note that handle_mmio
> would become almost empty like on x86.
>
> 1) do_trap_stage2_abort_guest ->
>         try_handle_mmio ->
>              try_fwd_ioserv ->
>                  IO_RETRY
>
> 2) handle_hvm_io_completion ->
>         handle_mmio ->
>             try_handle_mmio ->
>                 try_fwd_ioserv ->
>                     IO_HANDLED
>
>
> It is very similar to your second patch with a small change on calling
> try_handle_mmio from handle_mmio and setting the register there. Do you
> think that would work?
If I understood your suggestion correctly and implemented it properly, 
then it works; at least I didn't face any issues during testing.
I think this variant adds some extra operations compared to the 
previous one (for example, an attempt to find an MMIO handler in 
try_handle_mmio). But if you think the new variant is cleaner and better 
represents how the state machine should look, I would be absolutely OK 
with taking this variant for the non-RFC series. Please note, there was 
a request to move try_fwd_ioserv() to arm/ioreq.c (I am going to move 
the new handle_ioserv() as well).


---
  xen/arch/arm/io.c    | 47 +++++++++++++++++++++++++++++++++++++++++++----
  xen/arch/arm/ioreq.c | 38 +++++++-------------------------------
  xen/arch/arm/traps.c |  4 +++-
  3 files changed, 53 insertions(+), 36 deletions(-)

diff --git a/xen/arch/arm/io.c b/xen/arch/arm/io.c
index 436f669..4db7c55 100644
--- a/xen/arch/arm/io.c
+++ b/xen/arch/arm/io.c
@@ -109,6 +109,43 @@ static const struct mmio_handler *find_mmio_handler(struct domain *d,
  }

  #ifdef CONFIG_IOREQ_SERVER
+static enum io_state handle_ioserv(struct cpu_user_regs *regs, struct vcpu *v)
+{
+    const union hsr hsr = { .bits = regs->hsr };
+    const struct hsr_dabt dabt = hsr.dabt;
+    /* Code is similar to handle_read */
+    uint8_t size = (1 << dabt.size) * 8;
+    register_t r = v->arch.hvm.hvm_io.io_req.data;
+
+    /* We are done with the IO */
+    /* XXX: Is it the right place? */
+    v->arch.hvm.hvm_io.io_req.state = STATE_IOREQ_NONE;
+
+    /* XXX: Do we need to take care of write here ? */
+    if ( dabt.write )
+        return IO_HANDLED;
+
+    /*
+     * Sign extend if required.
+     * Note that we expect the read handler to have zeroed the bits
+     * outside the requested access size.
+     */
+    if ( dabt.sign && (r & (1UL << (size - 1))) )
+    {
+        /*
+         * We are relying on register_t using the same as
+         * an unsigned long in order to keep the 32-bit assembly
+         * code smaller.
+         */
+        BUILD_BUG_ON(sizeof(register_t) != sizeof(unsigned long));
+        r |= (~0UL) << size;
+    }
+
+    set_user_reg(regs, dabt.reg, r);
+
+    return IO_HANDLED;
+}
+
  static enum io_state try_fwd_ioserv(struct cpu_user_regs *regs,
                                      struct vcpu *v, mmio_info_t *info)
  {
@@ -130,6 +167,10 @@ static enum io_state try_fwd_ioserv(struct cpu_user_regs *regs,
      {
      case STATE_IOREQ_NONE:
          break;
+
+    case STATE_IORESP_READY:
+        return IO_HANDLED;
+
      default:
          printk("d%u wrong state %u\n", v->domain->domain_id,
                 vio->io_req.state);
@@ -156,10 +197,6 @@ static enum io_state try_fwd_ioserv(struct cpu_user_regs *regs,
      else
          vio->io_completion = HVMIO_mmio_completion;

-    /* XXX: Decide what to do */
-    if ( rc == IO_RETRY )
-        rc = IO_HANDLED;
-
      return rc;
  }
  #endif
@@ -185,6 +222,8 @@ enum io_state try_handle_mmio(struct cpu_user_regs *regs,

  #ifdef CONFIG_IOREQ_SERVER
          rc = try_fwd_ioserv(regs, v, &info);
+        if ( rc == IO_HANDLED )
+            return handle_ioserv(regs, v);
  #endif

          return rc;
diff --git a/xen/arch/arm/ioreq.c b/xen/arch/arm/ioreq.c
index 8f60c41..9068b8d 100644
--- a/xen/arch/arm/ioreq.c
+++ b/xen/arch/arm/ioreq.c
@@ -33,46 +33,22 @@
  #include <public/hvm/dm_op.h>
  #include <public/hvm/ioreq.h>

+#include <asm/traps.h>
+
  bool handle_mmio(void)
  {
      struct vcpu *v = current;
      struct cpu_user_regs *regs = guest_cpu_user_regs();
      const union hsr hsr = { .bits = regs->hsr };
-    const struct hsr_dabt dabt = hsr.dabt;
-    /* Code is similar to handle_read */
-    uint8_t size = (1 << dabt.size) * 8;
-    register_t r = v->arch.hvm.hvm_io.io_req.data;
-
-    /* We should only be here on Guest Data Abort */
-    ASSERT(dabt.ec == HSR_EC_DATA_ABORT_LOWER_EL);
+    paddr_t addr = v->arch.hvm.hvm_io.io_req.addr;

-    /* We are done with the IO */
-    /* XXX: Is it the right place? */
-    v->arch.hvm.hvm_io.io_req.state = STATE_IOREQ_NONE;
-
-    /* XXX: Do we need to take care of write here ? */
-    if ( dabt.write )
-        return true;
-
-    /*
-     * Sign extend if required.
-     * Note that we expect the read handler to have zeroed the bits
-     * outside the requested access size.
-     */
-    if ( dabt.sign && (r & (1UL << (size - 1))) )
+    if ( try_handle_mmio(regs, hsr, addr) == IO_HANDLED )
      {
-        /*
-         * We are relying on register_t using the same as
-         * an unsigned long in order to keep the 32-bit assembly
-         * code smaller.
-         */
-        BUILD_BUG_ON(sizeof(register_t) != sizeof(unsigned long));
-        r |= (~0UL) << size;
+        advance_pc(regs, hsr);
+        return true;
      }

-    set_user_reg(regs, dabt.reg, r);
-
-    return true;
+    return false;
  }

  /* Ask ioemu mapcache to invalidate mappings. */
diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c
index ea472d1..974c744 100644
--- a/xen/arch/arm/traps.c
+++ b/xen/arch/arm/traps.c
@@ -1965,11 +1965,13 @@ static void do_trap_stage2_abort_guest(struct cpu_user_regs *regs,
              case IO_HANDLED:
                  advance_pc(regs, hsr);
                  return;
+            case IO_RETRY:
+                /* finish later */
+                return;
              case IO_UNHANDLED:
                  /* IO unhandled, try another way to handle it. */
                  break;
              default:
-                /* XXX: Handle IO_RETRY */
                  ASSERT_UNREACHABLE();
              }
          }
Stefano Stabellini Aug. 12, 2020, 11:08 p.m. UTC | #40
On Wed, 12 Aug 2020, Oleksandr wrote:
> On 12.08.20 01:47, Stefano Stabellini wrote:
> > On Tue, 11 Aug 2020, Oleksandr wrote:
> > > On 11.08.20 12:19, Julien Grall wrote:
> > > > On 11/08/2020 00:34, Stefano Stabellini wrote:
> > > > > On Mon, 10 Aug 2020, Oleksandr wrote:
> > > > > > On 08.08.20 01:19, Oleksandr wrote:
> > > > > > > On 08.08.20 00:50, Stefano Stabellini wrote:
> > > > > > > > On Fri, 7 Aug 2020, Oleksandr wrote:
> > > > > > > > > On 06.08.20 03:37, Stefano Stabellini wrote:
> > > > > > > > > 
> > > > > > > > > Hi Stefano
> > > > > > > > > 
> > > > > > > > > Trying to simulate IO_RETRY handling mechanism (according to
> > > > > > > > > model
> > > > > > > > > below) I
> > > > > > > > > continuously get IO_RETRY from try_fwd_ioserv() ...
> > > > > > > > > 
> > > > > > > > > > OK, thanks for the details. My interpretation seems to be
> > > > > > > > > > correct.
> > > > > > > > > > 
> > > > > > > > > > In which case, it looks like
> > > > > > > > > > xen/arch/arm/io.c:try_fwd_ioserv
> > > > > > > > > > should
> > > > > > > > > > return IO_RETRY. Then,
> > > > > > > > > > xen/arch/arm/traps.c:do_trap_stage2_abort_guest
> > > > > > > > > > also needs to handle try_handle_mmio returning IO_RETRY the
> > > > > > > > > > first
> > > > > > > > > > around, and IO_HANDLED when after QEMU does its job.
> > > > > > > > > > 
> > > > > > > > > > What should do_trap_stage2_abort_guest do on IO_RETRY?
> > > > > > > > > > Simply
> > > > > > > > > > return
> > > > > > > > > > early and let the scheduler do its job? Something like:
> > > > > > > > > > 
> > > > > > > > > >                 enum io_state state = try_handle_mmio(regs,
> > > > > > > > > > hsr,
> > > > > > > > > > gpa);
> > > > > > > > > > 
> > > > > > > > > >                 switch ( state )
> > > > > > > > > >                 {
> > > > > > > > > >                 case IO_ABORT:
> > > > > > > > > >                     goto inject_abt;
> > > > > > > > > >                 case IO_HANDLED:
> > > > > > > > > >                     advance_pc(regs, hsr);
> > > > > > > > > >                     return;
> > > > > > > > > >                 case IO_RETRY:
> > > > > > > > > >                     /* finish later */
> > > > > > > > > >                     return;
> > > > > > > > > >                 case IO_UNHANDLED:
> > > > > > > > > >                     /* IO unhandled, try another way to
> > > > > > > > > > handle
> > > > > > > > > > it. */
> > > > > > > > > >                     break;
> > > > > > > > > >                 default:
> > > > > > > > > >                     ASSERT_UNREACHABLE();
> > > > > > > > > >                 }
> > > > > > > > > > 
> > > > > > > > > > Then, xen/arch/arm/ioreq.c:handle_mmio() gets called by
> > > > > > > > > > handle_hvm_io_completion() after QEMU completes the
> > > > > > > > > > emulation.
> > > > > > > > > > Today,
> > > > > > > > > > handle_mmio just sets the user register with the read value.
> > > > > > > > > > 
> > > > > > > > > > But it would be better if it called again the original
> > > > > > > > > > function
> > > > > > > > > > do_trap_stage2_abort_guest to actually retry the original
> > > > > > > > > > operation.
> > > > > > > > > > This time do_trap_stage2_abort_guest calls try_handle_mmio()
> > > > > > > > > > and
> > > > > > > > > > gets
> > > > > > > > > > IO_HANDLED instead of IO_RETRY,
> > > > > > > > > I may miss some important point, but I failed to see why
> > > > > > > > > try_handle_mmio
> > > > > > > > > (try_fwd_ioserv) will return IO_HANDLED instead of IO_RETRY at
> > > > > > > > > this
> > > > > > > > > stage.
> > > > > > > > > Or current try_fwd_ioserv() logic needs rework?
> > > > > > > > I think you should check the ioreq->state in try_fwd_ioserv(),
> > > > > > > > if
> > > > > > > > the
> > > > > > > > result is ready, then ioreq->state should be STATE_IORESP_READY,
> > > > > > > > and
> > > > > > > > you
> > > > > > > > can return IO_HANDLED.
> > > > > > I optimized test patch a bit (now it looks much simpler). I didn't
> > > > > > face
> > > > > > any
> > > > > > issues during a quick test.
> > > > > Both patches get much closer to following the proper state machine,
> > > > > great! I think this patch is certainly a good improvement. I think the
> > > > > other patch you sent earlier, slightly larger, is even better. It
> > > > > makes
> > > > > the following additional changes that would be good to have:
> > > > > 
> > > > > - try_fwd_ioserv returns IO_HANDLED on state == STATE_IORESP_READY
> > > > > - handle_mmio simply calls do_trap_stage2_abort_guest
> > > > I don't think we should call do_trap_stage2_abort_guest() as part of the
> > > > completion because:
> > > >      * The function do_trap_stage2_abort_guest() is using registers that
> > > > are
> > > > not context switched (such as FAR_EL2). I/O handling is split in two
> > > > with
> > > > likely a context switch in the middle. The second part is the completion
> > > > (i.e. the call to handle_mmio()). So the system registers will be incorrect.
> > > >      * A big chunk of do_trap_stage2_abort_guest() is not necessary for
> > > > the
> > > > completion. For instance, there is no need to try to translate the guest
> > > > virtual address to a guest physical address.
> > > > 
> > > > Therefore the version below is probably the best approach.
> > > 
> > > Indeed, the first version (calling do_trap_stage2_abort_guest() for the
> > > completion) is racy. When testing it more heavily I (sometimes) faced an
> > > issue which resulted in the DomU getting stuck completely.
> > > 
> > > (XEN) d2v1: vGICD: bad read width 0 r11 offset 0x000f00
> > > 
> > > I didn't investigate the issue in detail, but I assumed that code in
> > > do_trap_stage2_abort_guest() caused it. This was the main reason why I
> > > decided to optimize the initial patch (and took only advance_pc).
> > > Reading Julien's answer, I now understand what could happen.
> > From your and Julien's feedback it is clear that calling
> > do_trap_stage2_abort_guest() is not possible and not a good idea.
> > 
> > 
> > The reason for my suggestion was to complete the implementation of the
> > state machine so that "RETRY" actually means "let's try the emulation
> > again", with the second attempt returning "HANDLED".
> > 
> > Looking at this again, we could achieve the same goal in a better way by
> > moving the register setting from "handle_mmio" to "try_handle_mmio" and
> > also calling "try_handle_mmio" from "handle_mmio". Note that handle_mmio
> > would become almost empty like on x86.
> > 
> > 1) do_trap_stage2_abort_guest ->
> >         try_handle_mmio ->
> >              try_fwd_ioserv ->
> >                  IO_RETRY
> > 
> > 2) handle_hvm_io_completion ->
> >         handle_mmio ->
> >             try_handle_mmio ->
> >                 try_fwd_ioserv ->
> >                     IO_HANDLED
> > 
> > 
> > It is very similar to your second patch with a small change on calling
> > try_handle_mmio from handle_mmio and setting the register there. Do you
> > think that would work?
> If I understood what you had suggested correctly and implemented it
> properly, then it works; at least I didn't face any issues during testing.
> I think this variant adds some extra operations compared to the previous
> one (for example an attempt to find an mmio handler in try_handle_mmio).
> But, if you think the new variant is cleaner and better represents how the
> state machine should look, I would be absolutely OK with taking this
> variant for the non-RFC series. Please note, there was a request to move
> try_fwd_ioserv() to arm/ioreq.c (I am going to move the new
> handle_ioserv() as well).
 
Yes, I think this version better represents the state machine, thanks
for looking into it. I think it is good.

In terms of number of operations, it doesn't look very concerning (in
the sense that it doesn't seem to add that many ops.) However we could
maybe improve it by passing a reference to the right mmio handler from
handle_mmio to try_handle_mmio if we have it. Or maybe we could save a
reference to the mmio handler as part of v->arch.hvm.hvm_io.io_req.

In any case, I think it is fine.

 
> ---
>  xen/arch/arm/io.c    | 47 +++++++++++++++++++++++++++++++++++++++++++----
>  xen/arch/arm/ioreq.c | 38 +++++++-------------------------------
>  xen/arch/arm/traps.c |  4 +++-
>  3 files changed, 53 insertions(+), 36 deletions(-)
> 
> diff --git a/xen/arch/arm/io.c b/xen/arch/arm/io.c
> index 436f669..4db7c55 100644
> --- a/xen/arch/arm/io.c
> +++ b/xen/arch/arm/io.c
> @@ -109,6 +109,43 @@ static const struct mmio_handler
> *find_mmio_handler(struct domain *d,
>  }
> 
>  #ifdef CONFIG_IOREQ_SERVER
> +static enum io_state handle_ioserv(struct cpu_user_regs *regs, struct vcpu
> *v)
> +{
> +    const union hsr hsr = { .bits = regs->hsr };
> +    const struct hsr_dabt dabt = hsr.dabt;
> +    /* Code is similar to handle_read */
> +    uint8_t size = (1 << dabt.size) * 8;
> +    register_t r = v->arch.hvm.hvm_io.io_req.data;
> +
> +    /* We are done with the IO */
> +    /* XXX: Is it the right place? */
> +    v->arch.hvm.hvm_io.io_req.state = STATE_IOREQ_NONE;
> +
> +    /* XXX: Do we need to take care of write here ? */
> +    if ( dabt.write )
> +        return IO_HANDLED;
> +
> +    /*
> +     * Sign extend if required.
> +     * Note that we expect the read handler to have zeroed the bits
> +     * outside the requested access size.
> +     */
> +    if ( dabt.sign && (r & (1UL << (size - 1))) )
> +    {
> +        /*
> +         * We are relying on register_t using the same as
> +         * an unsigned long in order to keep the 32-bit assembly
> +         * code smaller.
> +         */
> +        BUILD_BUG_ON(sizeof(register_t) != sizeof(unsigned long));
> +        r |= (~0UL) << size;
> +    }
> +
> +    set_user_reg(regs, dabt.reg, r);
> +
> +    return IO_HANDLED;
> +}
> +
>  static enum io_state try_fwd_ioserv(struct cpu_user_regs *regs,
>                                      struct vcpu *v, mmio_info_t *info)
>  {
> @@ -130,6 +167,10 @@ static enum io_state try_fwd_ioserv(struct cpu_user_regs
> *regs,
>      {
>      case STATE_IOREQ_NONE:
>          break;
> +
> +    case STATE_IORESP_READY:
> +        return IO_HANDLED;
> +
>      default:
>          printk("d%u wrong state %u\n", v->domain->domain_id,
>                 vio->io_req.state);
> @@ -156,10 +197,6 @@ static enum io_state try_fwd_ioserv(struct cpu_user_regs
> *regs,
>      else
>          vio->io_completion = HVMIO_mmio_completion;
> 
> -    /* XXX: Decide what to do */
> -    if ( rc == IO_RETRY )
> -        rc = IO_HANDLED;
> -
>      return rc;
>  }
>  #endif
> @@ -185,6 +222,8 @@ enum io_state try_handle_mmio(struct cpu_user_regs *regs,
> 
>  #ifdef CONFIG_IOREQ_SERVER
>          rc = try_fwd_ioserv(regs, v, &info);
> +        if ( rc == IO_HANDLED )
> +            return handle_ioserv(regs, v);
>  #endif
> 
>          return rc;
> diff --git a/xen/arch/arm/ioreq.c b/xen/arch/arm/ioreq.c
> index 8f60c41..9068b8d 100644
> --- a/xen/arch/arm/ioreq.c
> +++ b/xen/arch/arm/ioreq.c
> @@ -33,46 +33,22 @@
>  #include <public/hvm/dm_op.h>
>  #include <public/hvm/ioreq.h>
> 
> +#include <asm/traps.h>
> +
>  bool handle_mmio(void)
>  {
>      struct vcpu *v = current;
>      struct cpu_user_regs *regs = guest_cpu_user_regs();
>      const union hsr hsr = { .bits = regs->hsr };
> -    const struct hsr_dabt dabt = hsr.dabt;
> -    /* Code is similar to handle_read */
> -    uint8_t size = (1 << dabt.size) * 8;
> -    register_t r = v->arch.hvm.hvm_io.io_req.data;
> -
> -    /* We should only be here on Guest Data Abort */
> -    ASSERT(dabt.ec == HSR_EC_DATA_ABORT_LOWER_EL);
> +    paddr_t addr = v->arch.hvm.hvm_io.io_req.addr;
> 
> -    /* We are done with the IO */
> -    /* XXX: Is it the right place? */
> -    v->arch.hvm.hvm_io.io_req.state = STATE_IOREQ_NONE;
> -
> -    /* XXX: Do we need to take care of write here ? */
> -    if ( dabt.write )
> -        return true;
> -
> -    /*
> -     * Sign extend if required.
> -     * Note that we expect the read handler to have zeroed the bits
> -     * outside the requested access size.
> -     */
> -    if ( dabt.sign && (r & (1UL << (size - 1))) )
> +    if ( try_handle_mmio(regs, hsr, addr) == IO_HANDLED )
>      {
> -        /*
> -         * We are relying on register_t using the same as
> -         * an unsigned long in order to keep the 32-bit assembly
> -         * code smaller.
> -         */
> -        BUILD_BUG_ON(sizeof(register_t) != sizeof(unsigned long));
> -        r |= (~0UL) << size;
> +        advance_pc(regs, hsr);
> +        return true;
>      }
> 
> -    set_user_reg(regs, dabt.reg, r);
> -
> -    return true;
> +    return false;
>  }
> 
>  /* Ask ioemu mapcache to invalidate mappings. */
> diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c
> index ea472d1..974c744 100644
> --- a/xen/arch/arm/traps.c
> +++ b/xen/arch/arm/traps.c
> @@ -1965,11 +1965,13 @@ static void do_trap_stage2_abort_guest(struct
> cpu_user_regs *regs,
>              case IO_HANDLED:
>                  advance_pc(regs, hsr);
>                  return;
> +            case IO_RETRY:
> +                /* finish later */
> +                return;
>              case IO_UNHANDLED:
>                  /* IO unhandled, try another way to handle it. */
>                  break;
>              default:
> -                /* XXX: Handle IO_RETRY */
>                  ASSERT_UNREACHABLE();
>              }
>          }
> -- 
> 2.7.4
> 
> 
> 
> -- 
> Regards,
> 
> Oleksandr Tyshchenko
>
Julien Grall Aug. 13, 2020, 8:16 p.m. UTC | #41
On 13/08/2020 00:08, Stefano Stabellini wrote:
>>> It is very similar to your second patch with a small change on calling
>>> try_handle_mmio from handle_mmio and setting the register there. Do you
>>> think that would work?
>> If I understood what you had suggested correctly and implemented it
>> properly, then it works; at least I didn't face any issues during testing.
>> I think this variant adds some extra operations compared to the previous
>> one (for example an attempt to find an mmio handler in try_handle_mmio).
>> But, if you think the new variant is cleaner and better represents how the
>> state machine should look, I would be absolutely OK with taking this
>> variant for the non-RFC series. Please note, there was a request to move
>> try_fwd_ioserv() to arm/ioreq.c (I am going to move the new
>> handle_ioserv() as well).
>   
> Yes, I think this version better represents the state machine, thanks
> for looking into it. I think it is good.
> 
> In terms of number of operations, it doesn't look very concerning (in
> the sense that it doesn't seem to add that many ops.) However we could
> maybe improve it by passing a reference to the right mmio handler from
> handle_mmio to try_handle_mmio if we have it. Or maybe we could save a
> reference to the mmio handler as part of v->arch.hvm.hvm_io.io_req.

There is no MMIO handler for the IOREQ handling. So I am not entirely 
sure what you are suggesting.

Cheers,
Julien Grall Aug. 15, 2020, 5:30 p.m. UTC | #42
Hi Oleksandr,

On 03/08/2020 19:21, Oleksandr Tyshchenko wrote:
> +static int hvm_send_buffered_ioreq(struct hvm_ioreq_server *s, ioreq_t *p)
> +{

[...]

> +    /* Canonicalize read/write pointers to prevent their overflow. */
> +    while ( (s->bufioreq_handling == HVM_IOREQSRV_BUFIOREQ_ATOMIC) &&
> +            qw++ < IOREQ_BUFFER_SLOT_NUM &&
> +            pg->ptrs.read_pointer >= IOREQ_BUFFER_SLOT_NUM )
> +    {
> +        union bufioreq_pointers old = pg->ptrs, new;
> +        unsigned int n = old.read_pointer / IOREQ_BUFFER_SLOT_NUM;
> +
> +        new.read_pointer = old.read_pointer - n * IOREQ_BUFFER_SLOT_NUM;
> +        new.write_pointer = old.write_pointer - n * IOREQ_BUFFER_SLOT_NUM;
> +        cmpxchg(&pg->ptrs.full, old.full, new.full);

While working on the implementation of cmpxchg(), I realized the 
operation will happen on memory shared with the emulator.

This will need to be switched to guest_cmpxchg64() to prevent a domain 
from DoSing Xen on Arm.

I looked at the rest of the IOREQ and didn't notice any other example.
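
For illustration, the suggested change would look roughly like this (a 
sketch only: it assumes guest_cmpxchg64() keeps cmpxchg()'s argument 
order with an extra leading domain argument, and passing the emulator 
domain, i.e. the other writer of the shared page, is also an assumption 
here):

    /*
     * Use the domain-aware helper so that a misbehaving emulator
     * cannot keep the compare-and-swap loop in Xen spinning forever
     * on the page it shares with us.
     */
    guest_cmpxchg64(s->emulator, &pg->ptrs.full, old.full, new.full);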

Cheers,
Oleksandr Tyshchenko Aug. 16, 2020, 7:37 p.m. UTC | #43
On 15.08.20 20:30, Julien Grall wrote:
> Hi Oleksandr,

Hi Julien.


>
> On 03/08/2020 19:21, Oleksandr Tyshchenko wrote:
>> +static int hvm_send_buffered_ioreq(struct hvm_ioreq_server *s, 
>> ioreq_t *p)
>> +{
>
> [...]
>
>> +    /* Canonicalize read/write pointers to prevent their overflow. */
>> +    while ( (s->bufioreq_handling == HVM_IOREQSRV_BUFIOREQ_ATOMIC) &&
>> +            qw++ < IOREQ_BUFFER_SLOT_NUM &&
>> +            pg->ptrs.read_pointer >= IOREQ_BUFFER_SLOT_NUM )
>> +    {
>> +        union bufioreq_pointers old = pg->ptrs, new;
>> +        unsigned int n = old.read_pointer / IOREQ_BUFFER_SLOT_NUM;
>> +
>> +        new.read_pointer = old.read_pointer - n * 
>> IOREQ_BUFFER_SLOT_NUM;
>> +        new.write_pointer = old.write_pointer - n * 
>> IOREQ_BUFFER_SLOT_NUM;
>> +        cmpxchg(&pg->ptrs.full, old.full, new.full);
>
> While working on the implementation of cmpxchg(), I realized the 
> operation will happen on memory shared with the emulator.
>
> This will need to be switched to guest_cmpxchg64() to prevent a domain 
> from DoSing Xen on Arm.

Got it. I will create a separate patch for that purpose.
Oleksandr Tyshchenko Aug. 20, 2020, 7:14 p.m. UTC | #44
On 12.08.20 11:19, Julien Grall wrote:
> Hi,

Hi Julien, Stefano


>
> On 11/08/2020 23:48, Stefano Stabellini wrote:
>>> I have the impression that we disagree on what the Device Emulator 
>>> is meant to do. IMHO, the goal of the device emulator is to emulate a
>>> device in an arch-agnostic way.
>>
>> That would be great in theory but I am not sure it is achievable: if we
>> use an existing emulator like QEMU, even a single device has to fit
>> into QEMU's view of the world, which makes assumptions about host
>> bridges and apertures. It is impossible today to build QEMU in an
>> arch-agnostic way, it has to be tied to an architecture.
>
> AFAICT, the only reason QEMU cannot be built in an arch-agnostic way 
> is TCG. If that weren't built in, then you could easily write a 
> machine that doesn't depend on the instruction set.
>
> The proof is, today, we are using QEMU x86 to serve Arm64 guests, 
> although this is only for PV drivers.
>
>>
>> I realize we are not building this interface for QEMU specifically, but
>> even if we try to make the interface arch-agnostic, in reality the
>> emulators won't be arch-agnostic.
>
> This depends on your goal. If your goal is to write a standalone 
> emulator for a single device, then it is entirely possible to make it 
> arch-agnostic.
>
> Per above, this would even be possible if you were emulating a set of 
> devices.
>
> What I want to avoid is requiring all the emulators to contain 
> arch-specific code just because it is easier to get QEMU working on 
> Xen on Arm.
>
>> If we send a port-mapped I/O request
>> to qemu-system-aarch64, who knows what is going to happen: it is a code
>> path that is not explicitly tested.
>
> Maybe, maybe not. To me these are mostly software issues that can easily 
> be mitigated if we do proper testing...

Could we please find common ground on whether PIO handling needs 
to be implemented on Arm or not? At least for the current patch series.


Below are my thoughts:
On one side, I agree that the emulator shouldn't contain any arch-specific 
code; yes, it is hypervisor specific, but it should be arch agnostic if 
possible. So the PIO case should be handled.
On the other side, I tend to think that it might be possible to skip PIO 
handling for the current patch series (leave it x86 specific for now, as 
we do with handle_realmode_completion()).
I think nothing will prevent us from adding PIO handling later on if 
there is a real need (use-case) for that. Please correct me if I am wrong.

I would be absolutely OK with any options.

What do you think?
Stefano Stabellini Aug. 21, 2020, 12:53 a.m. UTC | #45
On Thu, 20 Aug 2020, Oleksandr wrote:
> > On 11/08/2020 23:48, Stefano Stabellini wrote:
> > > > I have the impression that we disagree on what the Device Emulator is
> > > > meant to do. IMHO, the goal of the device emulator is to emulate a
> > > > device in an arch-agnostic way.
> > > 
> > > That would be great in theory but I am not sure it is achievable: if we
> > > use an existing emulator like QEMU, even a single device has to fit
> > > into QEMU's view of the world, which makes assumptions about host
> > > bridges and apertures. It is impossible today to build QEMU in an
> > > arch-agnostic way, it has to be tied to an architecture.
> > 
> > AFAICT, the only reason QEMU cannot be built in an arch-agnostic way is
> > TCG. If that weren't built in, then you could easily write a machine
> > that doesn't depend on the instruction set.
> > 
> > The proof is, today, we are using QEMU x86 to serve Arm64 guests,
> > although this is only for PV drivers.
> > 
> > > 
> > > I realize we are not building this interface for QEMU specifically, but
> > > even if we try to make the interface arch-agnostic, in reality the
> > > emulators won't be arch-agnostic.
> > 
> > This depends on your goal. If your goal is to write a standalone emulator
> > for a single device, then it is entirely possible to make it arch-agnostic.
> > 
> > Per above, this would even be possible if you were emulating a set of
> > devices.
> > 
> > What I want to avoid is requiring all the emulators to contain arch-specific
> > code just because it is easier to get QEMU working on Xen on Arm.
> > 
> > > If we send a port-mapped I/O request
> > > to qemu-system-aarch64, who knows what is going to happen: it is a code
> > > path that is not explicitly tested.
> > 
> > Maybe, maybe not. To me these are mostly software issues that can easily be
> > mitigated if we do proper testing...
> 
> Could we please find common ground on whether PIO handling needs to be
> implemented on Arm or not? At least for the current patch series.

Can you do a test on QEMU to verify which address space the PIO BARs are
using on ARM? I don't know if there is an easy way to test it but it
would be very useful for this conversation.


> Below are my thoughts:
> On one side, I agree that the emulator shouldn't contain any arch-specific
> code; yes, it is hypervisor specific, but it should be arch agnostic if
> possible. So the PIO case should be handled.
> On the other side, I tend to think that it might be possible to skip PIO
> handling for the current patch series (leave it x86 specific for now, as
> we do with handle_realmode_completion()).
> I think nothing will prevent us from adding PIO handling later on if there is
> a real need (use-case) for that. Please correct me if I am wrong.
> 
> I would be absolutely OK with any options.
> 
> What do you think?

I agree that PIO handling is not the most critical thing right now given
that we have quite a few other important TODOs in the series. I'd be
fine reviewing another version of the series with this issue still
pending.


Of course, PIO needs to be handled. The key to me is that QEMU (or other
emulator) should *not* emulate in/out instructions on ARM. PIO ioreq
requests should not be satisfied by using address_space_io directly (the
PIO address space that requires special instructions to access it). In
QEMU the PIO reads/writes should be done via address_space_memory (the
normal memory mapped address space).

So either of the following approaches should be OK:

1) Xen sends out PIO addresses as memory mapped addresses, QEMU simply
   reads/writes on them
2) Xen sends out PIO addresses as address_space_io, QEMU finds the
   mapping to address_space_memory, then reads/writes on
   address_space_memory

From an interface and implementation perspective, 1) means that
IOREQ_TYPE_PIO is unused on ARM, while 2) means that IOREQ_TYPE_PIO is
still used as part of the ioreq interface, even if QEMU doesn't directly
operate on those addresses.

My preference is 1) because it leads to a simpler solution.
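
As a rough sketch of what option 1) would mean on the QEMU side (this is
an illustration only: the dispatch function below is made up, and only
address_space_memory, address_space_read() and address_space_write() are
real QEMU interfaces):

    /* Hypothetical handler for a PIO ioreq forwarded by Xen. */
    static void xen_ioreq_pio_as_mmio(ioreq_t *req)
    {
        /*
         * Under option 1) Xen already sends an absolute memory-mapped
         * address, so req->addr can be used directly; under option 2)
         * QEMU would first have to add the PIO aperture base.
         */
        hwaddr addr = req->addr;

        if (req->dir == IOREQ_READ) {
            req->data = 0;
            address_space_read(&address_space_memory, addr,
                               MEMTXATTRS_UNSPECIFIED,
                               &req->data, req->size);
        } else {
            address_space_write(&address_space_memory, addr,
                                MEMTXATTRS_UNSPECIFIED,
                                &req->data, req->size);
        }
    }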
Julien Grall Aug. 21, 2020, 6:54 p.m. UTC | #46
Hi Stefano,

On 21/08/2020 01:53, Stefano Stabellini wrote:
> On Thu, 20 Aug 2020, Oleksandr wrote:
>>> On 11/08/2020 23:48, Stefano Stabellini wrote:
>>>>> I have the impression that we disagree on what the Device Emulator is
>>>>> meant to do. IMHO, the goal of the device emulator is to emulate a
>>>>> device in an arch-agnostic way.
>>>>
>>>> That would be great in theory but I am not sure it is achievable: if we
>>>> use an existing emulator like QEMU, even a single device has to fit
>>>> into QEMU's view of the world, which makes assumptions about host
>>>> bridges and apertures. It is impossible today to build QEMU in an
>>>> arch-agnostic way, it has to be tied to an architecture.
>>>
>>> AFAICT, the only reason QEMU cannot be built in an arch-agnostic way is
>>> TCG. If that weren't built in, then you could easily write a machine
>>> that doesn't depend on the instruction set.
>>>
>>> The proof is, today, we are using QEMU x86 to serve Arm64 guests,
>>> although this is only for PV drivers.
>>>
>>>>
>>>> I realize we are not building this interface for QEMU specifically, but
>>>> even if we try to make the interface arch-agnostic, in reality the
>>>> emulators won't be arch-agnostic.
>>>
>>> This depends on your goal. If your goal is to write a standalone emulator
>>> for a single device, then it is entirely possible to make it arch-agnostic.
>>>
>>> Per above, this would even be possible if you were emulating a set of
>>> devices.
>>>
>>> What I want to avoid is requiring all the emulators to contain arch-specific
>>> code just because it is easier to get QEMU working on Xen on Arm.
>>>
>>>> If we send a port-mapped I/O request
>>>> to qemu-system-aarch64, who knows what is going to happen: it is a code
>>>> path that is not explicitly tested.
>>>
>>> Maybe, maybe not. To me these are mostly software issues that can easily be
>>> mitigated if we do proper testing...
>>
>> Could we please find common ground on whether PIO handling needs to be
>> implemented on Arm or not? At least for the current patch series.
> 
> Can you do a test on QEMU to verify which address space the PIO BARs are
> using on ARM? I don't know if there is an easy way to test it but it
> would be very useful for this conversation.

This is basically configured by the machine itself. See create_pcie() in 
hw/arm/virt.c.

So the host controller is basically unaware that an MMIO region will be 
used instead of a PIO region.
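
For reference, the relevant wiring in QEMU's virt machine looks roughly
like this (paraphrased from hw/arm/virt.c:create_pcie(); the exact field
names and the sysbus MMIO region index may differ between QEMU versions):

    /*
     * The gpex host bridge exposes its legacy I/O port window as an
     * ordinary sysbus MMIO region; the machine simply maps it at a
     * guest-physical address, so the host bridge never needs to know
     * that "PIO" accesses arrive as memory-mapped ones.
     */
    sysbus_mmio_map(SYS_BUS_DEVICE(dev), 2,
                    vms->memmap[VIRT_PCIE_PIO].base);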

> 
> 
>> Below are my thoughts:
>> On one side, I agree that the emulator shouldn't contain any arch-specific
>> code; yes, it is hypervisor specific, but it should be arch agnostic if
>> possible. So the PIO case should be handled.
>> On the other side, I tend to think that it might be possible to skip PIO
>> handling for the current patch series (leave it x86 specific for now, as
>> we do with handle_realmode_completion()).
>> I think nothing will prevent us from adding PIO handling later on if there is
>> a real need (use-case) for that. Please correct me if I am wrong.
>>
>> I would be absolutely OK with any options.
>>
>> What do you think?
> 
> I agree that PIO handling is not the most critical thing right now given
> that we have quite a few other important TODOs in the series. I'd be
> fine reviewing another version of the series with this issue still
> pending.

For Arm64, the main user will be PCI. So this can be delayed until we 
add support for vPCI.

> 
> 
> Of course, PIO needs to be handled. The key to me is that QEMU (or other
> emulator) should *not* emulate in/out instructions on ARM.

I don't think anyone here suggested that we would emulate in/out 
instructions on Arm. There are actually none of them.

>  PIO ioreq
> requests should not be satisfied by using address_space_io directly (the
> PIO address space that requires special instructions to access it). In
> QEMU the PIO reads/writes should be done via address_space_memory (the
> normal memory mapped address space).
> 
> So either of the following approaches should be OK:
> 
> 1) Xen sends out PIO addresses as memory mapped addresses, QEMU simply
>     reads/writes on them
> 2) Xen sends out PIO addresses as address_space_io, QEMU finds the
>     mapping to address_space_memory, then reads/writes on
>     address_space_memory
> 
> From an interface and implementation perspective, 1) means that
> IOREQ_TYPE_PIO is unused on ARM, while 2) means that IOREQ_TYPE_PIO is
> still used as part of the ioreq interface, even if QEMU doesn't directly
> operate on those addresses.
> 
> My preference is 1) because it leads to a simpler solution.

"simpler" is actually very subjective :). So maybe you can clarify some 
of my concerns with this approach.

One part that has barely been discussed is the configuration.

The discussion below is based on having the virtual PCI hostbridges 
implemented in Xen.

In the case of an MMIO BAR, the emulator doesn't need to know where the 
aperture is in advance. This is because the BAR will contain an 
absolute MMIO address, so it can configure the trap correctly.

In the case of a PIO BAR, from my understanding, the BAR will contain an 
offset relative to the base of the PIO aperture. So the emulator needs to 
know the base address of the PIO aperture. How do you plan to pass that 
information to the emulator? And what about the case where there are 
multiple hostbridges?

Furthermore, most of the discussion has focused on a device model that 
provides emulation for all your devices (e.g. QEMU). However, I think 
this is going to be less common than a device model that emulates a 
single device (e.g. DEMU). The latter fits better into the 
disaggregation model.

An emulator for a single PCI device is basically the same as a real PCI 
device. Do you agree with that?

The HW engineer designing the PCI device doesn't need to know about the 
architecture. He/she just needs to understand the interface with the 
hostbridge. The hostbridge then takes care of the architectural 
differences.

A developer should really be able to do the same with the emulator, 
i.e. write it for x86 and then just recompile it for Arm. With your 
approach, he/she would have to understand how the architecture works.

I still don't quite understand why we are trying to differ here. Why 
would our hostbridge implementation not abstract the same way a real 
one does? Can you clarify?

Maybe the problem is just a naming issue?

Cheers,
diff mbox series

Patch

diff --git a/xen/arch/x86/Kconfig b/xen/arch/x86/Kconfig
index a636a4b..f5a9f87 100644
--- a/xen/arch/x86/Kconfig
+++ b/xen/arch/x86/Kconfig
@@ -91,6 +91,7 @@  config PV_LINEAR_PT
 
 config HVM
 	def_bool !PV_SHIM_EXCLUSIVE
+	select IOREQ_SERVER
 	prompt "HVM support"
 	---help---
 	  Interfaces to support HVM domains.  HVM domains require hardware
diff --git a/xen/arch/x86/hvm/dm.c b/xen/arch/x86/hvm/dm.c
index e3f8451..70adb27 100644
--- a/xen/arch/x86/hvm/dm.c
+++ b/xen/arch/x86/hvm/dm.c
@@ -16,13 +16,13 @@ 
 
 #include <xen/event.h>
 #include <xen/guest_access.h>
+#include <xen/hvm/ioreq.h>
 #include <xen/hypercall.h>
 #include <xen/nospec.h>
 #include <xen/sched.h>
 
 #include <asm/hap.h>
 #include <asm/hvm/cacheattr.h>
-#include <asm/hvm/ioreq.h>
 #include <asm/shadow.h>
 
 #include <xsm/xsm.h>
diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
index 8b4e73a..78993b3 100644
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -9,6 +9,7 @@ 
  *    Keir Fraser <keir@xen.org>
  */
 
+#include <xen/hvm/ioreq.h>
 #include <xen/init.h>
 #include <xen/lib.h>
 #include <xen/sched.h>
@@ -20,7 +21,6 @@ 
 #include <asm/xstate.h>
 #include <asm/hvm/emulate.h>
 #include <asm/hvm/hvm.h>
-#include <asm/hvm/ioreq.h>
 #include <asm/hvm/monitor.h>
 #include <asm/hvm/trace.h>
 #include <asm/hvm/support.h>
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 5bb4758..c05025d 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -19,6 +19,7 @@ 
  */
 
 #include <xen/ctype.h>
+#include <xen/hvm/ioreq.h>
 #include <xen/init.h>
 #include <xen/lib.h>
 #include <xen/trace.h>
@@ -64,7 +65,6 @@ 
 #include <asm/hvm/trace.h>
 #include <asm/hvm/nestedhvm.h>
 #include <asm/hvm/monitor.h>
-#include <asm/hvm/ioreq.h>
 #include <asm/hvm/viridian.h>
 #include <asm/hvm/vm_event.h>
 #include <asm/altp2m.h>
diff --git a/xen/arch/x86/hvm/io.c b/xen/arch/x86/hvm/io.c
index 724ab44..5d501d1 100644
--- a/xen/arch/x86/hvm/io.c
+++ b/xen/arch/x86/hvm/io.c
@@ -18,6 +18,7 @@ 
  * this program; If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <xen/hvm/ioreq.h>
 #include <xen/init.h>
 #include <xen/mm.h>
 #include <xen/lib.h>
@@ -35,7 +36,6 @@ 
 #include <asm/shadow.h>
 #include <asm/p2m.h>
 #include <asm/hvm/hvm.h>
-#include <asm/hvm/ioreq.h>
 #include <asm/hvm/support.h>
 #include <asm/hvm/vpt.h>
 #include <asm/hvm/vpic.h>
diff --git a/xen/arch/x86/hvm/ioreq.c b/xen/arch/x86/hvm/ioreq.c
index 7240070..dd21e85 100644
--- a/xen/arch/x86/hvm/ioreq.c
+++ b/xen/arch/x86/hvm/ioreq.c
@@ -17,6 +17,7 @@ 
  */
 
 #include <xen/ctype.h>
+#include <xen/hvm/ioreq.h>
 #include <xen/init.h>
 #include <xen/lib.h>
 #include <xen/trace.h>
@@ -28,1069 +29,16 @@ 
 #include <xen/paging.h>
 #include <xen/vpci.h>
 
-#include <asm/hvm/emulate.h>
-#include <asm/hvm/hvm.h>
-#include <asm/hvm/ioreq.h>
-#include <asm/hvm/vmx/vmx.h>
-
-#include <public/hvm/ioreq.h>
-#include <public/hvm/params.h>
-
-static void set_ioreq_server(struct domain *d, unsigned int id,
-                             struct hvm_ioreq_server *s)
-{
-    ASSERT(id < MAX_NR_IOREQ_SERVERS);
-    ASSERT(!s || !d->arch.hvm.ioreq_server.server[id]);
-
-    d->arch.hvm.ioreq_server.server[id] = s;
-}
-
-#define GET_IOREQ_SERVER(d, id) \
-    (d)->arch.hvm.ioreq_server.server[id]
-
-static struct hvm_ioreq_server *get_ioreq_server(const struct domain *d,
-                                                 unsigned int id)
-{
-    if ( id >= MAX_NR_IOREQ_SERVERS )
-        return NULL;
-
-    return GET_IOREQ_SERVER(d, id);
-}
-
-/*
- * Iterate over all possible ioreq servers.
- *
- * NOTE: The iteration is backwards such that more recently created
- *       ioreq servers are favoured in hvm_select_ioreq_server().
- *       This is a semantic that previously existed when ioreq servers
- *       were held in a linked list.
- */
-#define FOR_EACH_IOREQ_SERVER(d, id, s) \
-    for ( (id) = MAX_NR_IOREQ_SERVERS; (id) != 0; ) \
-        if ( !(s = GET_IOREQ_SERVER(d, --(id))) ) \
-            continue; \
-        else
-
-static ioreq_t *get_ioreq(struct hvm_ioreq_server *s, struct vcpu *v)
-{
-    shared_iopage_t *p = s->ioreq.va;
-
-    ASSERT((v == current) || !vcpu_runnable(v));
-    ASSERT(p != NULL);
-
-    return &p->vcpu_ioreq[v->vcpu_id];
-}
-
-bool hvm_io_pending(struct vcpu *v)
-{
-    struct domain *d = v->domain;
-    struct hvm_ioreq_server *s;
-    unsigned int id;
-
-    FOR_EACH_IOREQ_SERVER(d, id, s)
-    {
-        struct hvm_ioreq_vcpu *sv;
-
-        list_for_each_entry ( sv,
-                              &s->ioreq_vcpu_list,
-                              list_entry )
-        {
-            if ( sv->vcpu == v && sv->pending )
-                return true;
-        }
-    }
-
-    return false;
-}
-
-static void hvm_io_assist(struct hvm_ioreq_vcpu *sv, uint64_t data)
-{
-    struct vcpu *v = sv->vcpu;
-    ioreq_t *ioreq = &v->arch.hvm.hvm_io.io_req;
-
-    if ( hvm_ioreq_needs_completion(ioreq) )
-        ioreq->data = data;
-
-    sv->pending = false;
-}
-
-static bool hvm_wait_for_io(struct hvm_ioreq_vcpu *sv, ioreq_t *p)
-{
-    unsigned int prev_state = STATE_IOREQ_NONE;
-
-    while ( sv->pending )
-    {
-        unsigned int state = p->state;
-
-        smp_rmb();
-
-    recheck:
-        if ( unlikely(state == STATE_IOREQ_NONE) )
-        {
-            /*
-             * The only reason we should see this case is when an
-             * emulator is dying and it races with an I/O being
-             * requested.
-             */
-            hvm_io_assist(sv, ~0ul);
-            break;
-        }
-
-        if ( unlikely(state < prev_state) )
-        {
-            gdprintk(XENLOG_ERR, "Weird HVM ioreq state transition %u -> %u\n",
-                     prev_state, state);
-            sv->pending = false;
-            domain_crash(sv->vcpu->domain);
-            return false; /* bail */
-        }
-
-        switch ( prev_state = state )
-        {
-        case STATE_IORESP_READY: /* IORESP_READY -> NONE */
-            p->state = STATE_IOREQ_NONE;
-            hvm_io_assist(sv, p->data);
-            break;
-        case STATE_IOREQ_READY:  /* IOREQ_{READY,INPROCESS} -> IORESP_READY */
-        case STATE_IOREQ_INPROCESS:
-            wait_on_xen_event_channel(sv->ioreq_evtchn,
-                                      ({ state = p->state;
-                                         smp_rmb();
-                                         state != prev_state; }));
-            goto recheck;
-        default:
-            gdprintk(XENLOG_ERR, "Weird HVM iorequest state %u\n", state);
-            sv->pending = false;
-            domain_crash(sv->vcpu->domain);
-            return false; /* bail */
-        }
-    }
-
-    return true;
-}
-
-bool handle_hvm_io_completion(struct vcpu *v)
-{
-    struct domain *d = v->domain;
-    struct hvm_vcpu_io *vio = &v->arch.hvm.hvm_io;
-    struct hvm_ioreq_server *s;
-    enum hvm_io_completion io_completion;
-    unsigned int id;
-
-    if ( has_vpci(d) && vpci_process_pending(v) )
-    {
-        raise_softirq(SCHEDULE_SOFTIRQ);
-        return false;
-    }
-
-    FOR_EACH_IOREQ_SERVER(d, id, s)
-    {
-        struct hvm_ioreq_vcpu *sv;
-
-        list_for_each_entry ( sv,
-                              &s->ioreq_vcpu_list,
-                              list_entry )
-        {
-            if ( sv->vcpu == v && sv->pending )
-            {
-                if ( !hvm_wait_for_io(sv, get_ioreq(s, v)) )
-                    return false;
-
-                break;
-            }
-        }
-    }
-
-    vio->io_req.state = hvm_ioreq_needs_completion(&vio->io_req) ?
-        STATE_IORESP_READY : STATE_IOREQ_NONE;
-
-    msix_write_completion(v);
-    vcpu_end_shutdown_deferral(v);
-
-    io_completion = vio->io_completion;
-    vio->io_completion = HVMIO_no_completion;
-
-    switch ( io_completion )
-    {
-    case HVMIO_no_completion:
-        break;
-
-    case HVMIO_mmio_completion:
-        return handle_mmio();
-
-    case HVMIO_pio_completion:
-        return handle_pio(vio->io_req.addr, vio->io_req.size,
-                          vio->io_req.dir);
-
-    case HVMIO_realmode_completion:
-    {
-        struct hvm_emulate_ctxt ctxt;
-
-        hvm_emulate_init_once(&ctxt, NULL, guest_cpu_user_regs());
-        vmx_realmode_emulate_one(&ctxt);
-        hvm_emulate_writeback(&ctxt);
-
-        break;
-    }
-    default:
-        ASSERT_UNREACHABLE();
-        break;
-    }
-
-    return true;
-}
-
-static gfn_t hvm_alloc_legacy_ioreq_gfn(struct hvm_ioreq_server *s)
-{
-    struct domain *d = s->target;
-    unsigned int i;
-
-    BUILD_BUG_ON(HVM_PARAM_BUFIOREQ_PFN != HVM_PARAM_IOREQ_PFN + 1);
-
-    for ( i = HVM_PARAM_IOREQ_PFN; i <= HVM_PARAM_BUFIOREQ_PFN; i++ )
-    {
-        if ( !test_and_clear_bit(i, &d->arch.hvm.ioreq_gfn.legacy_mask) )
-            return _gfn(d->arch.hvm.params[i]);
-    }
-
-    return INVALID_GFN;
-}
-
-static gfn_t hvm_alloc_ioreq_gfn(struct hvm_ioreq_server *s)
-{
-    struct domain *d = s->target;
-    unsigned int i;
-
-    for ( i = 0; i < sizeof(d->arch.hvm.ioreq_gfn.mask) * 8; i++ )
-    {
-        if ( test_and_clear_bit(i, &d->arch.hvm.ioreq_gfn.mask) )
-            return _gfn(d->arch.hvm.ioreq_gfn.base + i);
-    }
-
-    /*
-     * If we are out of 'normal' GFNs then we may still have a 'legacy'
-     * GFN available.
-     */
-    return hvm_alloc_legacy_ioreq_gfn(s);
-}
-
-static bool hvm_free_legacy_ioreq_gfn(struct hvm_ioreq_server *s,
-                                      gfn_t gfn)
-{
-    struct domain *d = s->target;
-    unsigned int i;
-
-    for ( i = HVM_PARAM_IOREQ_PFN; i <= HVM_PARAM_BUFIOREQ_PFN; i++ )
-    {
-        if ( gfn_eq(gfn, _gfn(d->arch.hvm.params[i])) )
-             break;
-    }
-    if ( i > HVM_PARAM_BUFIOREQ_PFN )
-        return false;
-
-    set_bit(i, &d->arch.hvm.ioreq_gfn.legacy_mask);
-    return true;
-}
-
-static void hvm_free_ioreq_gfn(struct hvm_ioreq_server *s, gfn_t gfn)
-{
-    struct domain *d = s->target;
-    unsigned int i = gfn_x(gfn) - d->arch.hvm.ioreq_gfn.base;
-
-    ASSERT(!gfn_eq(gfn, INVALID_GFN));
-
-    if ( !hvm_free_legacy_ioreq_gfn(s, gfn) )
-    {
-        ASSERT(i < sizeof(d->arch.hvm.ioreq_gfn.mask) * 8);
-        set_bit(i, &d->arch.hvm.ioreq_gfn.mask);
-    }
-}
-
-static void hvm_unmap_ioreq_gfn(struct hvm_ioreq_server *s, bool buf)
-{
-    struct hvm_ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
-
-    if ( gfn_eq(iorp->gfn, INVALID_GFN) )
-        return;
-
-    destroy_ring_for_helper(&iorp->va, iorp->page);
-    iorp->page = NULL;
-
-    hvm_free_ioreq_gfn(s, iorp->gfn);
-    iorp->gfn = INVALID_GFN;
-}
-
-static int hvm_map_ioreq_gfn(struct hvm_ioreq_server *s, bool buf)
-{
-    struct domain *d = s->target;
-    struct hvm_ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
-    int rc;
-
-    if ( iorp->page )
-    {
-        /*
-         * If a page has already been allocated (which will happen on
-         * demand if hvm_get_ioreq_server_frame() is called), then
-         * mapping a guest frame is not permitted.
-         */
-        if ( gfn_eq(iorp->gfn, INVALID_GFN) )
-            return -EPERM;
-
-        return 0;
-    }
-
-    if ( d->is_dying )
-        return -EINVAL;
-
-    iorp->gfn = hvm_alloc_ioreq_gfn(s);
-
-    if ( gfn_eq(iorp->gfn, INVALID_GFN) )
-        return -ENOMEM;
-
-    rc = prepare_ring_for_helper(d, gfn_x(iorp->gfn), &iorp->page,
-                                 &iorp->va);
-
-    if ( rc )
-        hvm_unmap_ioreq_gfn(s, buf);
-
-    return rc;
-}
-
-static int hvm_alloc_ioreq_mfn(struct hvm_ioreq_server *s, bool buf)
-{
-    struct hvm_ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
-    struct page_info *page;
-
-    if ( iorp->page )
-    {
-        /*
-         * If a guest frame has already been mapped (which may happen
-         * on demand if hvm_get_ioreq_server_info() is called), then
-         * allocating a page is not permitted.
-         */
-        if ( !gfn_eq(iorp->gfn, INVALID_GFN) )
-            return -EPERM;
-
-        return 0;
-    }
-
-    page = alloc_domheap_page(s->target, MEMF_no_refcount);
-
-    if ( !page )
-        return -ENOMEM;
-
-    if ( !get_page_and_type(page, s->target, PGT_writable_page) )
-    {
-        /*
-         * The domain can't possibly know about this page yet, so failure
-         * here is a clear indication of something fishy going on.
-         */
-        domain_crash(s->emulator);
-        return -ENODATA;
-    }
-
-    iorp->va = __map_domain_page_global(page);
-    if ( !iorp->va )
-        goto fail;
-
-    iorp->page = page;
-    clear_page(iorp->va);
-    return 0;
-
- fail:
-    put_page_alloc_ref(page);
-    put_page_and_type(page);
-
-    return -ENOMEM;
-}
-
-static void hvm_free_ioreq_mfn(struct hvm_ioreq_server *s, bool buf)
-{
-    struct hvm_ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
-    struct page_info *page = iorp->page;
-
-    if ( !page )
-        return;
-
-    iorp->page = NULL;
-
-    unmap_domain_page_global(iorp->va);
-    iorp->va = NULL;
-
-    put_page_alloc_ref(page);
-    put_page_and_type(page);
-}
-
-bool is_ioreq_server_page(struct domain *d, const struct page_info *page)
-{
-    const struct hvm_ioreq_server *s;
-    unsigned int id;
-    bool found = false;
-
-    spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
-
-    FOR_EACH_IOREQ_SERVER(d, id, s)
-    {
-        if ( (s->ioreq.page == page) || (s->bufioreq.page == page) )
-        {
-            found = true;
-            break;
-        }
-    }
-
-    spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
-
-    return found;
-}
-
-static void hvm_remove_ioreq_gfn(struct hvm_ioreq_server *s, bool buf)
-
-{
-    struct domain *d = s->target;
-    struct hvm_ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
-
-    if ( gfn_eq(iorp->gfn, INVALID_GFN) )
-        return;
-
-    if ( guest_physmap_remove_page(d, iorp->gfn,
-                                   page_to_mfn(iorp->page), 0) )
-        domain_crash(d);
-    clear_page(iorp->va);
-}
-
-static int hvm_add_ioreq_gfn(struct hvm_ioreq_server *s, bool buf)
-{
-    struct domain *d = s->target;
-    struct hvm_ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
-    int rc;
-
-    if ( gfn_eq(iorp->gfn, INVALID_GFN) )
-        return 0;
-
-    clear_page(iorp->va);
-
-    rc = guest_physmap_add_page(d, iorp->gfn,
-                                page_to_mfn(iorp->page), 0);
-    if ( rc == 0 )
-        paging_mark_pfn_dirty(d, _pfn(gfn_x(iorp->gfn)));
-
-    return rc;
-}
-
-static void hvm_update_ioreq_evtchn(struct hvm_ioreq_server *s,
-                                    struct hvm_ioreq_vcpu *sv)
-{
-    ASSERT(spin_is_locked(&s->lock));
-
-    if ( s->ioreq.va != NULL )
-    {
-        ioreq_t *p = get_ioreq(s, sv->vcpu);
-
-        p->vp_eport = sv->ioreq_evtchn;
-    }
-}
-
-#define HANDLE_BUFIOREQ(s) \
-    ((s)->bufioreq_handling != HVM_IOREQSRV_BUFIOREQ_OFF)
-
-static int hvm_ioreq_server_add_vcpu(struct hvm_ioreq_server *s,
-                                     struct vcpu *v)
-{
-    struct hvm_ioreq_vcpu *sv;
-    int rc;
-
-    sv = xzalloc(struct hvm_ioreq_vcpu);
-
-    rc = -ENOMEM;
-    if ( !sv )
-        goto fail1;
-
-    spin_lock(&s->lock);
-
-    rc = alloc_unbound_xen_event_channel(v->domain, v->vcpu_id,
-                                         s->emulator->domain_id, NULL);
-    if ( rc < 0 )
-        goto fail2;
-
-    sv->ioreq_evtchn = rc;
-
-    if ( v->vcpu_id == 0 && HANDLE_BUFIOREQ(s) )
-    {
-        rc = alloc_unbound_xen_event_channel(v->domain, 0,
-                                             s->emulator->domain_id, NULL);
-        if ( rc < 0 )
-            goto fail3;
-
-        s->bufioreq_evtchn = rc;
-    }
-
-    sv->vcpu = v;
-
-    list_add(&sv->list_entry, &s->ioreq_vcpu_list);
-
-    if ( s->enabled )
-        hvm_update_ioreq_evtchn(s, sv);
-
-    spin_unlock(&s->lock);
-    return 0;
-
- fail3:
-    free_xen_event_channel(v->domain, sv->ioreq_evtchn);
-
- fail2:
-    spin_unlock(&s->lock);
-    xfree(sv);
-
- fail1:
-    return rc;
-}
-
-static void hvm_ioreq_server_remove_vcpu(struct hvm_ioreq_server *s,
-                                         struct vcpu *v)
-{
-    struct hvm_ioreq_vcpu *sv;
-
-    spin_lock(&s->lock);
-
-    list_for_each_entry ( sv,
-                          &s->ioreq_vcpu_list,
-                          list_entry )
-    {
-        if ( sv->vcpu != v )
-            continue;
-
-        list_del(&sv->list_entry);
-
-        if ( v->vcpu_id == 0 && HANDLE_BUFIOREQ(s) )
-            free_xen_event_channel(v->domain, s->bufioreq_evtchn);
-
-        free_xen_event_channel(v->domain, sv->ioreq_evtchn);
-
-        xfree(sv);
-        break;
-    }
-
-    spin_unlock(&s->lock);
-}
-
-static void hvm_ioreq_server_remove_all_vcpus(struct hvm_ioreq_server *s)
-{
-    struct hvm_ioreq_vcpu *sv, *next;
-
-    spin_lock(&s->lock);
-
-    list_for_each_entry_safe ( sv,
-                               next,
-                               &s->ioreq_vcpu_list,
-                               list_entry )
-    {
-        struct vcpu *v = sv->vcpu;
-
-        list_del(&sv->list_entry);
-
-        if ( v->vcpu_id == 0 && HANDLE_BUFIOREQ(s) )
-            free_xen_event_channel(v->domain, s->bufioreq_evtchn);
-
-        free_xen_event_channel(v->domain, sv->ioreq_evtchn);
-
-        xfree(sv);
-    }
-
-    spin_unlock(&s->lock);
-}
-
-static int hvm_ioreq_server_map_pages(struct hvm_ioreq_server *s)
-{
-    int rc;
-
-    rc = hvm_map_ioreq_gfn(s, false);
-
-    if ( !rc && HANDLE_BUFIOREQ(s) )
-        rc = hvm_map_ioreq_gfn(s, true);
-
-    if ( rc )
-        hvm_unmap_ioreq_gfn(s, false);
-
-    return rc;
-}
-
-static void hvm_ioreq_server_unmap_pages(struct hvm_ioreq_server *s)
-{
-    hvm_unmap_ioreq_gfn(s, true);
-    hvm_unmap_ioreq_gfn(s, false);
-}
-
-static int hvm_ioreq_server_alloc_pages(struct hvm_ioreq_server *s)
-{
-    int rc;
-
-    rc = hvm_alloc_ioreq_mfn(s, false);
-
-    if ( !rc && (s->bufioreq_handling != HVM_IOREQSRV_BUFIOREQ_OFF) )
-        rc = hvm_alloc_ioreq_mfn(s, true);
-
-    if ( rc )
-        hvm_free_ioreq_mfn(s, false);
-
-    return rc;
-}
-
-static void hvm_ioreq_server_free_pages(struct hvm_ioreq_server *s)
-{
-    hvm_free_ioreq_mfn(s, true);
-    hvm_free_ioreq_mfn(s, false);
-}
-
-static void hvm_ioreq_server_free_rangesets(struct hvm_ioreq_server *s)
-{
-    unsigned int i;
-
-    for ( i = 0; i < NR_IO_RANGE_TYPES; i++ )
-        rangeset_destroy(s->range[i]);
-}
-
-static int hvm_ioreq_server_alloc_rangesets(struct hvm_ioreq_server *s,
-                                            ioservid_t id)
-{
-    unsigned int i;
-    int rc;
-
-    for ( i = 0; i < NR_IO_RANGE_TYPES; i++ )
-    {
-        char *name;
-
-        rc = asprintf(&name, "ioreq_server %d %s", id,
-                      (i == XEN_DMOP_IO_RANGE_PORT) ? "port" :
-                      (i == XEN_DMOP_IO_RANGE_MEMORY) ? "memory" :
-                      (i == XEN_DMOP_IO_RANGE_PCI) ? "pci" :
-                      "");
-        if ( rc )
-            goto fail;
-
-        s->range[i] = rangeset_new(s->target, name,
-                                   RANGESETF_prettyprint_hex);
-
-        xfree(name);
-
-        rc = -ENOMEM;
-        if ( !s->range[i] )
-            goto fail;
-
-        rangeset_limit(s->range[i], MAX_NR_IO_RANGES);
-    }
-
-    return 0;
-
- fail:
-    hvm_ioreq_server_free_rangesets(s);
-
-    return rc;
-}
-
-static void hvm_ioreq_server_enable(struct hvm_ioreq_server *s)
-{
-    struct hvm_ioreq_vcpu *sv;
-
-    spin_lock(&s->lock);
-
-    if ( s->enabled )
-        goto done;
-
-    hvm_remove_ioreq_gfn(s, false);
-    hvm_remove_ioreq_gfn(s, true);
-
-    s->enabled = true;
-
-    list_for_each_entry ( sv,
-                          &s->ioreq_vcpu_list,
-                          list_entry )
-        hvm_update_ioreq_evtchn(s, sv);
-
-  done:
-    spin_unlock(&s->lock);
-}
-
-static void hvm_ioreq_server_disable(struct hvm_ioreq_server *s)
-{
-    spin_lock(&s->lock);
-
-    if ( !s->enabled )
-        goto done;
-
-    hvm_add_ioreq_gfn(s, true);
-    hvm_add_ioreq_gfn(s, false);
-
-    s->enabled = false;
-
- done:
-    spin_unlock(&s->lock);
-}
-
-static int hvm_ioreq_server_init(struct hvm_ioreq_server *s,
-                                 struct domain *d, int bufioreq_handling,
-                                 ioservid_t id)
-{
-    struct domain *currd = current->domain;
-    struct vcpu *v;
-    int rc;
-
-    s->target = d;
-
-    get_knownalive_domain(currd);
-    s->emulator = currd;
-
-    spin_lock_init(&s->lock);
-    INIT_LIST_HEAD(&s->ioreq_vcpu_list);
-    spin_lock_init(&s->bufioreq_lock);
-
-    s->ioreq.gfn = INVALID_GFN;
-    s->bufioreq.gfn = INVALID_GFN;
-
-    rc = hvm_ioreq_server_alloc_rangesets(s, id);
-    if ( rc )
-        return rc;
-
-    s->bufioreq_handling = bufioreq_handling;
-
-    for_each_vcpu ( d, v )
-    {
-        rc = hvm_ioreq_server_add_vcpu(s, v);
-        if ( rc )
-            goto fail_add;
-    }
-
-    return 0;
-
- fail_add:
-    hvm_ioreq_server_remove_all_vcpus(s);
-    hvm_ioreq_server_unmap_pages(s);
-
-    hvm_ioreq_server_free_rangesets(s);
-
-    put_domain(s->emulator);
-    return rc;
-}
-
-static void hvm_ioreq_server_deinit(struct hvm_ioreq_server *s)
-{
-    ASSERT(!s->enabled);
-    hvm_ioreq_server_remove_all_vcpus(s);
-
-    /*
-     * NOTE: It is safe to call both hvm_ioreq_server_unmap_pages() and
-     *       hvm_ioreq_server_free_pages() in that order.
-     *       This is because the former will do nothing if the pages
-     *       are not mapped, leaving the page to be freed by the latter.
-     *       However if the pages are mapped then the former will set
-     *       the page_info pointer to NULL, meaning the latter will do
-     *       nothing.
-     */
-    hvm_ioreq_server_unmap_pages(s);
-    hvm_ioreq_server_free_pages(s);
-
-    hvm_ioreq_server_free_rangesets(s);
-
-    put_domain(s->emulator);
-}
-
-int hvm_create_ioreq_server(struct domain *d, int bufioreq_handling,
-                            ioservid_t *id)
-{
-    struct hvm_ioreq_server *s;
-    unsigned int i;
-    int rc;
-
-    if ( bufioreq_handling > HVM_IOREQSRV_BUFIOREQ_ATOMIC )
-        return -EINVAL;
-
-    s = xzalloc(struct hvm_ioreq_server);
-    if ( !s )
-        return -ENOMEM;
-
-    domain_pause(d);
-    spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
-
-    for ( i = 0; i < MAX_NR_IOREQ_SERVERS; i++ )
-    {
-        if ( !GET_IOREQ_SERVER(d, i) )
-            break;
-    }
-
-    rc = -ENOSPC;
-    if ( i >= MAX_NR_IOREQ_SERVERS )
-        goto fail;
-
-    /*
-     * It is safe to call set_ioreq_server() prior to
-     * hvm_ioreq_server_init() since the target domain is paused.
-     */
-    set_ioreq_server(d, i, s);
-
-    rc = hvm_ioreq_server_init(s, d, bufioreq_handling, i);
-    if ( rc )
-    {
-        set_ioreq_server(d, i, NULL);
-        goto fail;
-    }
-
-    if ( id )
-        *id = i;
-
-    spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
-    domain_unpause(d);
-
-    return 0;
-
- fail:
-    spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
-    domain_unpause(d);
-
-    xfree(s);
-    return rc;
-}
-
-int hvm_destroy_ioreq_server(struct domain *d, ioservid_t id)
-{
-    struct hvm_ioreq_server *s;
-    int rc;
-
-    spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
-
-    s = get_ioreq_server(d, id);
-
-    rc = -ENOENT;
-    if ( !s )
-        goto out;
-
-    rc = -EPERM;
-    if ( s->emulator != current->domain )
-        goto out;
-
-    domain_pause(d);
-
-    p2m_set_ioreq_server(d, 0, s);
-
-    hvm_ioreq_server_disable(s);
-
-    /*
-     * It is safe to call hvm_ioreq_server_deinit() prior to
-     * set_ioreq_server() since the target domain is paused.
-     */
-    hvm_ioreq_server_deinit(s);
-    set_ioreq_server(d, id, NULL);
-
-    domain_unpause(d);
-
-    xfree(s);
-
-    rc = 0;
-
- out:
-    spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
-
-    return rc;
-}
-
-int hvm_get_ioreq_server_info(struct domain *d, ioservid_t id,
-                              unsigned long *ioreq_gfn,
-                              unsigned long *bufioreq_gfn,
-                              evtchn_port_t *bufioreq_port)
-{
-    struct hvm_ioreq_server *s;
-    int rc;
-
-    spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
-
-    s = get_ioreq_server(d, id);
-
-    rc = -ENOENT;
-    if ( !s )
-        goto out;
-
-    rc = -EPERM;
-    if ( s->emulator != current->domain )
-        goto out;
-
-    if ( ioreq_gfn || bufioreq_gfn )
-    {
-        rc = hvm_ioreq_server_map_pages(s);
-        if ( rc )
-            goto out;
-    }
-
-    if ( ioreq_gfn )
-        *ioreq_gfn = gfn_x(s->ioreq.gfn);
-
-    if ( HANDLE_BUFIOREQ(s) )
-    {
-        if ( bufioreq_gfn )
-            *bufioreq_gfn = gfn_x(s->bufioreq.gfn);
-
-        if ( bufioreq_port )
-            *bufioreq_port = s->bufioreq_evtchn;
-    }
-
-    rc = 0;
-
- out:
-    spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
-
-    return rc;
-}
-
-int hvm_get_ioreq_server_frame(struct domain *d, ioservid_t id,
-                               unsigned long idx, mfn_t *mfn)
-{
-    struct hvm_ioreq_server *s;
-    int rc;
-
-    ASSERT(is_hvm_domain(d));
-
-    spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
-
-    s = get_ioreq_server(d, id);
-
-    rc = -ENOENT;
-    if ( !s )
-        goto out;
-
-    rc = -EPERM;
-    if ( s->emulator != current->domain )
-        goto out;
-
-    rc = hvm_ioreq_server_alloc_pages(s);
-    if ( rc )
-        goto out;
-
-    switch ( idx )
-    {
-    case XENMEM_resource_ioreq_server_frame_bufioreq:
-        rc = -ENOENT;
-        if ( !HANDLE_BUFIOREQ(s) )
-            goto out;
-
-        *mfn = page_to_mfn(s->bufioreq.page);
-        rc = 0;
-        break;
-
-    case XENMEM_resource_ioreq_server_frame_ioreq(0):
-        *mfn = page_to_mfn(s->ioreq.page);
-        rc = 0;
-        break;
-
-    default:
-        rc = -EINVAL;
-        break;
-    }
-
- out:
-    spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
-
-    return rc;
-}
-
-int hvm_map_io_range_to_ioreq_server(struct domain *d, ioservid_t id,
-                                     uint32_t type, uint64_t start,
-                                     uint64_t end)
-{
-    struct hvm_ioreq_server *s;
-    struct rangeset *r;
-    int rc;
-
-    if ( start > end )
-        return -EINVAL;
-
-    spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
-
-    s = get_ioreq_server(d, id);
-
-    rc = -ENOENT;
-    if ( !s )
-        goto out;
-
-    rc = -EPERM;
-    if ( s->emulator != current->domain )
-        goto out;
-
-    switch ( type )
-    {
-    case XEN_DMOP_IO_RANGE_PORT:
-    case XEN_DMOP_IO_RANGE_MEMORY:
-    case XEN_DMOP_IO_RANGE_PCI:
-        r = s->range[type];
-        break;
-
-    default:
-        r = NULL;
-        break;
-    }
-
-    rc = -EINVAL;
-    if ( !r )
-        goto out;
-
-    rc = -EEXIST;
-    if ( rangeset_overlaps_range(r, start, end) )
-        goto out;
-
-    rc = rangeset_add_range(r, start, end);
-
- out:
-    spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
-
-    return rc;
-}
+#include <public/hvm/ioreq.h>
+#include <public/hvm/params.h>
 
-int hvm_unmap_io_range_from_ioreq_server(struct domain *d, ioservid_t id,
-                                         uint32_t type, uint64_t start,
-                                         uint64_t end)
+void handle_realmode_completion(void)
 {
-    struct hvm_ioreq_server *s;
-    struct rangeset *r;
-    int rc;
-
-    if ( start > end )
-        return -EINVAL;
-
-    spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
-
-    s = get_ioreq_server(d, id);
-
-    rc = -ENOENT;
-    if ( !s )
-        goto out;
-
-    rc = -EPERM;
-    if ( s->emulator != current->domain )
-        goto out;
-
-    switch ( type )
-    {
-    case XEN_DMOP_IO_RANGE_PORT:
-    case XEN_DMOP_IO_RANGE_MEMORY:
-    case XEN_DMOP_IO_RANGE_PCI:
-        r = s->range[type];
-        break;
-
-    default:
-        r = NULL;
-        break;
-    }
-
-    rc = -EINVAL;
-    if ( !r )
-        goto out;
-
-    rc = -ENOENT;
-    if ( !rangeset_contains_range(r, start, end) )
-        goto out;
-
-    rc = rangeset_remove_range(r, start, end);
-
- out:
-    spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
+    struct hvm_emulate_ctxt ctxt;
 
-    return rc;
+    hvm_emulate_init_once(&ctxt, NULL, guest_cpu_user_regs());
+    vmx_realmode_emulate_one(&ctxt);
+    hvm_emulate_writeback(&ctxt);
 }
 
 /*
@@ -1141,130 +89,12 @@  int hvm_map_mem_type_to_ioreq_server(struct domain *d, ioservid_t id,
     return rc;
 }
 
-int hvm_set_ioreq_server_state(struct domain *d, ioservid_t id,
-                               bool enabled)
-{
-    struct hvm_ioreq_server *s;
-    int rc;
-
-    spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
-
-    s = get_ioreq_server(d, id);
-
-    rc = -ENOENT;
-    if ( !s )
-        goto out;
-
-    rc = -EPERM;
-    if ( s->emulator != current->domain )
-        goto out;
-
-    domain_pause(d);
-
-    if ( enabled )
-        hvm_ioreq_server_enable(s);
-    else
-        hvm_ioreq_server_disable(s);
-
-    domain_unpause(d);
-
-    rc = 0;
-
- out:
-    spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
-    return rc;
-}
-
-int hvm_all_ioreq_servers_add_vcpu(struct domain *d, struct vcpu *v)
-{
-    struct hvm_ioreq_server *s;
-    unsigned int id;
-    int rc;
-
-    spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
-
-    FOR_EACH_IOREQ_SERVER(d, id, s)
-    {
-        rc = hvm_ioreq_server_add_vcpu(s, v);
-        if ( rc )
-            goto fail;
-    }
-
-    spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
-
-    return 0;
-
- fail:
-    while ( ++id != MAX_NR_IOREQ_SERVERS )
-    {
-        s = GET_IOREQ_SERVER(d, id);
-
-        if ( !s )
-            continue;
-
-        hvm_ioreq_server_remove_vcpu(s, v);
-    }
-
-    spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
-
-    return rc;
-}
-
-void hvm_all_ioreq_servers_remove_vcpu(struct domain *d, struct vcpu *v)
-{
-    struct hvm_ioreq_server *s;
-    unsigned int id;
-
-    spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
-
-    FOR_EACH_IOREQ_SERVER(d, id, s)
-        hvm_ioreq_server_remove_vcpu(s, v);
-
-    spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
-}
-
-void hvm_destroy_all_ioreq_servers(struct domain *d)
+void hvm_get_ioreq_server_range_type(struct domain *d,
+                                     ioreq_t *p,
+                                     uint8_t *type,
+                                     uint64_t *addr)
 {
-    struct hvm_ioreq_server *s;
-    unsigned int id;
-
-    if ( !relocate_portio_handler(d, 0xcf8, 0xcf8, 4) )
-        return;
-
-    spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
-
-    /* No need to domain_pause() as the domain is being torn down */
-
-    FOR_EACH_IOREQ_SERVER(d, id, s)
-    {
-        hvm_ioreq_server_disable(s);
-
-        /*
-         * It is safe to call hvm_ioreq_server_deinit() prior to
-         * set_ioreq_server() since the target domain is being destroyed.
-         */
-        hvm_ioreq_server_deinit(s);
-        set_ioreq_server(d, id, NULL);
-
-        xfree(s);
-    }
-
-    spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
-}
-
-struct hvm_ioreq_server *hvm_select_ioreq_server(struct domain *d,
-                                                 ioreq_t *p)
-{
-    struct hvm_ioreq_server *s;
-    uint32_t cf8;
-    uint8_t type;
-    uint64_t addr;
-    unsigned int id;
-
-    if ( p->type != IOREQ_TYPE_COPY && p->type != IOREQ_TYPE_PIO )
-        return NULL;
-
-    cf8 = d->arch.hvm.pci_cf8;
+    uint32_t cf8 = d->arch.hvm.pci_cf8;
 
     if ( p->type == IOREQ_TYPE_PIO &&
          (p->addr & ~3) == 0xcfc &&
@@ -1277,8 +107,8 @@  struct hvm_ioreq_server *hvm_select_ioreq_server(struct domain *d,
         reg = hvm_pci_decode_addr(cf8, p->addr, &sbdf);
 
         /* PCI config data cycle */
-        type = XEN_DMOP_IO_RANGE_PCI;
-        addr = ((uint64_t)sbdf.sbdf << 32) | reg;
+        *type = XEN_DMOP_IO_RANGE_PCI;
+        *addr = ((uint64_t)sbdf.sbdf << 32) | reg;
         /* AMD extended configuration space access? */
         if ( CF8_ADDR_HI(cf8) &&
              d->arch.cpuid->x86_vendor == X86_VENDOR_AMD &&
@@ -1290,230 +120,15 @@  struct hvm_ioreq_server *hvm_select_ioreq_server(struct domain *d,
 
             if ( !rdmsr_safe(MSR_AMD64_NB_CFG, msr_val) &&
                  (msr_val & (1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT)) )
-                addr |= CF8_ADDR_HI(cf8);
+                *addr |= CF8_ADDR_HI(cf8);
         }
     }
     else
     {
-        type = (p->type == IOREQ_TYPE_PIO) ?
-                XEN_DMOP_IO_RANGE_PORT : XEN_DMOP_IO_RANGE_MEMORY;
-        addr = p->addr;
-    }
-
-    FOR_EACH_IOREQ_SERVER(d, id, s)
-    {
-        struct rangeset *r;
-
-        if ( !s->enabled )
-            continue;
-
-        r = s->range[type];
-
-        switch ( type )
-        {
-            unsigned long start, end;
-
-        case XEN_DMOP_IO_RANGE_PORT:
-            start = addr;
-            end = start + p->size - 1;
-            if ( rangeset_contains_range(r, start, end) )
-                return s;
-
-            break;
-
-        case XEN_DMOP_IO_RANGE_MEMORY:
-            start = hvm_mmio_first_byte(p);
-            end = hvm_mmio_last_byte(p);
-
-            if ( rangeset_contains_range(r, start, end) )
-                return s;
-
-            break;
-
-        case XEN_DMOP_IO_RANGE_PCI:
-            if ( rangeset_contains_singleton(r, addr >> 32) )
-            {
-                p->type = IOREQ_TYPE_PCI_CONFIG;
-                p->addr = addr;
-                return s;
-            }
-
-            break;
-        }
-    }
-
-    return NULL;
-}
-
-static int hvm_send_buffered_ioreq(struct hvm_ioreq_server *s, ioreq_t *p)
-{
-    struct domain *d = current->domain;
-    struct hvm_ioreq_page *iorp;
-    buffered_iopage_t *pg;
-    buf_ioreq_t bp = { .data = p->data,
-                       .addr = p->addr,
-                       .type = p->type,
-                       .dir = p->dir };
-    /* Timeoffset sends 64b data, but no address. Use two consecutive slots. */
-    int qw = 0;
-
-    /* Ensure buffered_iopage fits in a page */
-    BUILD_BUG_ON(sizeof(buffered_iopage_t) > PAGE_SIZE);
-
-    iorp = &s->bufioreq;
-    pg = iorp->va;
-
-    if ( !pg )
-        return X86EMUL_UNHANDLEABLE;
-
-    /*
-     * Return 0 for the cases we can't deal with:
-     *  - 'addr' is only a 20-bit field, so we cannot address beyond 1MB
-     *  - we cannot buffer accesses to guest memory buffers, as the guest
-     *    may expect the memory buffer to be synchronously accessed
-     *  - the count field is usually used with data_is_ptr and since we don't
-     *    support data_is_ptr we do not waste space for the count field either
-     */
-    if ( (p->addr > 0xffffful) || p->data_is_ptr || (p->count != 1) )
-        return 0;
-
-    switch ( p->size )
-    {
-    case 1:
-        bp.size = 0;
-        break;
-    case 2:
-        bp.size = 1;
-        break;
-    case 4:
-        bp.size = 2;
-        break;
-    case 8:
-        bp.size = 3;
-        qw = 1;
-        break;
-    default:
-        gdprintk(XENLOG_WARNING, "unexpected ioreq size: %u\n", p->size);
-        return X86EMUL_UNHANDLEABLE;
-    }
-
-    spin_lock(&s->bufioreq_lock);
-
-    if ( (pg->ptrs.write_pointer - pg->ptrs.read_pointer) >=
-         (IOREQ_BUFFER_SLOT_NUM - qw) )
-    {
-        /* The queue is full: send the iopacket through the normal path. */
-        spin_unlock(&s->bufioreq_lock);
-        return X86EMUL_UNHANDLEABLE;
-    }
-
-    pg->buf_ioreq[pg->ptrs.write_pointer % IOREQ_BUFFER_SLOT_NUM] = bp;
-
-    if ( qw )
-    {
-        bp.data = p->data >> 32;
-        pg->buf_ioreq[(pg->ptrs.write_pointer+1) % IOREQ_BUFFER_SLOT_NUM] = bp;
-    }
-
-    /* Make the ioreq_t visible /before/ write_pointer. */
-    smp_wmb();
-    pg->ptrs.write_pointer += qw ? 2 : 1;
-
-    /* Canonicalize read/write pointers to prevent their overflow. */
-    while ( (s->bufioreq_handling == HVM_IOREQSRV_BUFIOREQ_ATOMIC) &&
-            qw++ < IOREQ_BUFFER_SLOT_NUM &&
-            pg->ptrs.read_pointer >= IOREQ_BUFFER_SLOT_NUM )
-    {
-        union bufioreq_pointers old = pg->ptrs, new;
-        unsigned int n = old.read_pointer / IOREQ_BUFFER_SLOT_NUM;
-
-        new.read_pointer = old.read_pointer - n * IOREQ_BUFFER_SLOT_NUM;
-        new.write_pointer = old.write_pointer - n * IOREQ_BUFFER_SLOT_NUM;
-        cmpxchg(&pg->ptrs.full, old.full, new.full);
-    }
-
-    notify_via_xen_event_channel(d, s->bufioreq_evtchn);
-    spin_unlock(&s->bufioreq_lock);
-
-    return X86EMUL_OKAY;
-}
-
-int hvm_send_ioreq(struct hvm_ioreq_server *s, ioreq_t *proto_p,
-                   bool buffered)
-{
-    struct vcpu *curr = current;
-    struct domain *d = curr->domain;
-    struct hvm_ioreq_vcpu *sv;
-
-    ASSERT(s);
-
-    if ( buffered )
-        return hvm_send_buffered_ioreq(s, proto_p);
-
-    if ( unlikely(!vcpu_start_shutdown_deferral(curr)) )
-        return X86EMUL_RETRY;
-
-    list_for_each_entry ( sv,
-                          &s->ioreq_vcpu_list,
-                          list_entry )
-    {
-        if ( sv->vcpu == curr )
-        {
-            evtchn_port_t port = sv->ioreq_evtchn;
-            ioreq_t *p = get_ioreq(s, curr);
-
-            if ( unlikely(p->state != STATE_IOREQ_NONE) )
-            {
-                gprintk(XENLOG_ERR, "device model set bad IO state %d\n",
-                        p->state);
-                break;
-            }
-
-            if ( unlikely(p->vp_eport != port) )
-            {
-                gprintk(XENLOG_ERR, "device model set bad event channel %d\n",
-                        p->vp_eport);
-                break;
-            }
-
-            proto_p->state = STATE_IOREQ_NONE;
-            proto_p->vp_eport = port;
-            *p = *proto_p;
-
-            prepare_wait_on_xen_event_channel(port);
-
-            /*
-             * Following happens /after/ blocking and setting up ioreq
-             * contents. prepare_wait_on_xen_event_channel() is an implicit
-             * barrier.
-             */
-            p->state = STATE_IOREQ_READY;
-            notify_via_xen_event_channel(d, port);
-
-            sv->pending = true;
-            return X86EMUL_RETRY;
-        }
-    }
-
-    return X86EMUL_UNHANDLEABLE;
-}
-
-unsigned int hvm_broadcast_ioreq(ioreq_t *p, bool buffered)
-{
-    struct domain *d = current->domain;
-    struct hvm_ioreq_server *s;
-    unsigned int id, failed = 0;
-
-    FOR_EACH_IOREQ_SERVER(d, id, s)
-    {
-        if ( !s->enabled )
-            continue;
-
-        if ( hvm_send_ioreq(s, p, buffered) == X86EMUL_UNHANDLEABLE )
-            failed++;
+        *type = (p->type == IOREQ_TYPE_PIO) ?
+                 XEN_DMOP_IO_RANGE_PORT : XEN_DMOP_IO_RANGE_MEMORY;
+        *addr = p->addr;
     }
-
-    return failed;
 }
 
 static int hvm_access_cf8(
@@ -1528,13 +143,19 @@  static int hvm_access_cf8(
     return X86EMUL_UNHANDLEABLE;
 }
 
-void hvm_ioreq_init(struct domain *d)
+void arch_hvm_ioreq_init(struct domain *d)
 {
     spin_lock_init(&d->arch.hvm.ioreq_server.lock);
 
     register_portio_handler(d, 0xcf8, 4, hvm_access_cf8);
 }
 
+bool arch_hvm_ioreq_destroy(struct domain *d)
+{
+    return relocate_portio_handler(d, 0xcf8, 0xcf8, 4);
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/arch/x86/hvm/stdvga.c b/xen/arch/x86/hvm/stdvga.c
index e267513..ab6d315 100644
--- a/xen/arch/x86/hvm/stdvga.c
+++ b/xen/arch/x86/hvm/stdvga.c
@@ -27,10 +27,10 @@ 
  *  can have side effects.
  */
 
+#include <xen/hvm/ioreq.h>
 #include <xen/types.h>
 #include <xen/sched.h>
 #include <xen/domain_page.h>
-#include <asm/hvm/ioreq.h>
 #include <asm/hvm/support.h>
 #include <xen/numa.h>
 #include <xen/paging.h>
diff --git a/xen/arch/x86/hvm/vmx/realmode.c b/xen/arch/x86/hvm/vmx/realmode.c
index bdbd9cb..b804262 100644
--- a/xen/arch/x86/hvm/vmx/realmode.c
+++ b/xen/arch/x86/hvm/vmx/realmode.c
@@ -9,6 +9,7 @@ 
  *    Keir Fraser <keir@xen.org>
  */
 
+#include <xen/hvm/ioreq.h>
 #include <xen/init.h>
 #include <xen/lib.h>
 #include <xen/sched.h>
diff --git a/xen/arch/x86/hvm/vmx/vvmx.c b/xen/arch/x86/hvm/vmx/vvmx.c
index 7dfff6c..acfeb1c 100644
--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -18,11 +18,11 @@ 
  * this program; If not, see <http://www.gnu.org/licenses/>.
  *
  */
+#include <xen/hvm/ioreq.h>
 
 #include <asm/types.h>
 #include <asm/mtrr.h>
 #include <asm/p2m.h>
-#include <asm/hvm/ioreq.h>
 #include <asm/hvm/vmx/vmx.h>
 #include <asm/hvm/vmx/vvmx.h>
 #include <asm/hvm/nestedhvm.h>
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 82bc676..2b06e15 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -99,6 +99,7 @@ 
  * doing the final put_page(), and remove it from the iommu if so.
  */
 
+#include <xen/hvm/ioreq.h>
 #include <xen/init.h>
 #include <xen/kernel.h>
 #include <xen/lib.h>
@@ -141,7 +142,6 @@ 
 #include <asm/io_apic.h>
 #include <asm/pci.h>
 #include <asm/guest.h>
-#include <asm/hvm/ioreq.h>
 
 #include <asm/hvm/grant_table.h>
 #include <asm/pv/domain.h>
diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
index 7737773..c84cbb2 100644
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -20,6 +20,7 @@ 
  * along with this program; If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <xen/hvm/ioreq.h>
 #include <xen/types.h>
 #include <xen/mm.h>
 #include <xen/trace.h>
@@ -34,7 +35,6 @@ 
 #include <asm/current.h>
 #include <asm/flushtlb.h>
 #include <asm/shadow.h>
-#include <asm/hvm/ioreq.h>
 #include <xen/numa.h>
 #include "private.h"
 
diff --git a/xen/common/Kconfig b/xen/common/Kconfig
index 15e3b79..fb6fb51 100644
--- a/xen/common/Kconfig
+++ b/xen/common/Kconfig
@@ -139,6 +139,9 @@  config HYPFS_CONFIG
 	  Disable this option in case you want to spare some memory or you
 	  want to hide the .config contents from dom0.
 
+config IOREQ_SERVER
+	bool
+
 config KEXEC
 	bool "kexec support"
 	default y
diff --git a/xen/common/Makefile b/xen/common/Makefile
index 06881d0..f6fc3f8 100644
--- a/xen/common/Makefile
+++ b/xen/common/Makefile
@@ -70,6 +70,7 @@  extra-y := symbols-dummy.o
 
 obj-$(CONFIG_COVERAGE) += coverage/
 obj-y += sched/
+obj-$(CONFIG_IOREQ_SERVER) += hvm/
 obj-$(CONFIG_UBSAN) += ubsan/
 
 obj-$(CONFIG_NEEDS_LIBELF) += libelf/
diff --git a/xen/common/hvm/Makefile b/xen/common/hvm/Makefile
new file mode 100644
index 0000000..326215d
--- /dev/null
+++ b/xen/common/hvm/Makefile
@@ -0,0 +1 @@ 
+obj-y += ioreq.o
diff --git a/xen/common/hvm/ioreq.c b/xen/common/hvm/ioreq.c
new file mode 100644
index 0000000..7e1fa23
--- /dev/null
+++ b/xen/common/hvm/ioreq.c
@@ -0,0 +1,1430 @@ 
+/*
+ * hvm/ioreq.c: hardware virtual machine I/O emulation
+ *
+ * Copyright (c) 2016 Citrix Systems Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <xen/ctype.h>
+#include <xen/domain.h>
+#include <xen/domain_page.h>
+#include <xen/event.h>
+#include <xen/hvm/ioreq.h>
+#include <xen/init.h>
+#include <xen/irq.h>
+#include <xen/lib.h>
+#include <xen/paging.h>
+#include <xen/sched.h>
+#include <xen/softirq.h>
+#include <xen/trace.h>
+#include <xen/vpci.h>
+
+#include <public/hvm/dm_op.h>
+#include <public/hvm/ioreq.h>
+#include <public/hvm/params.h>
+
+static void set_ioreq_server(struct domain *d, unsigned int id,
+                             struct hvm_ioreq_server *s)
+{
+    ASSERT(id < MAX_NR_IOREQ_SERVERS);
+    ASSERT(!s || !d->arch.hvm.ioreq_server.server[id]);
+
+    d->arch.hvm.ioreq_server.server[id] = s;
+}
+
+/*
+ * Iterate over all possible ioreq servers.
+ *
+ * NOTE: The iteration is backwards such that more recently created
+ *       ioreq servers are favoured in hvm_select_ioreq_server().
+ *       This is a semantic that previously existed when ioreq servers
+ *       were held in a linked list.
+ */
+#define FOR_EACH_IOREQ_SERVER(d, id, s) \
+    for ( (id) = MAX_NR_IOREQ_SERVERS; (id) != 0; ) \
+        if ( !(s = GET_IOREQ_SERVER(d, --(id))) ) \
+            continue; \
+        else
+
+static ioreq_t *get_ioreq(struct hvm_ioreq_server *s, struct vcpu *v)
+{
+    shared_iopage_t *p = s->ioreq.va;
+
+    ASSERT((v == current) || !vcpu_runnable(v));
+    ASSERT(p != NULL);
+
+    return &p->vcpu_ioreq[v->vcpu_id];
+}
+
+bool hvm_io_pending(struct vcpu *v)
+{
+    struct domain *d = v->domain;
+    struct hvm_ioreq_server *s;
+    unsigned int id;
+
+    FOR_EACH_IOREQ_SERVER(d, id, s)
+    {
+        struct hvm_ioreq_vcpu *sv;
+
+        list_for_each_entry ( sv,
+                              &s->ioreq_vcpu_list,
+                              list_entry )
+        {
+            if ( sv->vcpu == v && sv->pending )
+                return true;
+        }
+    }
+
+    return false;
+}
+
+static void hvm_io_assist(struct hvm_ioreq_vcpu *sv, uint64_t data)
+{
+    struct vcpu *v = sv->vcpu;
+    ioreq_t *ioreq = &v->arch.hvm.hvm_io.io_req;
+
+    if ( hvm_ioreq_needs_completion(ioreq) )
+        ioreq->data = data;
+
+    sv->pending = false;
+}
+
+static bool hvm_wait_for_io(struct hvm_ioreq_vcpu *sv, ioreq_t *p)
+{
+    unsigned int prev_state = STATE_IOREQ_NONE;
+
+    while ( sv->pending )
+    {
+        unsigned int state = p->state;
+
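+        /*
+         * The response fields of the ioreq are only valid once the
+         * state change has been observed.
+         */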
+        smp_rmb();
+
+    recheck:
+        if ( unlikely(state == STATE_IOREQ_NONE) )
+        {
+            /*
+             * The only reason we should see this case is when an
+             * emulator is dying and it races with an I/O being
+             * requested.
+             */
+            hvm_io_assist(sv, ~0ul);
+            break;
+        }
+
+        if ( unlikely(state < prev_state) )
+        {
+            gdprintk(XENLOG_ERR, "Weird HVM ioreq state transition %u -> %u\n",
+                     prev_state, state);
+            sv->pending = false;
+            domain_crash(sv->vcpu->domain);
+            return false; /* bail */
+        }
+
+        switch ( prev_state = state )
+        {
+        case STATE_IORESP_READY: /* IORESP_READY -> NONE */
+            p->state = STATE_IOREQ_NONE;
+            hvm_io_assist(sv, p->data);
+            break;
+        case STATE_IOREQ_READY:  /* IOREQ_{READY,INPROCESS} -> IORESP_READY */
+        case STATE_IOREQ_INPROCESS:
+            wait_on_xen_event_channel(sv->ioreq_evtchn,
+                                      ({ state = p->state;
+                                         smp_rmb();
+                                         state != prev_state; }));
+            goto recheck;
+        default:
+            gdprintk(XENLOG_ERR, "Weird HVM iorequest state %u\n", state);
+            sv->pending = false;
+            domain_crash(sv->vcpu->domain);
+            return false; /* bail */
+        }
+    }
+
+    return true;
+}
+
+bool handle_hvm_io_completion(struct vcpu *v)
+{
+    struct domain *d = v->domain;
+    struct hvm_vcpu_io *vio = &v->arch.hvm.hvm_io;
+    struct hvm_ioreq_server *s;
+    enum hvm_io_completion io_completion;
+    unsigned int id;
+
+    if ( has_vpci(d) && vpci_process_pending(v) )
+    {
+        raise_softirq(SCHEDULE_SOFTIRQ);
+        return false;
+    }
+
+    FOR_EACH_IOREQ_SERVER(d, id, s)
+    {
+        struct hvm_ioreq_vcpu *sv;
+
+        list_for_each_entry ( sv,
+                              &s->ioreq_vcpu_list,
+                              list_entry )
+        {
+            if ( sv->vcpu == v && sv->pending )
+            {
+                if ( !hvm_wait_for_io(sv, get_ioreq(s, v)) )
+                    return false;
+
+                break;
+            }
+        }
+    }
+
+    vio->io_req.state = hvm_ioreq_needs_completion(&vio->io_req) ?
+        STATE_IORESP_READY : STATE_IOREQ_NONE;
+
+    msix_write_completion(v);
+    vcpu_end_shutdown_deferral(v);
+
+    io_completion = vio->io_completion;
+    vio->io_completion = HVMIO_no_completion;
+
+    switch ( io_completion )
+    {
+    case HVMIO_no_completion:
+        break;
+
+    case HVMIO_mmio_completion:
+        return handle_mmio();
+
+    case HVMIO_pio_completion:
+        return handle_pio(vio->io_req.addr, vio->io_req.size,
+                          vio->io_req.dir);
+
+    case HVMIO_realmode_completion:
+        handle_realmode_completion();
+        break;
+
+    default:
+        ASSERT_UNREACHABLE();
+        break;
+    }
+
+    return true;
+}
+
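+/*
+ * The HVM_PARAM_IOREQ_PFN and HVM_PARAM_BUFIOREQ_PFN params nominate
+ * 'legacy' gfns, which are used if the dedicated ioreq gfn range is
+ * exhausted.
+ */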
+static gfn_t hvm_alloc_legacy_ioreq_gfn(struct hvm_ioreq_server *s)
+{
+    struct domain *d = s->target;
+    unsigned int i;
+
+    BUILD_BUG_ON(HVM_PARAM_BUFIOREQ_PFN != HVM_PARAM_IOREQ_PFN + 1);
+
+    for ( i = HVM_PARAM_IOREQ_PFN; i <= HVM_PARAM_BUFIOREQ_PFN; i++ )
+    {
+        if ( !test_and_clear_bit(i, &d->arch.hvm.ioreq_gfn.legacy_mask) )
+            return _gfn(d->arch.hvm.params[i]);
+    }
+
+    return INVALID_GFN;
+}
+
+static gfn_t hvm_alloc_ioreq_gfn(struct hvm_ioreq_server *s)
+{
+    struct domain *d = s->target;
+    unsigned int i;
+
+    for ( i = 0; i < sizeof(d->arch.hvm.ioreq_gfn.mask) * 8; i++ )
+    {
+        if ( test_and_clear_bit(i, &d->arch.hvm.ioreq_gfn.mask) )
+            return _gfn(d->arch.hvm.ioreq_gfn.base + i);
+    }
+
+    /*
+     * If we are out of 'normal' GFNs then we may still have a 'legacy'
+     * GFN available.
+     */
+    return hvm_alloc_legacy_ioreq_gfn(s);
+}
+
+static bool hvm_free_legacy_ioreq_gfn(struct hvm_ioreq_server *s,
+                                      gfn_t gfn)
+{
+    struct domain *d = s->target;
+    unsigned int i;
+
+    for ( i = HVM_PARAM_IOREQ_PFN; i <= HVM_PARAM_BUFIOREQ_PFN; i++ )
+    {
+        if ( gfn_eq(gfn, _gfn(d->arch.hvm.params[i])) )
+             break;
+    }
+    if ( i > HVM_PARAM_BUFIOREQ_PFN )
+        return false;
+
+    set_bit(i, &d->arch.hvm.ioreq_gfn.legacy_mask);
+    return true;
+}
+
+static void hvm_free_ioreq_gfn(struct hvm_ioreq_server *s, gfn_t gfn)
+{
+    struct domain *d = s->target;
+    unsigned int i = gfn_x(gfn) - d->arch.hvm.ioreq_gfn.base;
+
+    ASSERT(!gfn_eq(gfn, INVALID_GFN));
+
+    if ( !hvm_free_legacy_ioreq_gfn(s, gfn) )
+    {
+        ASSERT(i < sizeof(d->arch.hvm.ioreq_gfn.mask) * 8);
+        set_bit(i, &d->arch.hvm.ioreq_gfn.mask);
+    }
+}
+
+static void hvm_unmap_ioreq_gfn(struct hvm_ioreq_server *s, bool buf)
+{
+    struct hvm_ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
+
+    if ( gfn_eq(iorp->gfn, INVALID_GFN) )
+        return;
+
+    destroy_ring_for_helper(&iorp->va, iorp->page);
+    iorp->page = NULL;
+
+    hvm_free_ioreq_gfn(s, iorp->gfn);
+    iorp->gfn = INVALID_GFN;
+}
+
+static int hvm_map_ioreq_gfn(struct hvm_ioreq_server *s, bool buf)
+{
+    struct domain *d = s->target;
+    struct hvm_ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
+    int rc;
+
+    if ( iorp->page )
+    {
+        /*
+         * If a page has already been allocated (which will happen on
+         * demand if hvm_get_ioreq_server_frame() is called), then
+         * mapping a guest frame is not permitted.
+         */
+        if ( gfn_eq(iorp->gfn, INVALID_GFN) )
+            return -EPERM;
+
+        return 0;
+    }
+
+    if ( d->is_dying )
+        return -EINVAL;
+
+    iorp->gfn = hvm_alloc_ioreq_gfn(s);
+
+    if ( gfn_eq(iorp->gfn, INVALID_GFN) )
+        return -ENOMEM;
+
+    rc = prepare_ring_for_helper(d, gfn_x(iorp->gfn), &iorp->page,
+                                 &iorp->va);
+
+    if ( rc )
+        hvm_unmap_ioreq_gfn(s, buf);
+
+    return rc;
+}
+
+static int hvm_alloc_ioreq_mfn(struct hvm_ioreq_server *s, bool buf)
+{
+    struct hvm_ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
+    struct page_info *page;
+
+    if ( iorp->page )
+    {
+        /*
+         * If a guest frame has already been mapped (which may happen
+         * on demand if hvm_get_ioreq_server_info() is called), then
+         * allocating a page is not permitted.
+         */
+        if ( !gfn_eq(iorp->gfn, INVALID_GFN) )
+            return -EPERM;
+
+        return 0;
+    }
+
+    page = alloc_domheap_page(s->target, MEMF_no_refcount);
+
+    if ( !page )
+        return -ENOMEM;
+
+    if ( !get_page_and_type(page, s->target, PGT_writable_page) )
+    {
+        /*
+         * The domain can't possibly know about this page yet, so failure
+         * here is a clear indication of something fishy going on.
+         */
+        domain_crash(s->emulator);
+        return -ENODATA;
+    }
+
+    iorp->va = __map_domain_page_global(page);
+    if ( !iorp->va )
+        goto fail;
+
+    iorp->page = page;
+    clear_page(iorp->va);
+    return 0;
+
+ fail:
+    put_page_alloc_ref(page);
+    put_page_and_type(page);
+
+    return -ENOMEM;
+}
+
+static void hvm_free_ioreq_mfn(struct hvm_ioreq_server *s, bool buf)
+{
+    struct hvm_ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
+    struct page_info *page = iorp->page;
+
+    if ( !page )
+        return;
+
+    iorp->page = NULL;
+
+    unmap_domain_page_global(iorp->va);
+    iorp->va = NULL;
+
+    put_page_alloc_ref(page);
+    put_page_and_type(page);
+}
+
+bool is_ioreq_server_page(struct domain *d, const struct page_info *page)
+{
+    const struct hvm_ioreq_server *s;
+    unsigned int id;
+    bool found = false;
+
+    spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+    FOR_EACH_IOREQ_SERVER(d, id, s)
+    {
+        if ( (s->ioreq.page == page) || (s->bufioreq.page == page) )
+        {
+            found = true;
+            break;
+        }
+    }
+
+    spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+    return found;
+}
+
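+/* Take the ioreq page out of the guest physmap while it is in use. */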
+static void hvm_remove_ioreq_gfn(struct hvm_ioreq_server *s, bool buf)
+{
+    struct domain *d = s->target;
+    struct hvm_ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
+
+    if ( gfn_eq(iorp->gfn, INVALID_GFN) )
+        return;
+
+    if ( guest_physmap_remove_page(d, iorp->gfn,
+                                   page_to_mfn(iorp->page), 0) )
+        domain_crash(d);
+    clear_page(iorp->va);
+}
+
+static int hvm_add_ioreq_gfn(struct hvm_ioreq_server *s, bool buf)
+{
+    struct domain *d = s->target;
+    struct hvm_ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
+    int rc;
+
+    if ( gfn_eq(iorp->gfn, INVALID_GFN) )
+        return 0;
+
+    clear_page(iorp->va);
+
+    rc = guest_physmap_add_page(d, iorp->gfn,
+                                page_to_mfn(iorp->page), 0);
+    if ( rc == 0 )
+        paging_mark_pfn_dirty(d, _pfn(gfn_x(iorp->gfn)));
+
+    return rc;
+}
+
+static void hvm_update_ioreq_evtchn(struct hvm_ioreq_server *s,
+                                    struct hvm_ioreq_vcpu *sv)
+{
+    ASSERT(spin_is_locked(&s->lock));
+
+    if ( s->ioreq.va != NULL )
+    {
+        ioreq_t *p = get_ioreq(s, sv->vcpu);
+
+        p->vp_eport = sv->ioreq_evtchn;
+    }
+}
+
+#define HANDLE_BUFIOREQ(s) \
+    ((s)->bufioreq_handling != HVM_IOREQSRV_BUFIOREQ_OFF)
+
+static int hvm_ioreq_server_add_vcpu(struct hvm_ioreq_server *s,
+                                     struct vcpu *v)
+{
+    struct hvm_ioreq_vcpu *sv;
+    int rc;
+
+    sv = xzalloc(struct hvm_ioreq_vcpu);
+
+    rc = -ENOMEM;
+    if ( !sv )
+        goto fail1;
+
+    spin_lock(&s->lock);
+
+    rc = alloc_unbound_xen_event_channel(v->domain, v->vcpu_id,
+                                         s->emulator->domain_id, NULL);
+    if ( rc < 0 )
+        goto fail2;
+
+    sv->ioreq_evtchn = rc;
+
+    if ( v->vcpu_id == 0 && HANDLE_BUFIOREQ(s) )
+    {
+        rc = alloc_unbound_xen_event_channel(v->domain, 0,
+                                             s->emulator->domain_id, NULL);
+        if ( rc < 0 )
+            goto fail3;
+
+        s->bufioreq_evtchn = rc;
+    }
+
+    sv->vcpu = v;
+
+    list_add(&sv->list_entry, &s->ioreq_vcpu_list);
+
+    if ( s->enabled )
+        hvm_update_ioreq_evtchn(s, sv);
+
+    spin_unlock(&s->lock);
+    return 0;
+
+ fail3:
+    free_xen_event_channel(v->domain, sv->ioreq_evtchn);
+
+ fail2:
+    spin_unlock(&s->lock);
+    xfree(sv);
+
+ fail1:
+    return rc;
+}
+
+static void hvm_ioreq_server_remove_vcpu(struct hvm_ioreq_server *s,
+                                         struct vcpu *v)
+{
+    struct hvm_ioreq_vcpu *sv;
+
+    spin_lock(&s->lock);
+
+    list_for_each_entry ( sv,
+                          &s->ioreq_vcpu_list,
+                          list_entry )
+    {
+        if ( sv->vcpu != v )
+            continue;
+
+        list_del(&sv->list_entry);
+
+        if ( v->vcpu_id == 0 && HANDLE_BUFIOREQ(s) )
+            free_xen_event_channel(v->domain, s->bufioreq_evtchn);
+
+        free_xen_event_channel(v->domain, sv->ioreq_evtchn);
+
+        xfree(sv);
+        break;
+    }
+
+    spin_unlock(&s->lock);
+}
+
+static void hvm_ioreq_server_remove_all_vcpus(struct hvm_ioreq_server *s)
+{
+    struct hvm_ioreq_vcpu *sv, *next;
+
+    spin_lock(&s->lock);
+
+    list_for_each_entry_safe ( sv,
+                               next,
+                               &s->ioreq_vcpu_list,
+                               list_entry )
+    {
+        struct vcpu *v = sv->vcpu;
+
+        list_del(&sv->list_entry);
+
+        if ( v->vcpu_id == 0 && HANDLE_BUFIOREQ(s) )
+            free_xen_event_channel(v->domain, s->bufioreq_evtchn);
+
+        free_xen_event_channel(v->domain, sv->ioreq_evtchn);
+
+        xfree(sv);
+    }
+
+    spin_unlock(&s->lock);
+}
+
+static int hvm_ioreq_server_map_pages(struct hvm_ioreq_server *s)
+{
+    int rc;
+
+    rc = hvm_map_ioreq_gfn(s, false);
+
+    if ( !rc && HANDLE_BUFIOREQ(s) )
+        rc = hvm_map_ioreq_gfn(s, true);
+
+    if ( rc )
+        hvm_unmap_ioreq_gfn(s, false);
+
+    return rc;
+}
+
+static void hvm_ioreq_server_unmap_pages(struct hvm_ioreq_server *s)
+{
+    hvm_unmap_ioreq_gfn(s, true);
+    hvm_unmap_ioreq_gfn(s, false);
+}
+
+static int hvm_ioreq_server_alloc_pages(struct hvm_ioreq_server *s)
+{
+    int rc;
+
+    rc = hvm_alloc_ioreq_mfn(s, false);
+
+    if ( !rc && (s->bufioreq_handling != HVM_IOREQSRV_BUFIOREQ_OFF) )
+        rc = hvm_alloc_ioreq_mfn(s, true);
+
+    if ( rc )
+        hvm_free_ioreq_mfn(s, false);
+
+    return rc;
+}
+
+static void hvm_ioreq_server_free_pages(struct hvm_ioreq_server *s)
+{
+    hvm_free_ioreq_mfn(s, true);
+    hvm_free_ioreq_mfn(s, false);
+}
+
+static void hvm_ioreq_server_free_rangesets(struct hvm_ioreq_server *s)
+{
+    unsigned int i;
+
+    for ( i = 0; i < NR_IO_RANGE_TYPES; i++ )
+        rangeset_destroy(s->range[i]);
+}
+
+static int hvm_ioreq_server_alloc_rangesets(struct hvm_ioreq_server *s,
+                                            ioservid_t id)
+{
+    unsigned int i;
+    int rc;
+
+    for ( i = 0; i < NR_IO_RANGE_TYPES; i++ )
+    {
+        char *name;
+
+        rc = asprintf(&name, "ioreq_server %d %s", id,
+                      (i == XEN_DMOP_IO_RANGE_PORT) ? "port" :
+                      (i == XEN_DMOP_IO_RANGE_MEMORY) ? "memory" :
+                      (i == XEN_DMOP_IO_RANGE_PCI) ? "pci" :
+                      "");
+        if ( rc )
+            goto fail;
+
+        s->range[i] = rangeset_new(s->target, name,
+                                   RANGESETF_prettyprint_hex);
+
+        xfree(name);
+
+        rc = -ENOMEM;
+        if ( !s->range[i] )
+            goto fail;
+
+        rangeset_limit(s->range[i], MAX_NR_IO_RANGES);
+    }
+
+    return 0;
+
+ fail:
+    hvm_ioreq_server_free_rangesets(s);
+
+    return rc;
+}
+
+static void hvm_ioreq_server_enable(struct hvm_ioreq_server *s)
+{
+    struct hvm_ioreq_vcpu *sv;
+
+    spin_lock(&s->lock);
+
+    if ( s->enabled )
+        goto done;
+
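+    /*
+     * Hide the ioreq pages from the guest while the server is
+     * enabled; hvm_ioreq_server_disable() puts them back.
+     */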
+    hvm_remove_ioreq_gfn(s, false);
+    hvm_remove_ioreq_gfn(s, true);
+
+    s->enabled = true;
+
+    list_for_each_entry ( sv,
+                          &s->ioreq_vcpu_list,
+                          list_entry )
+        hvm_update_ioreq_evtchn(s, sv);
+
+  done:
+    spin_unlock(&s->lock);
+}
+
+static void hvm_ioreq_server_disable(struct hvm_ioreq_server *s)
+{
+    spin_lock(&s->lock);
+
+    if ( !s->enabled )
+        goto done;
+
+    hvm_add_ioreq_gfn(s, true);
+    hvm_add_ioreq_gfn(s, false);
+
+    s->enabled = false;
+
+ done:
+    spin_unlock(&s->lock);
+}
+
+static int hvm_ioreq_server_init(struct hvm_ioreq_server *s,
+                                 struct domain *d, int bufioreq_handling,
+                                 ioservid_t id)
+{
+    struct domain *currd = current->domain;
+    struct vcpu *v;
+    int rc;
+
+    s->target = d;
+
+    get_knownalive_domain(currd);
+    s->emulator = currd;
+
+    spin_lock_init(&s->lock);
+    INIT_LIST_HEAD(&s->ioreq_vcpu_list);
+    spin_lock_init(&s->bufioreq_lock);
+
+    s->ioreq.gfn = INVALID_GFN;
+    s->bufioreq.gfn = INVALID_GFN;
+
+    rc = hvm_ioreq_server_alloc_rangesets(s, id);
+    if ( rc )
+        return rc;
+
+    s->bufioreq_handling = bufioreq_handling;
+
+    for_each_vcpu ( d, v )
+    {
+        rc = hvm_ioreq_server_add_vcpu(s, v);
+        if ( rc )
+            goto fail_add;
+    }
+
+    return 0;
+
+ fail_add:
+    hvm_ioreq_server_remove_all_vcpus(s);
+    hvm_ioreq_server_unmap_pages(s);
+
+    hvm_ioreq_server_free_rangesets(s);
+
+    put_domain(s->emulator);
+    return rc;
+}
+
+static void hvm_ioreq_server_deinit(struct hvm_ioreq_server *s)
+{
+    ASSERT(!s->enabled);
+    hvm_ioreq_server_remove_all_vcpus(s);
+
+    /*
+     * NOTE: It is safe to call both hvm_ioreq_server_unmap_pages() and
+     *       hvm_ioreq_server_free_pages() in that order.
+     *       This is because the former will do nothing if the pages
+     *       are not mapped, leaving the page to be freed by the latter.
+     *       However if the pages are mapped then the former will set
+     *       the page_info pointer to NULL, meaning the latter will do
+     *       nothing.
+     */
+    hvm_ioreq_server_unmap_pages(s);
+    hvm_ioreq_server_free_pages(s);
+
+    hvm_ioreq_server_free_rangesets(s);
+
+    put_domain(s->emulator);
+}
+
+int hvm_create_ioreq_server(struct domain *d, int bufioreq_handling,
+                            ioservid_t *id)
+{
+    struct hvm_ioreq_server *s;
+    unsigned int i;
+    int rc;
+
+    if ( bufioreq_handling > HVM_IOREQSRV_BUFIOREQ_ATOMIC )
+        return -EINVAL;
+
+    s = xzalloc(struct hvm_ioreq_server);
+    if ( !s )
+        return -ENOMEM;
+
+    domain_pause(d);
+    spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+    for ( i = 0; i < MAX_NR_IOREQ_SERVERS; i++ )
+    {
+        if ( !GET_IOREQ_SERVER(d, i) )
+            break;
+    }
+
+    rc = -ENOSPC;
+    if ( i >= MAX_NR_IOREQ_SERVERS )
+        goto fail;
+
+    /*
+     * It is safe to call set_ioreq_server() prior to
+     * hvm_ioreq_server_init() since the target domain is paused.
+     */
+    set_ioreq_server(d, i, s);
+
+    rc = hvm_ioreq_server_init(s, d, bufioreq_handling, i);
+    if ( rc )
+    {
+        set_ioreq_server(d, i, NULL);
+        goto fail;
+    }
+
+    if ( id )
+        *id = i;
+
+    spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
+    domain_unpause(d);
+
+    return 0;
+
+ fail:
+    spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
+    domain_unpause(d);
+
+    xfree(s);
+    return rc;
+}
+
+int hvm_destroy_ioreq_server(struct domain *d, ioservid_t id)
+{
+    struct hvm_ioreq_server *s;
+    int rc;
+
+    spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+    s = get_ioreq_server(d, id);
+
+    rc = -ENOENT;
+    if ( !s )
+        goto out;
+
+    rc = -EPERM;
+    if ( s->emulator != current->domain )
+        goto out;
+
+    domain_pause(d);
+
+    p2m_set_ioreq_server(d, 0, s);
+
+    hvm_ioreq_server_disable(s);
+
+    /*
+     * It is safe to call hvm_ioreq_server_deinit() prior to
+     * set_ioreq_server() since the target domain is paused.
+     */
+    hvm_ioreq_server_deinit(s);
+    set_ioreq_server(d, id, NULL);
+
+    domain_unpause(d);
+
+    xfree(s);
+
+    rc = 0;
+
+ out:
+    spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+    return rc;
+}
+
+int hvm_get_ioreq_server_info(struct domain *d, ioservid_t id,
+                              unsigned long *ioreq_gfn,
+                              unsigned long *bufioreq_gfn,
+                              evtchn_port_t *bufioreq_port)
+{
+    struct hvm_ioreq_server *s;
+    int rc;
+
+    spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+    s = get_ioreq_server(d, id);
+
+    rc = -ENOENT;
+    if ( !s )
+        goto out;
+
+    rc = -EPERM;
+    if ( s->emulator != current->domain )
+        goto out;
+
+    if ( ioreq_gfn || bufioreq_gfn )
+    {
+        rc = hvm_ioreq_server_map_pages(s);
+        if ( rc )
+            goto out;
+    }
+
+    if ( ioreq_gfn )
+        *ioreq_gfn = gfn_x(s->ioreq.gfn);
+
+    if ( HANDLE_BUFIOREQ(s) )
+    {
+        if ( bufioreq_gfn )
+            *bufioreq_gfn = gfn_x(s->bufioreq.gfn);
+
+        if ( bufioreq_port )
+            *bufioreq_port = s->bufioreq_evtchn;
+    }
+
+    rc = 0;
+
+ out:
+    spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+    return rc;
+}
+
+int hvm_get_ioreq_server_frame(struct domain *d, ioservid_t id,
+                               unsigned long idx, mfn_t *mfn)
+{
+    struct hvm_ioreq_server *s;
+    int rc;
+
+    ASSERT(is_hvm_domain(d));
+
+    spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+    s = get_ioreq_server(d, id);
+
+    rc = -ENOENT;
+    if ( !s )
+        goto out;
+
+    rc = -EPERM;
+    if ( s->emulator != current->domain )
+        goto out;
+
+    rc = hvm_ioreq_server_alloc_pages(s);
+    if ( rc )
+        goto out;
+
+    switch ( idx )
+    {
+    case XENMEM_resource_ioreq_server_frame_bufioreq:
+        rc = -ENOENT;
+        if ( !HANDLE_BUFIOREQ(s) )
+            goto out;
+
+        *mfn = page_to_mfn(s->bufioreq.page);
+        rc = 0;
+        break;
+
+    case XENMEM_resource_ioreq_server_frame_ioreq(0):
+        *mfn = page_to_mfn(s->ioreq.page);
+        rc = 0;
+        break;
+
+    default:
+        rc = -EINVAL;
+        break;
+    }
+
+ out:
+    spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+    return rc;
+}
+
+int hvm_map_io_range_to_ioreq_server(struct domain *d, ioservid_t id,
+                                     uint32_t type, uint64_t start,
+                                     uint64_t end)
+{
+    struct hvm_ioreq_server *s;
+    struct rangeset *r;
+    int rc;
+
+    if ( start > end )
+        return -EINVAL;
+
+    spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+    s = get_ioreq_server(d, id);
+
+    rc = -ENOENT;
+    if ( !s )
+        goto out;
+
+    rc = -EPERM;
+    if ( s->emulator != current->domain )
+        goto out;
+
+    switch ( type )
+    {
+    case XEN_DMOP_IO_RANGE_PORT:
+    case XEN_DMOP_IO_RANGE_MEMORY:
+    case XEN_DMOP_IO_RANGE_PCI:
+        r = s->range[type];
+        break;
+
+    default:
+        r = NULL;
+        break;
+    }
+
+    rc = -EINVAL;
+    if ( !r )
+        goto out;
+
+    rc = -EEXIST;
+    if ( rangeset_overlaps_range(r, start, end) )
+        goto out;
+
+    rc = rangeset_add_range(r, start, end);
+
+ out:
+    spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+    return rc;
+}
+
+int hvm_unmap_io_range_from_ioreq_server(struct domain *d, ioservid_t id,
+                                         uint32_t type, uint64_t start,
+                                         uint64_t end)
+{
+    struct hvm_ioreq_server *s;
+    struct rangeset *r;
+    int rc;
+
+    if ( start > end )
+        return -EINVAL;
+
+    spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+    s = get_ioreq_server(d, id);
+
+    rc = -ENOENT;
+    if ( !s )
+        goto out;
+
+    rc = -EPERM;
+    if ( s->emulator != current->domain )
+        goto out;
+
+    switch ( type )
+    {
+    case XEN_DMOP_IO_RANGE_PORT:
+    case XEN_DMOP_IO_RANGE_MEMORY:
+    case XEN_DMOP_IO_RANGE_PCI:
+        r = s->range[type];
+        break;
+
+    default:
+        r = NULL;
+        break;
+    }
+
+    rc = -EINVAL;
+    if ( !r )
+        goto out;
+
+    rc = -ENOENT;
+    if ( !rangeset_contains_range(r, start, end) )
+        goto out;
+
+    rc = rangeset_remove_range(r, start, end);
+
+ out:
+    spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+    return rc;
+}
+
+int hvm_set_ioreq_server_state(struct domain *d, ioservid_t id,
+                               bool enabled)
+{
+    struct hvm_ioreq_server *s;
+    int rc;
+
+    spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+    s = get_ioreq_server(d, id);
+
+    rc = -ENOENT;
+    if ( !s )
+        goto out;
+
+    rc = -EPERM;
+    if ( s->emulator != current->domain )
+        goto out;
+
+    domain_pause(d);
+
+    if ( enabled )
+        hvm_ioreq_server_enable(s);
+    else
+        hvm_ioreq_server_disable(s);
+
+    domain_unpause(d);
+
+    rc = 0;
+
+ out:
+    spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
+    return rc;
+}
+
+int hvm_all_ioreq_servers_add_vcpu(struct domain *d, struct vcpu *v)
+{
+    struct hvm_ioreq_server *s;
+    unsigned int id;
+    int rc;
+
+    spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+    FOR_EACH_IOREQ_SERVER(d, id, s)
+    {
+        rc = hvm_ioreq_server_add_vcpu(s, v);
+        if ( rc )
+            goto fail;
+    }
+
+    spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+    return 0;
+
+ fail:
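+    /*
+     * FOR_EACH_IOREQ_SERVER() iterates backwards, so the servers
+     * already handled are those with ids above the failing one.
+     */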
+    while ( ++id != MAX_NR_IOREQ_SERVERS )
+    {
+        s = GET_IOREQ_SERVER(d, id);
+
+        if ( !s )
+            continue;
+
+        hvm_ioreq_server_remove_vcpu(s, v);
+    }
+
+    spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+    return rc;
+}
+
+void hvm_all_ioreq_servers_remove_vcpu(struct domain *d, struct vcpu *v)
+{
+    struct hvm_ioreq_server *s;
+    unsigned int id;
+
+    spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+    FOR_EACH_IOREQ_SERVER(d, id, s)
+        hvm_ioreq_server_remove_vcpu(s, v);
+
+    spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
+}
+
+void hvm_destroy_all_ioreq_servers(struct domain *d)
+{
+    struct hvm_ioreq_server *s;
+    unsigned int id;
+
+    if ( !arch_hvm_ioreq_destroy(d) )
+        return;
+
+    spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+    /* No need to domain_pause() as the domain is being torn down */
+
+    FOR_EACH_IOREQ_SERVER(d, id, s)
+    {
+        hvm_ioreq_server_disable(s);
+
+        /*
+         * It is safe to call hvm_ioreq_server_deinit() prior to
+         * set_ioreq_server() since the target domain is being destroyed.
+         */
+        hvm_ioreq_server_deinit(s);
+        set_ioreq_server(d, id, NULL);
+
+        xfree(s);
+    }
+
+    spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
+}
+
+struct hvm_ioreq_server *hvm_select_ioreq_server(struct domain *d,
+                                                 ioreq_t *p)
+{
+    struct hvm_ioreq_server *s;
+    uint8_t type;
+    uint64_t addr;
+    unsigned int id;
+
+    if ( p->type != IOREQ_TYPE_COPY && p->type != IOREQ_TYPE_PIO )
+        return NULL;
+
+    hvm_get_ioreq_server_range_type(d, p, &type, &addr);
+
+    FOR_EACH_IOREQ_SERVER(d, id, s)
+    {
+        struct rangeset *r;
+
+        if ( !s->enabled )
+            continue;
+
+        r = s->range[type];
+
+        switch ( type )
+        {
+            unsigned long start, end;
+
+        case XEN_DMOP_IO_RANGE_PORT:
+            start = addr;
+            end = start + p->size - 1;
+            if ( rangeset_contains_range(r, start, end) )
+                return s;
+
+            break;
+
+        case XEN_DMOP_IO_RANGE_MEMORY:
+            start = hvm_mmio_first_byte(p);
+            end = hvm_mmio_last_byte(p);
+
+            if ( rangeset_contains_range(r, start, end) )
+                return s;
+
+            break;
+
+        case XEN_DMOP_IO_RANGE_PCI:
+            if ( rangeset_contains_singleton(r, addr >> 32) )
+            {
+                p->type = IOREQ_TYPE_PCI_CONFIG;
+                p->addr = addr;
+                return s;
+            }
+
+            break;
+        }
+    }
+
+    return NULL;
+}
+
+static int hvm_send_buffered_ioreq(struct hvm_ioreq_server *s, ioreq_t *p)
+{
+    struct domain *d = current->domain;
+    struct hvm_ioreq_page *iorp;
+    buffered_iopage_t *pg;
+    buf_ioreq_t bp = { .data = p->data,
+                       .addr = p->addr,
+                       .type = p->type,
+                       .dir = p->dir };
+    /* Timeoffset sends 64b data, but no address. Use two consecutive slots. */
+    int qw = 0;
+
+    /* Ensure buffered_iopage fits in a page */
+    BUILD_BUG_ON(sizeof(buffered_iopage_t) > PAGE_SIZE);
+
+    iorp = &s->bufioreq;
+    pg = iorp->va;
+
+    if ( !pg )
+        return IOREQ_IO_UNHANDLED;
+
+    /*
+     * Return IOREQ_IO_HANDLED for the cases we can't deal with:
+     *  - 'addr' is only a 20-bit field, so we cannot address beyond 1MB
+     *  - we cannot buffer accesses to guest memory buffers, as the guest
+     *    may expect the memory buffer to be synchronously accessed
+     *  - the count field is usually used with data_is_ptr and since we don't
+     *    support data_is_ptr we do not waste space for the count field either
+     */
+    if ( (p->addr > 0xffffful) || p->data_is_ptr || (p->count != 1) )
+        return IOREQ_IO_HANDLED;
+
+    switch ( p->size )
+    {
+    case 1:
+        bp.size = 0;
+        break;
+    case 2:
+        bp.size = 1;
+        break;
+    case 4:
+        bp.size = 2;
+        break;
+    case 8:
+        bp.size = 3;
+        qw = 1;
+        break;
+    default:
+        gdprintk(XENLOG_WARNING, "unexpected ioreq size: %u\n", p->size);
+        return IOREQ_IO_UNHANDLED;
+    }
+
+    spin_lock(&s->bufioreq_lock);
+
+    if ( (pg->ptrs.write_pointer - pg->ptrs.read_pointer) >=
+         (IOREQ_BUFFER_SLOT_NUM - qw) )
+    {
+        /* The queue is full: send the iopacket through the normal path. */
+        spin_unlock(&s->bufioreq_lock);
+        return IOREQ_IO_UNHANDLED;
+    }
+
+    pg->buf_ioreq[pg->ptrs.write_pointer % IOREQ_BUFFER_SLOT_NUM] = bp;
+
+    if ( qw )
+    {
+        bp.data = p->data >> 32;
+        pg->buf_ioreq[(pg->ptrs.write_pointer+1) % IOREQ_BUFFER_SLOT_NUM] = bp;
+    }
+
+    /* Make the ioreq_t visible /before/ write_pointer. */
+    smp_wmb();
+    pg->ptrs.write_pointer += qw ? 2 : 1;
+
+    /* Canonicalize read/write pointers to prevent their overflow. */
+    while ( (s->bufioreq_handling == HVM_IOREQSRV_BUFIOREQ_ATOMIC) &&
+            qw++ < IOREQ_BUFFER_SLOT_NUM &&
+            pg->ptrs.read_pointer >= IOREQ_BUFFER_SLOT_NUM )
+    {
+        union bufioreq_pointers old = pg->ptrs, new;
+        unsigned int n = old.read_pointer / IOREQ_BUFFER_SLOT_NUM;
+
+        new.read_pointer = old.read_pointer - n * IOREQ_BUFFER_SLOT_NUM;
+        new.write_pointer = old.write_pointer - n * IOREQ_BUFFER_SLOT_NUM;
+        cmpxchg(&pg->ptrs.full, old.full, new.full);
+    }
+
+    notify_via_xen_event_channel(d, s->bufioreq_evtchn);
+    spin_unlock(&s->bufioreq_lock);
+
+    return IOREQ_IO_HANDLED;
+}
+
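+/*
+ * Synchronous ioreq flow:
+ *   Xen:      STATE_IOREQ_NONE -> STATE_IOREQ_READY (+ evtchn kick)
+ *   emulator: STATE_IOREQ_READY -> STATE_IOREQ_INPROCESS ->
+ *             STATE_IORESP_READY (+ evtchn kick)
+ *   Xen:      STATE_IORESP_READY -> STATE_IOREQ_NONE
+ *             (in hvm_wait_for_io())
+ */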
+int hvm_send_ioreq(struct hvm_ioreq_server *s, ioreq_t *proto_p,
+                   bool buffered)
+{
+    struct vcpu *curr = current;
+    struct domain *d = curr->domain;
+    struct hvm_ioreq_vcpu *sv;
+
+    ASSERT(s);
+
+    if ( buffered )
+        return hvm_send_buffered_ioreq(s, proto_p);
+
+    if ( unlikely(!vcpu_start_shutdown_deferral(curr)) )
+        return IOREQ_IO_RETRY;
+
+    list_for_each_entry ( sv,
+                          &s->ioreq_vcpu_list,
+                          list_entry )
+    {
+        if ( sv->vcpu == curr )
+        {
+            evtchn_port_t port = sv->ioreq_evtchn;
+            ioreq_t *p = get_ioreq(s, curr);
+
+            if ( unlikely(p->state != STATE_IOREQ_NONE) )
+            {
+                gprintk(XENLOG_ERR, "device model set bad IO state %d\n",
+                        p->state);
+                break;
+            }
+
+            if ( unlikely(p->vp_eport != port) )
+            {
+                gprintk(XENLOG_ERR, "device model set bad event channel %d\n",
+                        p->vp_eport);
+                break;
+            }
+
+            proto_p->state = STATE_IOREQ_NONE;
+            proto_p->vp_eport = port;
+            *p = *proto_p;
+
+            prepare_wait_on_xen_event_channel(port);
+
+            /*
+             * Following happens /after/ blocking and setting up ioreq
+             * contents. prepare_wait_on_xen_event_channel() is an implicit
+             * barrier.
+             */
+            p->state = STATE_IOREQ_READY;
+            notify_via_xen_event_channel(d, port);
+
+            sv->pending = true;
+            return IOREQ_IO_RETRY;
+        }
+    }
+
+    return IOREQ_IO_UNHANDLED;
+}
+
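+/*
+ * Send an ioreq to every enabled server; the return value is the
+ * number of servers that failed to handle it.
+ */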
+unsigned int hvm_broadcast_ioreq(ioreq_t *p, bool buffered)
+{
+    struct domain *d = current->domain;
+    struct hvm_ioreq_server *s;
+    unsigned int id, failed = 0;
+
+    FOR_EACH_IOREQ_SERVER(d, id, s)
+    {
+        if ( !s->enabled )
+            continue;
+
+        if ( hvm_send_ioreq(s, p, buffered) == IOREQ_IO_UNHANDLED )
+            failed++;
+    }
+
+    return failed;
+}
+
+void hvm_ioreq_init(struct domain *d)
+{
+    arch_hvm_ioreq_init(d);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/asm-x86/hvm/ioreq.h b/xen/include/asm-x86/hvm/ioreq.h
index e2588e9..0e871e0 100644
--- a/xen/include/asm-x86/hvm/ioreq.h
+++ b/xen/include/asm-x86/hvm/ioreq.h
@@ -19,41 +19,26 @@ 
 #ifndef __ASM_X86_HVM_IOREQ_H__
 #define __ASM_X86_HVM_IOREQ_H__
 
-bool hvm_io_pending(struct vcpu *v);
-bool handle_hvm_io_completion(struct vcpu *v);
-bool is_ioreq_server_page(struct domain *d, const struct page_info *page);
+#include <asm/hvm/emulate.h>
+#include <asm/hvm/hvm.h>
+#include <asm/hvm/vmx/vmx.h>
+
+void handle_realmode_completion(void);
+
+void hvm_get_ioreq_server_range_type(struct domain *d,
+                                     ioreq_t *p,
+                                     uint8_t *type,
+                                     uint64_t *addr);
 
-int hvm_create_ioreq_server(struct domain *d, int bufioreq_handling,
-                            ioservid_t *id);
-int hvm_destroy_ioreq_server(struct domain *d, ioservid_t id);
-int hvm_get_ioreq_server_info(struct domain *d, ioservid_t id,
-                              unsigned long *ioreq_gfn,
-                              unsigned long *bufioreq_gfn,
-                              evtchn_port_t *bufioreq_port);
-int hvm_get_ioreq_server_frame(struct domain *d, ioservid_t id,
-                               unsigned long idx, mfn_t *mfn);
-int hvm_map_io_range_to_ioreq_server(struct domain *d, ioservid_t id,
-                                     uint32_t type, uint64_t start,
-                                     uint64_t end);
-int hvm_unmap_io_range_from_ioreq_server(struct domain *d, ioservid_t id,
-                                         uint32_t type, uint64_t start,
-                                         uint64_t end);
 int hvm_map_mem_type_to_ioreq_server(struct domain *d, ioservid_t id,
                                      uint32_t type, uint32_t flags);
-int hvm_set_ioreq_server_state(struct domain *d, ioservid_t id,
-                               bool enabled);
-
-int hvm_all_ioreq_servers_add_vcpu(struct domain *d, struct vcpu *v);
-void hvm_all_ioreq_servers_remove_vcpu(struct domain *d, struct vcpu *v);
-void hvm_destroy_all_ioreq_servers(struct domain *d);
 
-struct hvm_ioreq_server *hvm_select_ioreq_server(struct domain *d,
-                                                 ioreq_t *p);
-int hvm_send_ioreq(struct hvm_ioreq_server *s, ioreq_t *proto_p,
-                   bool buffered);
-unsigned int hvm_broadcast_ioreq(ioreq_t *p, bool buffered);
+void arch_hvm_ioreq_init(struct domain *d);
+bool arch_hvm_ioreq_destroy(struct domain *d);
 
-void hvm_ioreq_init(struct domain *d);
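+/*
+ * Map the arch-neutral status values used by the common ioreq code
+ * onto x86's instruction emulation return codes.
+ */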
+#define IOREQ_IO_HANDLED     X86EMUL_OKAY
+#define IOREQ_IO_UNHANDLED   X86EMUL_UNHANDLEABLE
+#define IOREQ_IO_RETRY       X86EMUL_RETRY
 
 #endif /* __ASM_X86_HVM_IOREQ_H__ */
 
diff --git a/xen/include/asm-x86/hvm/vcpu.h b/xen/include/asm-x86/hvm/vcpu.h
index 5ccd075..6c1feda 100644
--- a/xen/include/asm-x86/hvm/vcpu.h
+++ b/xen/include/asm-x86/hvm/vcpu.h
@@ -91,13 +91,6 @@  struct hvm_vcpu_io {
     const struct g2m_ioport *g2m_ioport;
 };
 
-static inline bool hvm_ioreq_needs_completion(const ioreq_t *ioreq)
-{
-    return ioreq->state == STATE_IOREQ_READY &&
-           !ioreq->data_is_ptr &&
-           (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir != IOREQ_WRITE);
-}
-
 struct nestedvcpu {
     bool_t nv_guestmode; /* vcpu in guestmode? */
     void *nv_vvmcx; /* l1 guest virtual VMCB/VMCS */
diff --git a/xen/include/xen/hvm/ioreq.h b/xen/include/xen/hvm/ioreq.h
new file mode 100644
index 0000000..40b7b5e
--- /dev/null
+++ b/xen/include/xen/hvm/ioreq.h
@@ -0,0 +1,89 @@ 
+/*
+ * hvm.h: Hardware virtual machine assist interface definitions.
+ *
+ * Copyright (c) 2016 Citrix Systems Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __HVM_IOREQ_H__
+#define __HVM_IOREQ_H__
+
+#include <xen/sched.h>
+
+#include <asm/hvm/ioreq.h>
+
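+/*
+ * NOTE: GET_IOREQ_SERVER() performs no bounds check on the id; use
+ * get_ioreq_server() unless the id is already known to be valid.
+ */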
+#define GET_IOREQ_SERVER(d, id) \
+    (d)->arch.hvm.ioreq_server.server[id]
+
+static inline struct hvm_ioreq_server *get_ioreq_server(const struct domain *d,
+                                                        unsigned int id)
+{
+    if ( id >= MAX_NR_IOREQ_SERVERS )
+        return NULL;
+
+    return GET_IOREQ_SERVER(d, id);
+}
+
+static inline bool hvm_ioreq_needs_completion(const ioreq_t *ioreq)
+{
+    return ioreq->state == STATE_IOREQ_READY &&
+           !ioreq->data_is_ptr &&
+           (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir != IOREQ_WRITE);
+}
+
+bool hvm_io_pending(struct vcpu *v);
+bool handle_hvm_io_completion(struct vcpu *v);
+bool is_ioreq_server_page(struct domain *d, const struct page_info *page);
+
+int hvm_create_ioreq_server(struct domain *d, int bufioreq_handling,
+                            ioservid_t *id);
+int hvm_destroy_ioreq_server(struct domain *d, ioservid_t id);
+int hvm_get_ioreq_server_info(struct domain *d, ioservid_t id,
+                              unsigned long *ioreq_gfn,
+                              unsigned long *bufioreq_gfn,
+                              evtchn_port_t *bufioreq_port);
+int hvm_get_ioreq_server_frame(struct domain *d, ioservid_t id,
+                               unsigned long idx, mfn_t *mfn);
+int hvm_map_io_range_to_ioreq_server(struct domain *d, ioservid_t id,
+                                     uint32_t type, uint64_t start,
+                                     uint64_t end);
+int hvm_unmap_io_range_from_ioreq_server(struct domain *d, ioservid_t id,
+                                         uint32_t type, uint64_t start,
+                                         uint64_t end);
+int hvm_set_ioreq_server_state(struct domain *d, ioservid_t id,
+                               bool enabled);
+
+int hvm_all_ioreq_servers_add_vcpu(struct domain *d, struct vcpu *v);
+void hvm_all_ioreq_servers_remove_vcpu(struct domain *d, struct vcpu *v);
+void hvm_destroy_all_ioreq_servers(struct domain *d);
+
+struct hvm_ioreq_server *hvm_select_ioreq_server(struct domain *d,
+                                                 ioreq_t *p);
+int hvm_send_ioreq(struct hvm_ioreq_server *s, ioreq_t *proto_p,
+                   bool buffered);
+unsigned int hvm_broadcast_ioreq(ioreq_t *p, bool buffered);
+
+void hvm_ioreq_init(struct domain *d);
+
+#endif /* __HVM_IOREQ_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */