
[RESEND,2/6] memory: introduce AddressSpaceOps and IOMMUObject

Message ID 1509710516-21084-3-git-send-email-yi.l.liu@linux.intel.com (mailing list archive)
State New, archived

Commit Message

Liu, Yi L Nov. 3, 2017, 12:01 p.m. UTC
From: Peter Xu <peterx@redhat.com>

AddressSpaceOps is similar to MemoryRegionOps; it is just a place for
address spaces to store arch-specific hooks.

The first hook I would like to introduce is iommu_get(), which returns
the IOMMUObject behind the AddressSpace.

For systems that have IOMMUs, we will create a special address space
per device, which is different from the system default address space
for it (please refer to pci_device_iommu_address_space()). Normally
when that happens, there will be one specific IOMMU (or, say,
translation unit) standing right behind that new address space.

iommu_get() fetches that unit behind the address space. Here, the unit
is defined as an IOMMUObject, which so far only includes a notifier
list but may be extended in the future. Along with IOMMUObject, a new
iommu notifier mechanism is introduced; it would be used for virt-svm.
IOMMUObject can further have an IOMMUObjectOps, which is similar to
MemoryRegionOps; the difference is that IOMMUObjectOps does not rely
on MemoryRegion.

Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>
---
 hw/core/Makefile.objs   |  1 +
 hw/core/iommu.c         | 58 +++++++++++++++++++++++++++++++++++++++
 include/exec/memory.h   | 22 +++++++++++++++
 include/hw/core/iommu.h | 73 +++++++++++++++++++++++++++++++++++++++++++++++++
 memory.c                | 10 +++++--
 5 files changed, 162 insertions(+), 2 deletions(-)
 create mode 100644 hw/core/iommu.c
 create mode 100644 include/hw/core/iommu.h
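
For illustration only (not part of the patch): a minimal sketch of how a
consumer, say a vfio-like device backend, could use the new hooks. The
MyDevState type, its field names and the callback are hypothetical.

#include "qemu/osdep.h"
#include "exec/memory.h"       /* AddressSpace, address_space_iommu_get() */
#include "hw/core/iommu.h"     /* IOMMUObject, IOMMUNotifier, IOMMUEventData */

typedef struct MyDevState {    /* hypothetical per-device state */
    IOMMUNotifier pasidt_notifier;
} MyDevState;

/* Called whenever the vIOMMU fires IOMMU_EVENT_BIND_PASIDT. */
static void my_dev_pasidt_bind(IOMMUNotifier *n, IOMMUEventData *event_data)
{
    /* event_data->data/length carry the vendor-specific bind payload */
}

static void my_dev_connect_iommu(MyDevState *s, AddressSpace *dma_as)
{
    IOMMUObject *iommu = address_space_iommu_get(dma_as);

    if (iommu) {               /* NULL for generic address spaces */
        iommu_notifier_register(iommu, &s->pasidt_notifier,
                                my_dev_pasidt_bind, IOMMU_EVENT_BIND_PASIDT);
    }
}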

Comments

David Gibson Nov. 13, 2017, 5:56 a.m. UTC | #1
On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote:
> From: Peter Xu <peterx@redhat.com>
> 
> AddressSpaceOps is similar to MemoryRegionOps, it's just for address
> spaces to store arch-specific hooks.
> 
> The first hook I would like to introduce is iommu_get(). Return an
> IOMMUObject behind the AddressSpace.
> 
> For systems that have IOMMUs, we will create a special address
> space per device which is different from system default address
> space for it (please refer to pci_device_iommu_address_space()).
> Normally when that happens, there will be one specific IOMMU (or
> say, translation unit) stands right behind that new address space.
> 
> This iommu_get() fetches that guy behind the address space. Here,
> the guy is defined as IOMMUObject, which includes a notifier_list
> so far, may extend in future. Along with IOMMUObject, a new iommu
> notifier mechanism is introduced. It would be used for virt-svm.
> Also IOMMUObject can further have a IOMMUObjectOps which is similar
> to MemoryRegionOps. The difference is IOMMUObjectOps is not relied
> on MemoryRegion.
> 
> Signed-off-by: Peter Xu <peterx@redhat.com>
> Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>

Hi, sorry I didn't reply to the earlier postings of this after our
discussion in China.  I've been sick several times and very busy.

I still don't feel like there's an adequate explanation of exactly
what an IOMMUObject represents.   Obviously it can represent more than
a single translation window - since that's represented by the
IOMMUMR.  But what exactly do all the MRs - or whatever else - that
are represented by the IOMMUObject have in common, from a functional
point of view.

Even understanding the SVM stuff better than I did, I don't really see
why an AddressSpace is an obvious unit to have an IOMMUObject
associated with it.

> ---
>  hw/core/Makefile.objs   |  1 +
>  hw/core/iommu.c         | 58 +++++++++++++++++++++++++++++++++++++++
>  include/exec/memory.h   | 22 +++++++++++++++
>  include/hw/core/iommu.h | 73 +++++++++++++++++++++++++++++++++++++++++++++++++
>  memory.c                | 10 +++++--
>  5 files changed, 162 insertions(+), 2 deletions(-)
>  create mode 100644 hw/core/iommu.c
>  create mode 100644 include/hw/core/iommu.h
> 
> diff --git a/hw/core/Makefile.objs b/hw/core/Makefile.objs
> index f8d7a4a..d688412 100644
> --- a/hw/core/Makefile.objs
> +++ b/hw/core/Makefile.objs
> @@ -5,6 +5,7 @@ common-obj-y += fw-path-provider.o
>  # irq.o needed for qdev GPIO handling:
>  common-obj-y += irq.o
>  common-obj-y += hotplug.o
> +common-obj-y += iommu.o
>  common-obj-y += nmi.o
>  
>  common-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o
> diff --git a/hw/core/iommu.c b/hw/core/iommu.c
> new file mode 100644
> index 0000000..7c4fcfe
> --- /dev/null
> +++ b/hw/core/iommu.c
> @@ -0,0 +1,58 @@
> +/*
> + * QEMU emulation of IOMMU logic
> + *
> + * Copyright (C) 2017 Red Hat Inc.
> + *
> + * Authors: Peter Xu <peterx@redhat.com>,
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> +
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> +
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "hw/core/iommu.h"
> +
> +void iommu_notifier_register(IOMMUObject *iommu,
> +                             IOMMUNotifier *n,
> +                             IOMMUNotifyFn fn,
> +                             IOMMUEvent event)
> +{
> +    n->event = event;
> +    n->iommu_notify = fn;
> +    QLIST_INSERT_HEAD(&iommu->iommu_notifiers, n, node);
> +    return;
> +}
> +
> +void iommu_notifier_unregister(IOMMUObject *iommu,
> +                               IOMMUNotifier *notifier)
> +{
> +    IOMMUNotifier *cur, *next;
> +
> +    QLIST_FOREACH_SAFE(cur, &iommu->iommu_notifiers, node, next) {
> +        if (cur == notifier) {
> +            QLIST_REMOVE(cur, node);
> +            break;
> +        }
> +    }
> +}
> +
> +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data)
> +{
> +    IOMMUNotifier *cur;
> +
> +    QLIST_FOREACH(cur, &iommu->iommu_notifiers, node) {
> +        if ((cur->event == event_data->event) && cur->iommu_notify) {
> +            cur->iommu_notify(cur, event_data);
> +        }
> +    }
> +}
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index 03595e3..8350973 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -26,6 +26,7 @@
>  #include "qom/object.h"
>  #include "qemu/rcu.h"
>  #include "hw/qdev-core.h"
> +#include "hw/core/iommu.h"
>  
>  #define RAM_ADDR_INVALID (~(ram_addr_t)0)
>  
> @@ -301,6 +302,19 @@ struct MemoryListener {
>  };
>  
>  /**
> + * AddressSpaceOps: callbacks structure for address space specific operations
> + *
> + * @iommu_get: returns an IOMMU object that backs the address space.
> + *             Normally this should be NULL for generic address
> + *             spaces, and it's only used when there is one
> + *             translation unit behind this address space.
> + */
> +struct AddressSpaceOps {
> +    IOMMUObject *(*iommu_get)(AddressSpace *as);
> +};
> +typedef struct AddressSpaceOps AddressSpaceOps;
> +
> +/**
>   * AddressSpace: describes a mapping of addresses to #MemoryRegion objects
>   */
>  struct AddressSpace {
> @@ -316,6 +330,7 @@ struct AddressSpace {
>      struct MemoryRegionIoeventfd *ioeventfds;
>      QTAILQ_HEAD(memory_listeners_as, MemoryListener) listeners;
>      QTAILQ_ENTRY(AddressSpace) address_spaces_link;
> +    AddressSpaceOps as_ops;
>  };
>  
>  FlatView *address_space_to_flatview(AddressSpace *as);
> @@ -1988,6 +2003,13 @@ address_space_write_cached(MemoryRegionCache *cache, hwaddr addr,
>      address_space_write(cache->as, cache->xlat + addr, MEMTXATTRS_UNSPECIFIED, buf, len);
>  }
>  
> +/**
> + * address_space_iommu_get: Get the backend IOMMU for the address space
> + *
> + * @as: the address space to fetch IOMMU from
> + */
> +IOMMUObject *address_space_iommu_get(AddressSpace *as);
> +
>  #endif
>  
>  #endif
> diff --git a/include/hw/core/iommu.h b/include/hw/core/iommu.h
> new file mode 100644
> index 0000000..34387c0
> --- /dev/null
> +++ b/include/hw/core/iommu.h
> @@ -0,0 +1,73 @@
> +/*
> + * QEMU emulation of IOMMU logic
> + *
> + * Copyright (C) 2017 Red Hat Inc.
> + *
> + * Authors: Peter Xu <peterx@redhat.com>,
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> +
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> +
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef HW_CORE_IOMMU_H
> +#define HW_CORE_IOMMU_H
> +
> +#include "qemu/queue.h"
> +
> +enum IOMMUEvent {
> +    IOMMU_EVENT_BIND_PASIDT,
> +};
> +typedef enum IOMMUEvent IOMMUEvent;
> +
> +struct IOMMUEventData {
> +    IOMMUEvent event;
> +    uint64_t length;
> +    void *data;
> +};
> +typedef struct IOMMUEventData IOMMUEventData;
> +
> +typedef struct IOMMUNotifier IOMMUNotifier;
> +
> +typedef void (*IOMMUNotifyFn)(IOMMUNotifier *notifier,
> +                              IOMMUEventData *event_data);
> +
> +struct IOMMUNotifier {
> +    IOMMUNotifyFn iommu_notify;
> +    /*
> +     * What events we are listening to. Let's allow multiple event
> +     * registrations from beginning.
> +     */
> +    IOMMUEvent event;
> +    QLIST_ENTRY(IOMMUNotifier) node;
> +};
> +
> +typedef struct IOMMUObject IOMMUObject;
> +
> +/*
> + * This stands for an IOMMU unit. Any translation device should have
> + * this struct inside its own structure to make sure it can leverage
> + * common IOMMU functionalities.
> + */
> +struct IOMMUObject {
> +    QLIST_HEAD(, IOMMUNotifier) iommu_notifiers;
> +};
> +
> +void iommu_notifier_register(IOMMUObject *iommu,
> +                             IOMMUNotifier *n,
> +                             IOMMUNotifyFn fn,
> +                             IOMMUEvent event);
> +void iommu_notifier_unregister(IOMMUObject *iommu,
> +                               IOMMUNotifier *notifier);
> +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data);
> +
> +#endif
> diff --git a/memory.c b/memory.c
> index 77fb3ef..307f665 100644
> --- a/memory.c
> +++ b/memory.c
> @@ -235,8 +235,6 @@ struct FlatView {
>      MemoryRegion *root;
>  };
>  
> -typedef struct AddressSpaceOps AddressSpaceOps;
> -
>  #define FOR_EACH_FLAT_RANGE(var, view)          \
>      for (var = (view)->ranges; var < (view)->ranges + (view)->nr; ++var)
>  
> @@ -2793,6 +2791,14 @@ static void do_address_space_destroy(AddressSpace *as)
>      memory_region_unref(as->root);
>  }
>  
> +IOMMUObject *address_space_iommu_get(AddressSpace *as)
> +{
> +    if (!as->as_ops.iommu_get) {
> +        return NULL;
> +    }
> +    return as->as_ops.iommu_get(as);
> +}
> +
>  void address_space_destroy(AddressSpace *as)
>  {
>      MemoryRegion *root = as->root;
Peter Xu Nov. 13, 2017, 8:28 a.m. UTC | #2
On Mon, Nov 13, 2017 at 04:56:01PM +1100, David Gibson wrote:
> On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote:
> > From: Peter Xu <peterx@redhat.com>
> > 
> > AddressSpaceOps is similar to MemoryRegionOps, it's just for address
> > spaces to store arch-specific hooks.
> > 
> > The first hook I would like to introduce is iommu_get(). Return an
> > IOMMUObject behind the AddressSpace.
> > 
> > For systems that have IOMMUs, we will create a special address
> > space per device which is different from system default address
> > space for it (please refer to pci_device_iommu_address_space()).
> > Normally when that happens, there will be one specific IOMMU (or
> > say, translation unit) stands right behind that new address space.
> > 
> > This iommu_get() fetches that guy behind the address space. Here,
> > the guy is defined as IOMMUObject, which includes a notifier_list
> > so far, may extend in future. Along with IOMMUObject, a new iommu
> > notifier mechanism is introduced. It would be used for virt-svm.
> > Also IOMMUObject can further have a IOMMUObjectOps which is similar
> > to MemoryRegionOps. The difference is IOMMUObjectOps is not relied
> > on MemoryRegion.
> > 
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>
> 
> Hi, sorry I didn't reply to the earlier postings of this after our
> discussion in China.  I've been sick several times and very busy.
> 
> I still don't feel like there's an adequate explanation of exactly
> what an IOMMUObject represents.   Obviously it can represent more than
> a single translation window - since that's represented by the
> IOMMUMR.  But what exactly do all the MRs - or whatever else - that
> are represented by the IOMMUObject have in common, from a functional
> point of view.
> 
> Even understanding the SVM stuff better than I did, I don't really see
> why an AddressSpace is an obvious unit to have an IOMMUObject
> associated with it.

Here's what I thought about it: IOMMUObject was planned to be the
abstraction of the hardware translation unit, which sits at a higher
level than the translated address spaces.  Say, each PCI device can
have its own translated address space.  However, multiple PCI devices
can share the same translation unit that handles the translation
requests from those different devices.  That's the case for Intel
platforms.  We introduced this IOMMUObject because sometimes we want
to do something with that translation unit rather than with a specific
device, in which case we need a general IOMMU device handle.
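
As a rough sketch of that sharing (hypothetical names; this patch does
not include a vIOMMU-side implementation, and it does not show where
as_ops would be filled in):

typedef struct MyIOMMUState {           /* hypothetical vIOMMU state */
    IOMMUObject iommu_obj;              /* the shared translation unit handle */
    /* ... per-device IOMMU memory regions / address spaces ... */
} MyIOMMUState;

static MyIOMMUState *my_iommu;          /* set when the vIOMMU is realized */

static IOMMUObject *my_iommu_as_get(AddressSpace *as)
{
    /* every device address space behind this unit returns the same handle */
    return &my_iommu->iommu_obj;
}

/* ... and when each device's DMA address space is created:
 *         as->as_ops.iommu_get = my_iommu_as_get;
 */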

IIRC one issue left over from last time's discussion was that there
could be more complicated IOMMU models.  E.g., one device's DMA
requests can be translated in a nested way by two or more IOMMUs, and
the current proposal cannot really handle such a complicated
hierarchy.  I'm just thinking whether we can start from a simple model
(say, we don't allow nested IOMMUs, and actually we don't even allow
multiple IOMMUs so far), then we can evolve from that point in the
future.

Also, I thought there was something you mentioned about this approach
not being correct for Power systems, but I can't really remember the
details...  Anyway, I think this is not the only approach to solve
the problem, and I believe any new better idea would be greatly
welcomed as well. :)

Thanks,

> 
> > ---
> >  hw/core/Makefile.objs   |  1 +
> >  hw/core/iommu.c         | 58 +++++++++++++++++++++++++++++++++++++++
> >  include/exec/memory.h   | 22 +++++++++++++++
> >  include/hw/core/iommu.h | 73 +++++++++++++++++++++++++++++++++++++++++++++++++
> >  memory.c                | 10 +++++--
> >  5 files changed, 162 insertions(+), 2 deletions(-)
> >  create mode 100644 hw/core/iommu.c
> >  create mode 100644 include/hw/core/iommu.h
> > 
> > diff --git a/hw/core/Makefile.objs b/hw/core/Makefile.objs
> > index f8d7a4a..d688412 100644
> > --- a/hw/core/Makefile.objs
> > +++ b/hw/core/Makefile.objs
> > @@ -5,6 +5,7 @@ common-obj-y += fw-path-provider.o
> >  # irq.o needed for qdev GPIO handling:
> >  common-obj-y += irq.o
> >  common-obj-y += hotplug.o
> > +common-obj-y += iommu.o
> >  common-obj-y += nmi.o
> >  
> >  common-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o
> > diff --git a/hw/core/iommu.c b/hw/core/iommu.c
> > new file mode 100644
> > index 0000000..7c4fcfe
> > --- /dev/null
> > +++ b/hw/core/iommu.c
> > @@ -0,0 +1,58 @@
> > +/*
> > + * QEMU emulation of IOMMU logic
> > + *
> > + * Copyright (C) 2017 Red Hat Inc.
> > + *
> > + * Authors: Peter Xu <peterx@redhat.com>,
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; either version 2 of the License, or
> > + * (at your option) any later version.
> > +
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > +
> > + * You should have received a copy of the GNU General Public License along
> > + * with this program; if not, see <http://www.gnu.org/licenses/>.
> > + */
> > +
> > +#include "qemu/osdep.h"
> > +#include "hw/core/iommu.h"
> > +
> > +void iommu_notifier_register(IOMMUObject *iommu,
> > +                             IOMMUNotifier *n,
> > +                             IOMMUNotifyFn fn,
> > +                             IOMMUEvent event)
> > +{
> > +    n->event = event;
> > +    n->iommu_notify = fn;
> > +    QLIST_INSERT_HEAD(&iommu->iommu_notifiers, n, node);
> > +    return;
> > +}
> > +
> > +void iommu_notifier_unregister(IOMMUObject *iommu,
> > +                               IOMMUNotifier *notifier)
> > +{
> > +    IOMMUNotifier *cur, *next;
> > +
> > +    QLIST_FOREACH_SAFE(cur, &iommu->iommu_notifiers, node, next) {
> > +        if (cur == notifier) {
> > +            QLIST_REMOVE(cur, node);
> > +            break;
> > +        }
> > +    }
> > +}
> > +
> > +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data)
> > +{
> > +    IOMMUNotifier *cur;
> > +
> > +    QLIST_FOREACH(cur, &iommu->iommu_notifiers, node) {
> > +        if ((cur->event == event_data->event) && cur->iommu_notify) {
> > +            cur->iommu_notify(cur, event_data);
> > +        }
> > +    }
> > +}
> > diff --git a/include/exec/memory.h b/include/exec/memory.h
> > index 03595e3..8350973 100644
> > --- a/include/exec/memory.h
> > +++ b/include/exec/memory.h
> > @@ -26,6 +26,7 @@
> >  #include "qom/object.h"
> >  #include "qemu/rcu.h"
> >  #include "hw/qdev-core.h"
> > +#include "hw/core/iommu.h"
> >  
> >  #define RAM_ADDR_INVALID (~(ram_addr_t)0)
> >  
> > @@ -301,6 +302,19 @@ struct MemoryListener {
> >  };
> >  
> >  /**
> > + * AddressSpaceOps: callbacks structure for address space specific operations
> > + *
> > + * @iommu_get: returns an IOMMU object that backs the address space.
> > + *             Normally this should be NULL for generic address
> > + *             spaces, and it's only used when there is one
> > + *             translation unit behind this address space.
> > + */
> > +struct AddressSpaceOps {
> > +    IOMMUObject *(*iommu_get)(AddressSpace *as);
> > +};
> > +typedef struct AddressSpaceOps AddressSpaceOps;
> > +
> > +/**
> >   * AddressSpace: describes a mapping of addresses to #MemoryRegion objects
> >   */
> >  struct AddressSpace {
> > @@ -316,6 +330,7 @@ struct AddressSpace {
> >      struct MemoryRegionIoeventfd *ioeventfds;
> >      QTAILQ_HEAD(memory_listeners_as, MemoryListener) listeners;
> >      QTAILQ_ENTRY(AddressSpace) address_spaces_link;
> > +    AddressSpaceOps as_ops;
> >  };
> >  
> >  FlatView *address_space_to_flatview(AddressSpace *as);
> > @@ -1988,6 +2003,13 @@ address_space_write_cached(MemoryRegionCache *cache, hwaddr addr,
> >      address_space_write(cache->as, cache->xlat + addr, MEMTXATTRS_UNSPECIFIED, buf, len);
> >  }
> >  
> > +/**
> > + * address_space_iommu_get: Get the backend IOMMU for the address space
> > + *
> > + * @as: the address space to fetch IOMMU from
> > + */
> > +IOMMUObject *address_space_iommu_get(AddressSpace *as);
> > +
> >  #endif
> >  
> >  #endif
> > diff --git a/include/hw/core/iommu.h b/include/hw/core/iommu.h
> > new file mode 100644
> > index 0000000..34387c0
> > --- /dev/null
> > +++ b/include/hw/core/iommu.h
> > @@ -0,0 +1,73 @@
> > +/*
> > + * QEMU emulation of IOMMU logic
> > + *
> > + * Copyright (C) 2017 Red Hat Inc.
> > + *
> > + * Authors: Peter Xu <peterx@redhat.com>,
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; either version 2 of the License, or
> > + * (at your option) any later version.
> > +
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > +
> > + * You should have received a copy of the GNU General Public License along
> > + * with this program; if not, see <http://www.gnu.org/licenses/>.
> > + */
> > +
> > +#ifndef HW_CORE_IOMMU_H
> > +#define HW_CORE_IOMMU_H
> > +
> > +#include "qemu/queue.h"
> > +
> > +enum IOMMUEvent {
> > +    IOMMU_EVENT_BIND_PASIDT,
> > +};
> > +typedef enum IOMMUEvent IOMMUEvent;
> > +
> > +struct IOMMUEventData {
> > +    IOMMUEvent event;
> > +    uint64_t length;
> > +    void *data;
> > +};
> > +typedef struct IOMMUEventData IOMMUEventData;
> > +
> > +typedef struct IOMMUNotifier IOMMUNotifier;
> > +
> > +typedef void (*IOMMUNotifyFn)(IOMMUNotifier *notifier,
> > +                              IOMMUEventData *event_data);
> > +
> > +struct IOMMUNotifier {
> > +    IOMMUNotifyFn iommu_notify;
> > +    /*
> > +     * What events we are listening to. Let's allow multiple event
> > +     * registrations from beginning.
> > +     */
> > +    IOMMUEvent event;
> > +    QLIST_ENTRY(IOMMUNotifier) node;
> > +};
> > +
> > +typedef struct IOMMUObject IOMMUObject;
> > +
> > +/*
> > + * This stands for an IOMMU unit. Any translation device should have
> > + * this struct inside its own structure to make sure it can leverage
> > + * common IOMMU functionalities.
> > + */
> > +struct IOMMUObject {
> > +    QLIST_HEAD(, IOMMUNotifier) iommu_notifiers;
> > +};
> > +
> > +void iommu_notifier_register(IOMMUObject *iommu,
> > +                             IOMMUNotifier *n,
> > +                             IOMMUNotifyFn fn,
> > +                             IOMMUEvent event);
> > +void iommu_notifier_unregister(IOMMUObject *iommu,
> > +                               IOMMUNotifier *notifier);
> > +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data);
> > +
> > +#endif
> > diff --git a/memory.c b/memory.c
> > index 77fb3ef..307f665 100644
> > --- a/memory.c
> > +++ b/memory.c
> > @@ -235,8 +235,6 @@ struct FlatView {
> >      MemoryRegion *root;
> >  };
> >  
> > -typedef struct AddressSpaceOps AddressSpaceOps;
> > -
> >  #define FOR_EACH_FLAT_RANGE(var, view)          \
> >      for (var = (view)->ranges; var < (view)->ranges + (view)->nr; ++var)
> >  
> > @@ -2793,6 +2791,14 @@ static void do_address_space_destroy(AddressSpace *as)
> >      memory_region_unref(as->root);
> >  }
> >  
> > +IOMMUObject *address_space_iommu_get(AddressSpace *as)
> > +{
> > +    if (!as->as_ops.iommu_get) {
> > +        return NULL;
> > +    }
> > +    return as->as_ops.iommu_get(as);
> > +}
> > +
> >  void address_space_destroy(AddressSpace *as)
> >  {
> >      MemoryRegion *root = as->root;
> 
> -- 
> David Gibson			| I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
> 				| _way_ _around_!
> http://www.ozlabs.org/~dgibson
Liu, Yi L Nov. 13, 2017, 9:58 a.m. UTC | #3
On Mon, Nov 13, 2017 at 04:56:01PM +1100, David Gibson wrote:
> On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote:
> > From: Peter Xu <peterx@redhat.com>
> > 
> > AddressSpaceOps is similar to MemoryRegionOps, it's just for address
> > spaces to store arch-specific hooks.
> > 
> > The first hook I would like to introduce is iommu_get(). Return an
> > IOMMUObject behind the AddressSpace.
> > 
> > For systems that have IOMMUs, we will create a special address
> > space per device which is different from system default address
> > space for it (please refer to pci_device_iommu_address_space()).
> > Normally when that happens, there will be one specific IOMMU (or
> > say, translation unit) stands right behind that new address space.
> > 
> > This iommu_get() fetches that guy behind the address space. Here,
> > the guy is defined as IOMMUObject, which includes a notifier_list
> > so far, may extend in future. Along with IOMMUObject, a new iommu
> > notifier mechanism is introduced. It would be used for virt-svm.
> > Also IOMMUObject can further have a IOMMUObjectOps which is similar
> > to MemoryRegionOps. The difference is IOMMUObjectOps is not relied
> > on MemoryRegion.
> > 
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>
> 
> Hi, sorry I didn't reply to the earlier postings of this after our
> discussion in China.  I've been sick several times and very busy.

Hi David,

Fully understood. I'll try my best to address your questions. Also,
feel free to raise further questions; the more we discuss, the better
the result we will get.

> I still don't feel like there's an adequate explanation of exactly
> what an IOMMUObject represents.   Obviously it can represent more than

IOMMUObject is aimed at representing the iommu itself, e.g. the
iommu-specific operations. One of the key purposes of IOMMUObject is
to introduce a notifier framework that lets an iommu emulator perform
iommu operations other than MAP/UNMAP. As IOMMUs grow more and more
features, MAP/UNMAP is no longer the only operation an iommu emulator
needs to handle, e.g. shared virtual memory. As far as I know, AMD and
ARM also have it; please correct me if I'm wrong. As my cover letter
mentioned, the MR-based notifier framework doesn't work for the newly
added IOMMU operations, like binding the guest pasid table pointer to
the host and propagating the guest's iotlb flushes to the host.

> a single translation window - since that's represented by the
> IOMMUMR.  But what exactly do all the MRs - or whatever else - that
> are represented by the IOMMUObject have in common, from a functional
> point of view.

Let me take virt-SVM as an example. As far as I know, the virt-SVM
implementations of different vendors are similar. The key design is to
have a nested translation (aka. two-stage translation): the guest
maintains the gVA->gPA mapping and the hypervisor builds the gPA->hPA
mapping, similar to the EPT-based virt-MMU solution.

In QEMU, the gPA->hPA mapping is done through the MAP/UNMAP notifier,
and that can keep going. But for the gVA->gPA mapping, only the guest
knows it, so the hypervisor needs to trap the specific guest iommu
operation and pass the gVA->gPA mapping knowledge to the host through
a notifier (a newly added one). In VT-d, this is called binding the
guest pasid table to the host.

Also, only the guest knows about gVA iotlb flushes, so the hypervisor
needs to propagate them to the host. Here, MAP/UNMAP is not suitable
since the gVA iotlb flush doesn't require modifying the host iommu
translation table. By the time the gVA iotlb flush is issued, the
gVA->gPA mapping has already been modified. The host iommu only needs
to reference it when performing address translation. But before the
host iommu performs the translation, it needs to flush the old gVA
cache. In VT-d, this is called 1st level cache flushing.

Neither of the two notifiers (operations) has a direct relationship
with an MR; instead they depend heavily on the virt-iommu itself. If a
virt-iommu exists, then the two notifiers are needed; if not, they are
not.
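
To make the first notifier concrete, here is a rough sketch of how a
vIOMMU emulator could fire the bind event through the proposed API.
The payload layout and the function/field names are hypothetical; the
actual series defines its own bind data.

typedef struct MyPasidTableBind {        /* hypothetical payload layout */
    uint64_t pasidt_ptr;                 /* guest PASID table pointer (GPA) */
    uint32_t size;                       /* table size, for example */
} MyPasidTableBind;

/* Called when the guest programs its PASID table into the vIOMMU. */
static void my_iommu_bind_pasid_table(IOMMUObject *iommu,
                                      uint64_t pasidt_ptr, uint32_t size)
{
    MyPasidTableBind bind = { .pasidt_ptr = pasidt_ptr, .size = size };
    IOMMUEventData event_data = {
        .event  = IOMMU_EVENT_BIND_PASIDT,
        .length = sizeof(bind),
        .data   = &bind,
    };

    /* walks iommu->iommu_notifiers and invokes listeners (e.g. VFIO) */
    iommu_notify(iommu, &event_data);
}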
 
> Even understanding the SVM stuff better than I did, I don't really see
> why an AddressSpace is an obvious unit to have an IOMMUObject
> associated with it.

This will benefit the notifier registration. As in my comments above,
the IOMMUObject is meant to represent the iommu. Associating an
AddressSpace with an IOMMUObject makes it easy to check whether it is
necessary to register the notifiers: if there is no IOMMUObject, no
virt-iommu is exposed to the guest, so there is no need to register
notifiers. For this, I also considered using MemoryRegion.iommu_ops;
e.g. for VT-d, it could be a loop that checks all the subregions and
registers notifiers on each iommu MemoryRegion. Peter mentioned that
may not work for sPAPR, so he proposed associating an AddressSpace
with an IOMMUObject. I think it works and is easier, so I didn't
object to it.

https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg04931.html

+    /* Check if vIOMMU exists */
+    QTAILQ_FOREACH(subregion, &as->root->subregions, subregions_link) {
+        if (memory_region_is_iommu(subregion)) {
+            IOMMUNotifier n1;
+
+            /*
+             FIXME: current iommu notifier is actually designed for
+             IOMMUTLB MAP/UNMAP. However, vIOMMU emulator may need
+             notifiers other than MAP/UNMAP, so it'll be better to
+             split the non-IOMMUTLB notifier from the current IOMMUTLB
+             notifier framewrok.
+             */
+            iommu_notifier_init(&n1, vfio_iommu_bind_pasid_tbl_notify,
+                                IOMMU_NOTIFIER_SVM_PASIDT_BIND,
+                                0,
+                                0);
+            vfio_register_notifier(group->container,
+                                   subregion,
+                                   0,
+                                   &n1);
+        }
+    }
+
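
For comparison, a rough sketch of the AddressSpace-based check this
series proposes instead. The VFIO-side callback name and the place
where the IOMMUNotifier lives are hypothetical:

    IOMMUObject *iommu = address_space_iommu_get(as);

    if (iommu) {
        /* a vIOMMU is exposed behind this device's address space */
        iommu_notifier_register(iommu,
                                n,                       /* VFIO-owned notifier */
                                vfio_pasidt_bind_notify, /* hypothetical callback */
                                IOMMU_EVENT_BIND_PASIDT);
    }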

Thanks,
Yi L

> > ---
> >  hw/core/Makefile.objs   |  1 +
> >  hw/core/iommu.c         | 58 +++++++++++++++++++++++++++++++++++++++
> >  include/exec/memory.h   | 22 +++++++++++++++
> >  include/hw/core/iommu.h | 73 +++++++++++++++++++++++++++++++++++++++++++++++++
> >  memory.c                | 10 +++++--
> >  5 files changed, 162 insertions(+), 2 deletions(-)
> >  create mode 100644 hw/core/iommu.c
> >  create mode 100644 include/hw/core/iommu.h
> > 
> > diff --git a/hw/core/Makefile.objs b/hw/core/Makefile.objs
> > index f8d7a4a..d688412 100644
> > --- a/hw/core/Makefile.objs
> > +++ b/hw/core/Makefile.objs
> > @@ -5,6 +5,7 @@ common-obj-y += fw-path-provider.o
> >  # irq.o needed for qdev GPIO handling:
> >  common-obj-y += irq.o
> >  common-obj-y += hotplug.o
> > +common-obj-y += iommu.o
> >  common-obj-y += nmi.o
> >  
> >  common-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o
> > diff --git a/hw/core/iommu.c b/hw/core/iommu.c
> > new file mode 100644
> > index 0000000..7c4fcfe
> > --- /dev/null
> > +++ b/hw/core/iommu.c
> > @@ -0,0 +1,58 @@
> > +/*
> > + * QEMU emulation of IOMMU logic
> > + *
> > + * Copyright (C) 2017 Red Hat Inc.
> > + *
> > + * Authors: Peter Xu <peterx@redhat.com>,
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; either version 2 of the License, or
> > + * (at your option) any later version.
> > +
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > +
> > + * You should have received a copy of the GNU General Public License along
> > + * with this program; if not, see <http://www.gnu.org/licenses/>.
> > + */
> > +
> > +#include "qemu/osdep.h"
> > +#include "hw/core/iommu.h"
> > +
> > +void iommu_notifier_register(IOMMUObject *iommu,
> > +                             IOMMUNotifier *n,
> > +                             IOMMUNotifyFn fn,
> > +                             IOMMUEvent event)
> > +{
> > +    n->event = event;
> > +    n->iommu_notify = fn;
> > +    QLIST_INSERT_HEAD(&iommu->iommu_notifiers, n, node);
> > +    return;
> > +}
> > +
> > +void iommu_notifier_unregister(IOMMUObject *iommu,
> > +                               IOMMUNotifier *notifier)
> > +{
> > +    IOMMUNotifier *cur, *next;
> > +
> > +    QLIST_FOREACH_SAFE(cur, &iommu->iommu_notifiers, node, next) {
> > +        if (cur == notifier) {
> > +            QLIST_REMOVE(cur, node);
> > +            break;
> > +        }
> > +    }
> > +}
> > +
> > +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data)
> > +{
> > +    IOMMUNotifier *cur;
> > +
> > +    QLIST_FOREACH(cur, &iommu->iommu_notifiers, node) {
> > +        if ((cur->event == event_data->event) && cur->iommu_notify) {
> > +            cur->iommu_notify(cur, event_data);
> > +        }
> > +    }
> > +}
> > diff --git a/include/exec/memory.h b/include/exec/memory.h
> > index 03595e3..8350973 100644
> > --- a/include/exec/memory.h
> > +++ b/include/exec/memory.h
> > @@ -26,6 +26,7 @@
> >  #include "qom/object.h"
> >  #include "qemu/rcu.h"
> >  #include "hw/qdev-core.h"
> > +#include "hw/core/iommu.h"
> >  
> >  #define RAM_ADDR_INVALID (~(ram_addr_t)0)
> >  
> > @@ -301,6 +302,19 @@ struct MemoryListener {
> >  };
> >  
> >  /**
> > + * AddressSpaceOps: callbacks structure for address space specific operations
> > + *
> > + * @iommu_get: returns an IOMMU object that backs the address space.
> > + *             Normally this should be NULL for generic address
> > + *             spaces, and it's only used when there is one
> > + *             translation unit behind this address space.
> > + */
> > +struct AddressSpaceOps {
> > +    IOMMUObject *(*iommu_get)(AddressSpace *as);
> > +};
> > +typedef struct AddressSpaceOps AddressSpaceOps;
> > +
> > +/**
> >   * AddressSpace: describes a mapping of addresses to #MemoryRegion objects
> >   */
> >  struct AddressSpace {
> > @@ -316,6 +330,7 @@ struct AddressSpace {
> >      struct MemoryRegionIoeventfd *ioeventfds;
> >      QTAILQ_HEAD(memory_listeners_as, MemoryListener) listeners;
> >      QTAILQ_ENTRY(AddressSpace) address_spaces_link;
> > +    AddressSpaceOps as_ops;
> >  };
> >  
> >  FlatView *address_space_to_flatview(AddressSpace *as);
> > @@ -1988,6 +2003,13 @@ address_space_write_cached(MemoryRegionCache *cache, hwaddr addr,
> >      address_space_write(cache->as, cache->xlat + addr, MEMTXATTRS_UNSPECIFIED, buf, len);
> >  }
> >  
> > +/**
> > + * address_space_iommu_get: Get the backend IOMMU for the address space
> > + *
> > + * @as: the address space to fetch IOMMU from
> > + */
> > +IOMMUObject *address_space_iommu_get(AddressSpace *as);
> > +
> >  #endif
> >  
> >  #endif
> > diff --git a/include/hw/core/iommu.h b/include/hw/core/iommu.h
> > new file mode 100644
> > index 0000000..34387c0
> > --- /dev/null
> > +++ b/include/hw/core/iommu.h
> > @@ -0,0 +1,73 @@
> > +/*
> > + * QEMU emulation of IOMMU logic
> > + *
> > + * Copyright (C) 2017 Red Hat Inc.
> > + *
> > + * Authors: Peter Xu <peterx@redhat.com>,
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; either version 2 of the License, or
> > + * (at your option) any later version.
> > +
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > +
> > + * You should have received a copy of the GNU General Public License along
> > + * with this program; if not, see <http://www.gnu.org/licenses/>.
> > + */
> > +
> > +#ifndef HW_CORE_IOMMU_H
> > +#define HW_CORE_IOMMU_H
> > +
> > +#include "qemu/queue.h"
> > +
> > +enum IOMMUEvent {
> > +    IOMMU_EVENT_BIND_PASIDT,
> > +};
> > +typedef enum IOMMUEvent IOMMUEvent;
> > +
> > +struct IOMMUEventData {
> > +    IOMMUEvent event;
> > +    uint64_t length;
> > +    void *data;
> > +};
> > +typedef struct IOMMUEventData IOMMUEventData;
> > +
> > +typedef struct IOMMUNotifier IOMMUNotifier;
> > +
> > +typedef void (*IOMMUNotifyFn)(IOMMUNotifier *notifier,
> > +                              IOMMUEventData *event_data);
> > +
> > +struct IOMMUNotifier {
> > +    IOMMUNotifyFn iommu_notify;
> > +    /*
> > +     * What events we are listening to. Let's allow multiple event
> > +     * registrations from beginning.
> > +     */
> > +    IOMMUEvent event;
> > +    QLIST_ENTRY(IOMMUNotifier) node;
> > +};
> > +
> > +typedef struct IOMMUObject IOMMUObject;
> > +
> > +/*
> > + * This stands for an IOMMU unit. Any translation device should have
> > + * this struct inside its own structure to make sure it can leverage
> > + * common IOMMU functionalities.
> > + */
> > +struct IOMMUObject {
> > +    QLIST_HEAD(, IOMMUNotifier) iommu_notifiers;
> > +};
> > +
> > +void iommu_notifier_register(IOMMUObject *iommu,
> > +                             IOMMUNotifier *n,
> > +                             IOMMUNotifyFn fn,
> > +                             IOMMUEvent event);
> > +void iommu_notifier_unregister(IOMMUObject *iommu,
> > +                               IOMMUNotifier *notifier);
> > +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data);
> > +
> > +#endif
> > diff --git a/memory.c b/memory.c
> > index 77fb3ef..307f665 100644
> > --- a/memory.c
> > +++ b/memory.c
> > @@ -235,8 +235,6 @@ struct FlatView {
> >      MemoryRegion *root;
> >  };
> >  
> > -typedef struct AddressSpaceOps AddressSpaceOps;
> > -
> >  #define FOR_EACH_FLAT_RANGE(var, view)          \
> >      for (var = (view)->ranges; var < (view)->ranges + (view)->nr; ++var)
> >  
> > @@ -2793,6 +2791,14 @@ static void do_address_space_destroy(AddressSpace *as)
> >      memory_region_unref(as->root);
> >  }
> >  
> > +IOMMUObject *address_space_iommu_get(AddressSpace *as)
> > +{
> > +    if (!as->as_ops.iommu_get) {
> > +        return NULL;
> > +    }
> > +    return as->as_ops.iommu_get(as);
> > +}
> > +
> >  void address_space_destroy(AddressSpace *as)
> >  {
> >      MemoryRegion *root = as->root;
> 
> -- 
> David Gibson			| I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
> 				| _way_ _around_!
> http://www.ozlabs.org/~dgibson
David Gibson Nov. 14, 2017, 12:59 a.m. UTC | #4
On Mon, Nov 13, 2017 at 04:28:45PM +0800, Peter Xu wrote:
> On Mon, Nov 13, 2017 at 04:56:01PM +1100, David Gibson wrote:
> > On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote:
> > > From: Peter Xu <peterx@redhat.com>
> > > 
> > > AddressSpaceOps is similar to MemoryRegionOps, it's just for address
> > > spaces to store arch-specific hooks.
> > > 
> > > The first hook I would like to introduce is iommu_get(). Return an
> > > IOMMUObject behind the AddressSpace.
> > > 
> > > For systems that have IOMMUs, we will create a special address
> > > space per device which is different from system default address
> > > space for it (please refer to pci_device_iommu_address_space()).
> > > Normally when that happens, there will be one specific IOMMU (or
> > > say, translation unit) stands right behind that new address space.
> > > 
> > > This iommu_get() fetches that guy behind the address space. Here,
> > > the guy is defined as IOMMUObject, which includes a notifier_list
> > > so far, may extend in future. Along with IOMMUObject, a new iommu
> > > notifier mechanism is introduced. It would be used for virt-svm.
> > > Also IOMMUObject can further have a IOMMUObjectOps which is similar
> > > to MemoryRegionOps. The difference is IOMMUObjectOps is not relied
> > > on MemoryRegion.
> > > 
> > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>
> > 
> > Hi, sorry I didn't reply to the earlier postings of this after our
> > discussion in China.  I've been sick several times and very busy.
> > 
> > I still don't feel like there's an adequate explanation of exactly
> > what an IOMMUObject represents.   Obviously it can represent more than
> > a single translation window - since that's represented by the
> > IOMMUMR.  But what exactly do all the MRs - or whatever else - that
> > are represented by the IOMMUObject have in common, from a functional
> > point of view.
> > 
> > Even understanding the SVM stuff better than I did, I don't really see
> > why an AddressSpace is an obvious unit to have an IOMMUObject
> > associated with it.
> 
> Here's what I thought about it: IOMMUObject was planned to be the
> abstraction of the hardware translation unit, which is a higher level
> of the translated address spaces.  Say, for each PCI device, it can
> have its own translated address space.  However for multiple PCI
> devices, they can be sharing the same translation unit that handles
> the translation requests from different devices.  That's the case for
> Intel platforms.  We introduced this IOMMUObject because sometimes we
> want to do something with that translation unit rather than a specific
> device, in which we need a general IOMMU device handle.

Ok, but what does "hardware translation unit" mean in practice?  The
guest neither knows nor cares which bits of IOMMU translation happen
to be included in the same bundle of silicon.  It only cares what the
behaviour is.  What behavioural characteristics does a single
IOMMUObject have?

> IIRC one issue left over during last time's discussion was that there
> could be more complicated IOMMU models. E.g., one device's DMA request
> can be translated nestedly by two or multiple IOMMUs, and current
> proposal cannot really handle that complicated hierachy.  I'm just
> thinking whether we can start from a simple model (say, we don't allow
> nested IOMMUs, and actually we don't even allow multiple IOMMUs so
> far), then we can evolve from that point in the future.
> 
> Also, I thought there were something you mentioned that this approach
> is not correct for Power systems, but I can't really remember the
> details...  Anyways, I think this is not the only approach to solve
> the problem, and I believe any new better idea would be greatly
> welcomed as well. :)

So, some of my initial comments were based on a misunderstanding of
what was proposed here - since discussing this with Yi at LinuxCon
Beijing, I have a better idea of what's going on.

On POWER - or rather the "pseries" platform, which is paravirtualized
- we can have multiple vIOMMU windows (usually 2) for a single virtual
PCI host bridge.  Because of the paravirtualization, the mapping to
hardware is fuzzy, but for passthrough devices they will both be
implemented by the IOMMU built into the physical host bridge.  That
isn't important to the guest, though - all operations happen at the
window level.

The other thing that bothers me here is the way it's attached to an
AddressSpace.  IIUC how SVM works, the whole point is that the device
no longer writes into a specific PCI address space.  Instead, it
writes directly into a process address space.  So it seems to me more
that SVM should operate at the PCI level, and disassociate the device
from the normal PCI address space entirely, rather than hooking up
something via that address space.
Peter Xu Nov. 14, 2017, 3:31 a.m. UTC | #5
On Tue, Nov 14, 2017 at 11:59:34AM +1100, David Gibson wrote:
> On Mon, Nov 13, 2017 at 04:28:45PM +0800, Peter Xu wrote:
> > On Mon, Nov 13, 2017 at 04:56:01PM +1100, David Gibson wrote:
> > > On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote:
> > > > From: Peter Xu <peterx@redhat.com>
> > > > 
> > > > AddressSpaceOps is similar to MemoryRegionOps, it's just for address
> > > > spaces to store arch-specific hooks.
> > > > 
> > > > The first hook I would like to introduce is iommu_get(). Return an
> > > > IOMMUObject behind the AddressSpace.
> > > > 
> > > > For systems that have IOMMUs, we will create a special address
> > > > space per device which is different from system default address
> > > > space for it (please refer to pci_device_iommu_address_space()).
> > > > Normally when that happens, there will be one specific IOMMU (or
> > > > say, translation unit) stands right behind that new address space.
> > > > 
> > > > This iommu_get() fetches that guy behind the address space. Here,
> > > > the guy is defined as IOMMUObject, which includes a notifier_list
> > > > so far, may extend in future. Along with IOMMUObject, a new iommu
> > > > notifier mechanism is introduced. It would be used for virt-svm.
> > > > Also IOMMUObject can further have a IOMMUObjectOps which is similar
> > > > to MemoryRegionOps. The difference is IOMMUObjectOps is not relied
> > > > on MemoryRegion.
> > > > 
> > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > > Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>
> > > 
> > > Hi, sorry I didn't reply to the earlier postings of this after our
> > > discussion in China.  I've been sick several times and very busy.
> > > 
> > > I still don't feel like there's an adequate explanation of exactly
> > > what an IOMMUObject represents.   Obviously it can represent more than
> > > a single translation window - since that's represented by the
> > > IOMMUMR.  But what exactly do all the MRs - or whatever else - that
> > > are represented by the IOMMUObject have in common, from a functional
> > > point of view.
> > > 
> > > Even understanding the SVM stuff better than I did, I don't really see
> > > why an AddressSpace is an obvious unit to have an IOMMUObject
> > > associated with it.
> > 
> > Here's what I thought about it: IOMMUObject was planned to be the
> > abstraction of the hardware translation unit, which is a higher level
> > of the translated address spaces.  Say, for each PCI device, it can
> > have its own translated address space.  However for multiple PCI
> > devices, they can be sharing the same translation unit that handles
> > the translation requests from different devices.  That's the case for
> > Intel platforms.  We introduced this IOMMUObject because sometimes we
> > want to do something with that translation unit rather than a specific
> > device, in which we need a general IOMMU device handle.
> 
> Ok, but what does "hardware translation unit" mean in practice.  The
> guest neither knows nor cares, which bits of IOMMU translation happen
> to be included in the same bundle of silicon.  It only cares what the
> behaviour is.  What behavioural characteristics does a single
> IOMMUObject have?

In VT-d (I believe the same applies to ARM SMMUs), IMHO the special thing is
that the translation windows (and device address spaces in QEMU) are
only talking about second level translations, but not first level,
while virt-svm needs to play with first level translations.  Until
now, AFAIU we don't really have a good interface for first level
translations at all (aka. the process address space).

> 
> > IIRC one issue left over during last time's discussion was that there
> > could be more complicated IOMMU models. E.g., one device's DMA request
> > can be translated nestedly by two or multiple IOMMUs, and current
> > proposal cannot really handle that complicated hierachy.  I'm just
> > thinking whether we can start from a simple model (say, we don't allow
> > nested IOMMUs, and actually we don't even allow multiple IOMMUs so
> > far), then we can evolve from that point in the future.
> > 
> > Also, I thought there were something you mentioned that this approach
> > is not correct for Power systems, but I can't really remember the
> > details...  Anyways, I think this is not the only approach to solve
> > the problem, and I believe any new better idea would be greatly
> > welcomed as well. :)
> 
> So, some of my initial comments were based on a misunderstanding of
> what was proposed here - since discussing this with Yi at LinuxCon
> Beijing, I have a better idea of what's going on.
> 
> On POWER - or rather the "pseries" platform, which is paravirtualized.
> We can have multiple vIOMMU windows (usually 2) for a single virtual
> PCI host bridge.  Because of the paravirtualization, the mapping to
> hardware is fuzzy, but for passthrough devices they will both be
> implemented by the IOMMU built into the physical host bridge.  That
> isn't importat to the guest, though - all operations happen at the
> window level.

Now I know that Power may not have anything like a "translation
unit"; everything is defined as "translation windows" in the guest.
However, the problem still exists for some other platforms.  Say, for
Intel we have the emulated VT-d; for ARM, we have the vSMMU.  AFAIU
these platforms do have their translation units, and ARM at least
should need such an interface (or any better interface, which is
always welcome) for virt-svm to work.  Otherwise I don't know of a way
to configure the first level translation tables.

Meanwhile, IMO this abstraction should not really affect pseries - it
should only be useful for those platforms that would like to use it.
For pseries, we can just ignore the new interface if we don't really
even have such a translation unit.

> 
> The other thing that bothers me here is the way it's attached to an
> AddressSpace.  IIUC how SVM works, the whole point is that the device
> no longer writes into a specific PCI address space.  Instead, it
> writes directly into a process address space.  So it seems to me more
> that SVM should operate at the PCI level, and disassociate the device
> from the normal PCI address space entirely, rather than hooking up
> something via that address space.

IMO the PCI address space is still used.  For virt-svm, the host IOMMU
will be working in nested translation mode, so we should have two
mappings working in parallel:

  1. DPDK process (in guest) address space mapping (GVA -> GPA)
  2. guest direct memory mappings (GPA -> HPA)

And here AFAIU the 2nd mapping works exactly as for general PCI
devices; the only difference is that the 2nd level mapping is always
static, just like when IOMMU passthrough is enabled for that device.

So, IMHO virt-SVM is not really in parallel with the PCI subsystem.
For SVM in the guest, it may be different, since it should only be
using first level translations.  However, to implement virt-SVM, IMHO
we not only need the existing PCI address space translation logic, we
also need an extra way to configure the first level mappings, as
discussed.

Thanks,

> 
> -- 
> David Gibson			| I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
> 				| _way_ _around_!
> http://www.ozlabs.org/~dgibson
Eric Auger Nov. 14, 2017, 8:53 a.m. UTC | #6
Hi Yi L,

On 13/11/2017 10:58, Liu, Yi L wrote:
> On Mon, Nov 13, 2017 at 04:56:01PM +1100, David Gibson wrote:
>> On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote:
>>> From: Peter Xu <peterx@redhat.com>
>>>
>>> AddressSpaceOps is similar to MemoryRegionOps, it's just for address
>>> spaces to store arch-specific hooks.
>>>
>>> The first hook I would like to introduce is iommu_get(). Return an
>>> IOMMUObject behind the AddressSpace.
>>>
>>> For systems that have IOMMUs, we will create a special address
>>> space per device which is different from system default address
>>> space for it (please refer to pci_device_iommu_address_space()).
>>> Normally when that happens, there will be one specific IOMMU (or
>>> say, translation unit) stands right behind that new address space.
>>>
>>> This iommu_get() fetches that guy behind the address space. Here,
>>> the guy is defined as IOMMUObject, which includes a notifier_list
>>> so far, may extend in future. Along with IOMMUObject, a new iommu
>>> notifier mechanism is introduced. It would be used for virt-svm.
>>> Also IOMMUObject can further have a IOMMUObjectOps which is similar
>>> to MemoryRegionOps. The difference is IOMMUObjectOps is not relied
>>> on MemoryRegion.
>>>
>>> Signed-off-by: Peter Xu <peterx@redhat.com>
>>> Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>
>>
>> Hi, sorry I didn't reply to the earlier postings of this after our
>> discussion in China.  I've been sick several times and very busy.
> 
> Hi David,
> 
> Fully understood. I'll try my best to address your question. Also,
> feel free to input further questions, anyhow, the more we discuss the
> better work we done.
> 
>> I still don't feel like there's an adequate explanation of exactly
>> what an IOMMUObject represents.   Obviously it can represent more than
> 
> IOMMUObject is aimed to represent the iommu itself. e.g. the iommu
> specific operations. One of the key purpose of IOMMUObject is to
> introduce a notifier framework to let iommu emulator to be able to
> do iommu operations other than MAP/UNMAP. As IOMMU grows more and
> more feature, MAP/UNMAP is not the only operation iommu emulator needs
> to deal. e.g. shared virtual memory. So far, as I know AMD/ARM also
> has it. may correct me on it. As my cover letter mentioned, MR based
> notifier framework doesn’t work for the newly added IOMMU operations.
> Like bind guest pasid table pointer to host and propagate guest's
> iotlb flush to host.
> 
>> a single translation window - since that's represented by the
>> IOMMUMR.  But what exactly do all the MRs - or whatever else - that
>> are represented by the IOMMUObject have in common, from a functional
>> point of view.
> 
> Let me take virt-SVM as an example. As far as I know, for virt-SVM,
> the implementation of different vendors are similar. The key design
> is to have a nested translation(aka. two stage translation). It is to
> have guest maintain gVA->gPA mapping and hypervisor builds gPA->hPA
> mapping. Similar to EPT based virt-MMU solution.
> 
> In Qemu, gPA->hPA mapping is done through MAP/UNMAP notifier, it can
> keep going. But for gVA->gPA mapping, only guest knows it, so hypervisor
> needs to trap specific guest iommu operation and pass the gVA->gPA
> mapping knowledge to host through a notifier(newly added one). In VT-d,
> it is called bind guest pasid table to host.

What I don't get is that the PASID table is per extended context
entry. I understand the latter is indexed by PCI device/function, and
today MRs are created per PCIe device if I am not wrong. So why can't
we have one new MR notifier dedicated to PASID table passing? My
understanding is that the MR, having a 1-1 correspondence with a PCIe
device and thus with a context, could be the right granularity. Then I
understand the only flags we currently have are NONE, MAP and UNMAP,
but couldn't we add a new one for PASID table passing? So it is not
crystal clear to me why MR notifiers are not adapted to PASID table
passing.
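
To make the suggestion concrete, a rough sketch of what extending the
existing MR notifier flags could look like (IOMMU_NOTIFIER_SVM_PASIDT_BIND
is hypothetical, as in the RFC code quoted below; the MAP/UNMAP values
are the existing ones):

typedef enum {
    IOMMU_NOTIFIER_NONE            = 0,
    IOMMU_NOTIFIER_UNMAP           = 0x1,  /* existing: notify unmaps */
    IOMMU_NOTIFIER_MAP             = 0x2,  /* existing: notify new mappings */
    IOMMU_NOTIFIER_SVM_PASIDT_BIND = 0x4,  /* new: per-IOMMU-MR PASID table bind */
} IOMMUNotifierFlag;

Registration would then presumably keep going through the existing
memory_region_register_iommu_notifier() path, per IOMMU memory region,
rather than through a new per-IOMMUObject notifier list.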
> 
> Also, for the gVA iotlb flushing, only guest knows it. So hypervisor
> needs to propagate it to host. Here, MAP/UNMAP is not suitable since
> this gVA iotlb flush here doesn’t require to modify host iommu
> translation table.
I don't really get this argument. An IOMMUNotifier is just a notifier
that is attached to an IOMMU MR and calls an IOMMUNotify function,
right? Then the role of the function is currently tied to the existing
flags, MAP and UNMAP. This is not linked to an action on the physical
IOMMU, right?

> At the time gVA iotlb flush is issued, the gVA->gPA
> mapping has already modified. Host iommu only needs to reference it when
> performing address translation. But before host iommu perform translation,
> it needs to flush the old gVA cache. In VT-d, it is called 1st level cache
> flushing.

The fact that MR notifiers may not be relevant could be linked to the
nature of the tagging of the structures you want to flush. My
understanding is that if you want to flush by source-id, MR
granularity could be fine. Could you please clarify why you need an
iommu-wide operation in that case?
> 
> Both of the two notifiers(operations) has no direct relationship with MR,
> instead they highly depends on virt-iommu itself.
As described above, this is not obvious to me. Could you please
clarify why source-id granularity (which I understand has a 1-1
correspondence with MR/AS) is not the correct granularity? Of course,
please correct me if my understanding of MR mapping is not correct.

Thanks

Eric

> If virt-iommu exists,
> then the two notfiers are needed, if not, then it's not.
>  
>> Even understanding the SVM stuff better than I did, I don't really see
>> why an AddressSpace is an obvious unit to have an IOMMUObject
>> associated with it.
> 
> This will benefit the notifier registration. As my comments above, the
> IOMMUObject is to represent iommu. Associate an AddressSpace with an
> IOMMUObject makes it easy to check if it is necessary to register the
> notifiers.
> If no IOMMUObject, means no virt-iommu exposed to guest, then
> no need to register notifiers. For this, I also considered to use the
> MemoryRegion.iommu_ops. e.g. for VT-d, it can be a loop to check all the
> subregions and register notfiers if it is an iommu MemoryRegion. Peter
> mentioned it may not work for SPAR. So he proposed associating an
> AddressSpace with an IOMMUObject. I think it wroks and easier, so I
> didn’t object it.
> 
> https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg04931.html
> 
> +    /* Check if vIOMMU exists */
> +    QTAILQ_FOREACH(subregion, &as->root->subregions, subregions_link) {
> +        if (memory_region_is_iommu(subregion)) {
> +            IOMMUNotifier n1;
> +
> +            /*
> +             FIXME: current iommu notifier is actually designed for
> +             IOMMUTLB MAP/UNMAP. However, vIOMMU emulator may need
> +             notifiers other than MAP/UNMAP, so it'll be better to
> +             split the non-IOMMUTLB notifier from the current IOMMUTLB
> +             notifier framewrok.
> +             */
> +            iommu_notifier_init(&n1, vfio_iommu_bind_pasid_tbl_notify,
> +                                IOMMU_NOTIFIER_SVM_PASIDT_BIND,
> +                                0,
> +                                0);
> +            vfio_register_notifier(group->container,
> +                                   subregion,
> +                                   0,
> +                                   &n1);
> +        }
> +    }
> +
> 
> Thanks,
> Yi L
> 
>>> ---
>>>  hw/core/Makefile.objs   |  1 +
>>>  hw/core/iommu.c         | 58 +++++++++++++++++++++++++++++++++++++++
>>>  include/exec/memory.h   | 22 +++++++++++++++
>>>  include/hw/core/iommu.h | 73 +++++++++++++++++++++++++++++++++++++++++++++++++
>>>  memory.c                | 10 +++++--
>>>  5 files changed, 162 insertions(+), 2 deletions(-)
>>>  create mode 100644 hw/core/iommu.c
>>>  create mode 100644 include/hw/core/iommu.h
>>>
>>> diff --git a/hw/core/Makefile.objs b/hw/core/Makefile.objs
>>> index f8d7a4a..d688412 100644
>>> --- a/hw/core/Makefile.objs
>>> +++ b/hw/core/Makefile.objs
>>> @@ -5,6 +5,7 @@ common-obj-y += fw-path-provider.o
>>>  # irq.o needed for qdev GPIO handling:
>>>  common-obj-y += irq.o
>>>  common-obj-y += hotplug.o
>>> +common-obj-y += iommu.o
>>>  common-obj-y += nmi.o
>>>  
>>>  common-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o
>>> diff --git a/hw/core/iommu.c b/hw/core/iommu.c
>>> new file mode 100644
>>> index 0000000..7c4fcfe
>>> --- /dev/null
>>> +++ b/hw/core/iommu.c
>>> @@ -0,0 +1,58 @@
>>> +/*
>>> + * QEMU emulation of IOMMU logic
>>> + *
>>> + * Copyright (C) 2017 Red Hat Inc.
>>> + *
>>> + * Authors: Peter Xu <peterx@redhat.com>,
>>> + *
>>> + * This program is free software; you can redistribute it and/or modify
>>> + * it under the terms of the GNU General Public License as published by
>>> + * the Free Software Foundation; either version 2 of the License, or
>>> + * (at your option) any later version.
>>> +
>>> + * This program is distributed in the hope that it will be useful,
>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>>> + * GNU General Public License for more details.
>>> +
>>> + * You should have received a copy of the GNU General Public License along
>>> + * with this program; if not, see <http://www.gnu.org/licenses/>.
>>> + */
>>> +
>>> +#include "qemu/osdep.h"
>>> +#include "hw/core/iommu.h"
>>> +
>>> +void iommu_notifier_register(IOMMUObject *iommu,
>>> +                             IOMMUNotifier *n,
>>> +                             IOMMUNotifyFn fn,
>>> +                             IOMMUEvent event)
>>> +{
>>> +    n->event = event;
>>> +    n->iommu_notify = fn;
>>> +    QLIST_INSERT_HEAD(&iommu->iommu_notifiers, n, node);
>>> +    return;
>>> +}
>>> +
>>> +void iommu_notifier_unregister(IOMMUObject *iommu,
>>> +                               IOMMUNotifier *notifier)
>>> +{
>>> +    IOMMUNotifier *cur, *next;
>>> +
>>> +    QLIST_FOREACH_SAFE(cur, &iommu->iommu_notifiers, node, next) {
>>> +        if (cur == notifier) {
>>> +            QLIST_REMOVE(cur, node);
>>> +            break;
>>> +        }
>>> +    }
>>> +}
>>> +
>>> +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data)
>>> +{
>>> +    IOMMUNotifier *cur;
>>> +
>>> +    QLIST_FOREACH(cur, &iommu->iommu_notifiers, node) {
>>> +        if ((cur->event == event_data->event) && cur->iommu_notify) {
>>> +            cur->iommu_notify(cur, event_data);
>>> +        }
>>> +    }
>>> +}
>>> diff --git a/include/exec/memory.h b/include/exec/memory.h
>>> index 03595e3..8350973 100644
>>> --- a/include/exec/memory.h
>>> +++ b/include/exec/memory.h
>>> @@ -26,6 +26,7 @@
>>>  #include "qom/object.h"
>>>  #include "qemu/rcu.h"
>>>  #include "hw/qdev-core.h"
>>> +#include "hw/core/iommu.h"
>>>  
>>>  #define RAM_ADDR_INVALID (~(ram_addr_t)0)
>>>  
>>> @@ -301,6 +302,19 @@ struct MemoryListener {
>>>  };
>>>  
>>>  /**
>>> + * AddressSpaceOps: callbacks structure for address space specific operations
>>> + *
>>> + * @iommu_get: returns an IOMMU object that backs the address space.
>>> + *             Normally this should be NULL for generic address
>>> + *             spaces, and it's only used when there is one
>>> + *             translation unit behind this address space.
>>> + */
>>> +struct AddressSpaceOps {
>>> +    IOMMUObject *(*iommu_get)(AddressSpace *as);
>>> +};
>>> +typedef struct AddressSpaceOps AddressSpaceOps;
>>> +
>>> +/**
>>>   * AddressSpace: describes a mapping of addresses to #MemoryRegion objects
>>>   */
>>>  struct AddressSpace {
>>> @@ -316,6 +330,7 @@ struct AddressSpace {
>>>      struct MemoryRegionIoeventfd *ioeventfds;
>>>      QTAILQ_HEAD(memory_listeners_as, MemoryListener) listeners;
>>>      QTAILQ_ENTRY(AddressSpace) address_spaces_link;
>>> +    AddressSpaceOps as_ops;
>>>  };
>>>  
>>>  FlatView *address_space_to_flatview(AddressSpace *as);
>>> @@ -1988,6 +2003,13 @@ address_space_write_cached(MemoryRegionCache *cache, hwaddr addr,
>>>      address_space_write(cache->as, cache->xlat + addr, MEMTXATTRS_UNSPECIFIED, buf, len);
>>>  }
>>>  
>>> +/**
>>> + * address_space_iommu_get: Get the backend IOMMU for the address space
>>> + *
>>> + * @as: the address space to fetch IOMMU from
>>> + */
>>> +IOMMUObject *address_space_iommu_get(AddressSpace *as);
>>> +
>>>  #endif
>>>  
>>>  #endif
>>> diff --git a/include/hw/core/iommu.h b/include/hw/core/iommu.h
>>> new file mode 100644
>>> index 0000000..34387c0
>>> --- /dev/null
>>> +++ b/include/hw/core/iommu.h
>>> @@ -0,0 +1,73 @@
>>> +/*
>>> + * QEMU emulation of IOMMU logic
>>> + *
>>> + * Copyright (C) 2017 Red Hat Inc.
>>> + *
>>> + * Authors: Peter Xu <peterx@redhat.com>,
>>> + *
>>> + * This program is free software; you can redistribute it and/or modify
>>> + * it under the terms of the GNU General Public License as published by
>>> + * the Free Software Foundation; either version 2 of the License, or
>>> + * (at your option) any later version.
>>> +
>>> + * This program is distributed in the hope that it will be useful,
>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>>> + * GNU General Public License for more details.
>>> +
>>> + * You should have received a copy of the GNU General Public License along
>>> + * with this program; if not, see <http://www.gnu.org/licenses/>.
>>> + */
>>> +
>>> +#ifndef HW_CORE_IOMMU_H
>>> +#define HW_CORE_IOMMU_H
>>> +
>>> +#include "qemu/queue.h"
>>> +
>>> +enum IOMMUEvent {
>>> +    IOMMU_EVENT_BIND_PASIDT,
>>> +};
>>> +typedef enum IOMMUEvent IOMMUEvent;
>>> +
>>> +struct IOMMUEventData {
>>> +    IOMMUEvent event;
>>> +    uint64_t length;
>>> +    void *data;
>>> +};
>>> +typedef struct IOMMUEventData IOMMUEventData;
>>> +
>>> +typedef struct IOMMUNotifier IOMMUNotifier;
>>> +
>>> +typedef void (*IOMMUNotifyFn)(IOMMUNotifier *notifier,
>>> +                              IOMMUEventData *event_data);
>>> +
>>> +struct IOMMUNotifier {
>>> +    IOMMUNotifyFn iommu_notify;
>>> +    /*
>>> +     * What events we are listening to. Let's allow multiple event
>>> +     * registrations from beginning.
>>> +     */
>>> +    IOMMUEvent event;
>>> +    QLIST_ENTRY(IOMMUNotifier) node;
>>> +};
>>> +
>>> +typedef struct IOMMUObject IOMMUObject;
>>> +
>>> +/*
>>> + * This stands for an IOMMU unit. Any translation device should have
>>> + * this struct inside its own structure to make sure it can leverage
>>> + * common IOMMU functionalities.
>>> + */
>>> +struct IOMMUObject {
>>> +    QLIST_HEAD(, IOMMUNotifier) iommu_notifiers;
>>> +};
>>> +
>>> +void iommu_notifier_register(IOMMUObject *iommu,
>>> +                             IOMMUNotifier *n,
>>> +                             IOMMUNotifyFn fn,
>>> +                             IOMMUEvent event);
>>> +void iommu_notifier_unregister(IOMMUObject *iommu,
>>> +                               IOMMUNotifier *notifier);
>>> +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data);
>>> +
>>> +#endif
>>> diff --git a/memory.c b/memory.c
>>> index 77fb3ef..307f665 100644
>>> --- a/memory.c
>>> +++ b/memory.c
>>> @@ -235,8 +235,6 @@ struct FlatView {
>>>      MemoryRegion *root;
>>>  };
>>>  
>>> -typedef struct AddressSpaceOps AddressSpaceOps;
>>> -
>>>  #define FOR_EACH_FLAT_RANGE(var, view)          \
>>>      for (var = (view)->ranges; var < (view)->ranges + (view)->nr; ++var)
>>>  
>>> @@ -2793,6 +2791,14 @@ static void do_address_space_destroy(AddressSpace *as)
>>>      memory_region_unref(as->root);
>>>  }
>>>  
>>> +IOMMUObject *address_space_iommu_get(AddressSpace *as)
>>> +{
>>> +    if (!as->as_ops.iommu_get) {
>>> +        return NULL;
>>> +    }
>>> +    return as->as_ops.iommu_get(as);
>>> +}
>>> +
>>>  void address_space_destroy(AddressSpace *as)
>>>  {
>>>      MemoryRegion *root = as->root;
>>
>> -- 
>> David Gibson			| I'll have my music baroque, and my code
>> david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
>> 				| _way_ _around_!
>> http://www.ozlabs.org/~dgibson
> 
> 
>
Eric Auger Nov. 14, 2017, 10:21 a.m. UTC | #7
Hi Yi L,

On 03/11/2017 13:01, Liu, Yi L wrote:
> From: Peter Xu <peterx@redhat.com>
> 
> AddressSpaceOps is similar to MemoryRegionOps, it's just for address
> spaces to store arch-specific hooks.
> 
> The first hook I would like to introduce is iommu_get(). Return an
> IOMMUObject behind the AddressSpace.

David had an objection in the past about this method, saying that
several IOMMUs could translate a single AS?

https://lists.gnu.org/archive/html/qemu-devel/2017-05/msg01610.html

On ARM I think it works in general:
In
https://github.com/torvalds/linux/blob/master/Documentation/devicetree/bindings/pci/pci-iommu.txt,
it is said
"a given PCI device can only master through one IOMMU"
> 
> For systems that have IOMMUs, we will create a special address
> space per device which is different from system default address
> space for it (please refer to pci_device_iommu_address_space()).
> Normally when that happens, there will be one specific IOMMU (or
> say, translation unit) stands right behind that new address space.
standing
> 
> This iommu_get() fetches that guy behind the address space. Here,
> the guy is defined as IOMMUObject, which includes a notifier_list
> so far, may extend in future. Along with IOMMUObject, a new iommu
> notifier mechanism is introduced. It would be used for virt-svm.
> Also IOMMUObject can further have a IOMMUObjectOps which is similar
> to MemoryRegionOps. The difference is IOMMUObjectOps is not relied
relying
> on MemoryRegion.
> 
I think I would split this patch into a first patch introducing the
iommu object and a second one adding the AS ops and address_space_iommu_get().
> Signed-off-by: Peter Xu <peterx@redhat.com>
> Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>
> ---
>  hw/core/Makefile.objs   |  1 +
>  hw/core/iommu.c         | 58 +++++++++++++++++++++++++++++++++++++++
>  include/exec/memory.h   | 22 +++++++++++++++
>  include/hw/core/iommu.h | 73 +++++++++++++++++++++++++++++++++++++++++++++++++
>  memory.c                | 10 +++++--
>  5 files changed, 162 insertions(+), 2 deletions(-)
>  create mode 100644 hw/core/iommu.c
>  create mode 100644 include/hw/core/iommu.h
> 
> diff --git a/hw/core/Makefile.objs b/hw/core/Makefile.objs
> index f8d7a4a..d688412 100644
> --- a/hw/core/Makefile.objs
> +++ b/hw/core/Makefile.objs
> @@ -5,6 +5,7 @@ common-obj-y += fw-path-provider.o
>  # irq.o needed for qdev GPIO handling:
>  common-obj-y += irq.o
>  common-obj-y += hotplug.o
> +common-obj-y += iommu.o
>  common-obj-y += nmi.o
>  
>  common-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o
> diff --git a/hw/core/iommu.c b/hw/core/iommu.c
> new file mode 100644
> index 0000000..7c4fcfe
> --- /dev/null
> +++ b/hw/core/iommu.c
> @@ -0,0 +1,58 @@
> +/*
> + * QEMU emulation of IOMMU logic
Maybe rephrase this, as it does not really explain what the iommu object
exposes as an API.
> + *
> + * Copyright (C) 2017 Red Hat Inc.
> + *
> + * Authors: Peter Xu <peterx@redhat.com>,
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> +
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> +
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "hw/core/iommu.h"

> +                             IOMMUNotifier *n)
> +
> +void iommu_notifier_register(IOMMUObject *iommu,
> +                             IOMMUNotifier *n,
> +                             IOMMUNotifyFn fn,
> +                             IOMMUEvent event)
> +{
> +    n->event = event;
> +    n->iommu_notify = fn;
> +    QLIST_INSERT_HEAD(&iommu->iommu_notifiers, n, node);
> +    return;
> +}
> +
> +void iommu_notifier_unregister(IOMMUObject *iommu,
> +                               IOMMUNotifier *notifier)
> +{
> +    IOMMUNotifier *cur, *next;
> +
> +    QLIST_FOREACH_SAFE(cur, &iommu->iommu_notifiers, node, next) {
> +        if (cur == notifier) {
> +            QLIST_REMOVE(cur, node);
> +            break;
> +        }
> +    }
> +}
> +
> +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data)
> +{
> +    IOMMUNotifier *cur;
> +
> +    QLIST_FOREACH(cur, &iommu->iommu_notifiers, node) {
> +        if ((cur->event == event_data->event) && cur->iommu_notify) {
can cur->iommu_notify be NULL if registered as above?
> +            cur->iommu_notify(cur, event_data);
> +        }
> +    }
> +}
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index 03595e3..8350973 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -26,6 +26,7 @@
>  #include "qom/object.h"
>  #include "qemu/rcu.h"
>  #include "hw/qdev-core.h"
> +#include "hw/core/iommu.h"
>  
>  #define RAM_ADDR_INVALID (~(ram_addr_t)0)
>  
> @@ -301,6 +302,19 @@ struct MemoryListener {
>  };
>  
>  /**
> + * AddressSpaceOps: callbacks structure for address space specific operations
> + *
> + * @iommu_get: returns an IOMMU object that backs the address space.
> + *             Normally this should be NULL for generic address
> + *             spaces, and it's only used when there is one
> + *             translation unit behind this address space.
> + */
> +struct AddressSpaceOps {
> +    IOMMUObject *(*iommu_get)(AddressSpace *as);
> +};
> +typedef struct AddressSpaceOps AddressSpaceOps;
> +
> +/**
>   * AddressSpace: describes a mapping of addresses to #MemoryRegion objects
>   */
>  struct AddressSpace {
> @@ -316,6 +330,7 @@ struct AddressSpace {
>      struct MemoryRegionIoeventfd *ioeventfds;
>      QTAILQ_HEAD(memory_listeners_as, MemoryListener) listeners;
>      QTAILQ_ENTRY(AddressSpace) address_spaces_link;
> +    AddressSpaceOps as_ops;
>  };
>  
>  FlatView *address_space_to_flatview(AddressSpace *as);
> @@ -1988,6 +2003,13 @@ address_space_write_cached(MemoryRegionCache *cache, hwaddr addr,
>      address_space_write(cache->as, cache->xlat + addr, MEMTXATTRS_UNSPECIFIED, buf, len);
>  }
>  
> +/**
> + * address_space_iommu_get: Get the backend IOMMU for the address space
> + *
> + * @as: the address space to fetch IOMMU from
> + */
> +IOMMUObject *address_space_iommu_get(AddressSpace *as);
> +
>  #endif
>  
>  #endif
> diff --git a/include/hw/core/iommu.h b/include/hw/core/iommu.h
> new file mode 100644
> index 0000000..34387c0
> --- /dev/null
> +++ b/include/hw/core/iommu.h
> @@ -0,0 +1,73 @@
> +/*
> + * QEMU emulation of IOMMU logic
> + *
> + * Copyright (C) 2017 Red Hat Inc.
> + *
> + * Authors: Peter Xu <peterx@redhat.com>,
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> +
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> +
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef HW_CORE_IOMMU_H
> +#define HW_CORE_IOMMU_H
> +
> +#include "qemu/queue.h"
> +
> +enum IOMMUEvent {
> +    IOMMU_EVENT_BIND_PASIDT,
> +};
> +typedef enum IOMMUEvent IOMMUEvent;
> +
> +struct IOMMUEventData {
> +    IOMMUEvent event;
/* length and opaque data passed to notifiers */ ?
> +    uint64_t length;
> +    void *data;
> +};
> +typedef struct IOMMUEventData IOMMUEventData;
> +
> +typedef struct IOMMUNotifier IOMMUNotifier;
> +
> +typedef void (*IOMMUNotifyFn)(IOMMUNotifier *notifier,
> +                              IOMMUEventData *event_data);
> +
> +struct IOMMUNotifier {
> +    IOMMUNotifyFn iommu_notify;
> +    /*
> +     * What events we are listening to. Let's allow multiple event
> +     * registrations from beginning.
> +     */
> +    IOMMUEvent event; 
/* the event the notifier is sensitive to ? */
> +    QLIST_ENTRY(IOMMUNotifier) node;
> +};
> +
> +typedef struct IOMMUObject IOMMUObject;
> +
> +/*
> + * This stands for an IOMMU unit. Any translation device should have
> + * this struct inside its own structure to make sure it can leverage
> + * common IOMMU functionalities.
> + */
> +struct IOMMUObject {
> +    QLIST_HEAD(, IOMMUNotifier) iommu_notifiers;
where is the QLIST_INIT supposed to be done?

Thanks

Eric
> +};
> +
> +void iommu_notifier_register(IOMMUObject *iommu,
> +                             IOMMUNotifier *n,
> +                             IOMMUNotifyFn fn,
> +                             IOMMUEvent event);
> +void iommu_notifier_unregister(IOMMUObject *iommu,
> +                               IOMMUNotifier *notifier);
> +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data);
> +
> +#endif
> diff --git a/memory.c b/memory.c
> index 77fb3ef..307f665 100644
> --- a/memory.c
> +++ b/memory.c
> @@ -235,8 +235,6 @@ struct FlatView {
>      MemoryRegion *root;
>  };
>  
> -typedef struct AddressSpaceOps AddressSpaceOps;
> -
>  #define FOR_EACH_FLAT_RANGE(var, view)          \
>      for (var = (view)->ranges; var < (view)->ranges + (view)->nr; ++var)
>  
> @@ -2793,6 +2791,14 @@ static void do_address_space_destroy(AddressSpace *as)
>      memory_region_unref(as->root);
>  }
>  
> +IOMMUObject *address_space_iommu_get(AddressSpace *as)
> +{
> +    if (!as->as_ops.iommu_get) {
> +        return NULL;
> +    }
> +    return as->as_ops.iommu_get(as);
> +}
> +
>  void address_space_destroy(AddressSpace *as)
>  {
>      MemoryRegion *root = as->root;
>
Liu, Yi L Nov. 14, 2017, 1:59 p.m. UTC | #8
On Tue, Nov 14, 2017 at 09:53:07AM +0100, Auger Eric wrote:
Hi Eric,

> Hi Yi L,
> 
> On 13/11/2017 10:58, Liu, Yi L wrote:
> > On Mon, Nov 13, 2017 at 04:56:01PM +1100, David Gibson wrote:
> >> On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote:
> >>> From: Peter Xu <peterx@redhat.com>
> >>>
> >>> AddressSpaceOps is similar to MemoryRegionOps, it's just for address
> >>> spaces to store arch-specific hooks.
> >>>
> >>> The first hook I would like to introduce is iommu_get(). Return an
> >>> IOMMUObject behind the AddressSpace.
> >>>
> >>> For systems that have IOMMUs, we will create a special address
> >>> space per device which is different from system default address
> >>> space for it (please refer to pci_device_iommu_address_space()).
> >>> Normally when that happens, there will be one specific IOMMU (or
> >>> say, translation unit) stands right behind that new address space.
> >>>
> >>> This iommu_get() fetches that guy behind the address space. Here,
> >>> the guy is defined as IOMMUObject, which includes a notifier_list
> >>> so far, may extend in future. Along with IOMMUObject, a new iommu
> >>> notifier mechanism is introduced. It would be used for virt-svm.
> >>> Also IOMMUObject can further have a IOMMUObjectOps which is similar
> >>> to MemoryRegionOps. The difference is IOMMUObjectOps is not relied
> >>> on MemoryRegion.
> >>>
> >>> Signed-off-by: Peter Xu <peterx@redhat.com>
> >>> Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>
> >>
> >> Hi, sorry I didn't reply to the earlier postings of this after our
> >> discussion in China.  I've been sick several times and very busy.
> > 
> > Hi David,
> > 
> > Fully understood. I'll try my best to address your question. Also,
> > feel free to input further questions, anyhow, the more we discuss the
> > better work we done.
> > 
> >> I still don't feel like there's an adequate explanation of exactly
> >> what an IOMMUObject represents.   Obviously it can represent more than
> > 
> > IOMMUObject is aimed to represent the iommu itself. e.g. the iommu
> > specific operations. One of the key purpose of IOMMUObject is to
> > introduce a notifier framework to let iommu emulator to be able to
> > do iommu operations other than MAP/UNMAP. As IOMMU grows more and
> > more feature, MAP/UNMAP is not the only operation iommu emulator needs
> > to deal. e.g. shared virtual memory. So far, as I know AMD/ARM also
> > has it. may correct me on it. As my cover letter mentioned, MR based
> > notifier framework doesn’t work for the newly added IOMMU operations.
> > Like bind guest pasid table pointer to host and propagate guest's
> > iotlb flush to host.
> > 
> >> a single translation window - since that's represented by the
> >> IOMMUMR.  But what exactly do all the MRs - or whatever else - that
> >> are represented by the IOMMUObject have in common, from a functional
> >> point of view.
> > 
> > Let me take virt-SVM as an example. As far as I know, for virt-SVM,
> > the implementation of different vendors are similar. The key design
> > is to have a nested translation(aka. two stage translation). It is to
> > have guest maintain gVA->gPA mapping and hypervisor builds gPA->hPA
> > mapping. Similar to EPT based virt-MMU solution.
> > 
> > In Qemu, gPA->hPA mapping is done through MAP/UNMAP notifier, it can
> > keep going. But for gVA->gPA mapping, only guest knows it, so hypervisor
> > needs to trap specific guest iommu operation and pass the gVA->gPA
> > mapping knowledge to host through a notifier(newly added one). In VT-d,
> > it is called bind guest pasid table to host.
> 
> What I don't get is the PASID table is per extended context entry. I
> understand this latter is indexed by PCI device function. And today MR
> are created per PCIe device if I am not wrong. 

In my understanding, an MR is more related to an AddressSpace; it is not
exactly tagged with a PCIe device.

> So why can't we have 1
> new MR notifier dedicated to PASID table passing? My understanding is
> the MR, having a 1-1 correspondence with a PCIe device and thus a
> context could be of right granularity. Then I understand the only flags

I didn't quite get your point regarding the "granularity" here. Could you
explain a little bit more?

> we currently have are NONE, MAP and UNMAP but couldn't we add a new one
> for PASID TABLE passing? So this is not crystal clear to me why MR
> notifiers are not adapted to PASID table passing.

This was also my initial plan. You can find some detail in the link below.
https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg05295.html

In brief, the new notifier I want to add is not really MR-related; it is
more like a behaviour of the translation unit itself. Also, as my cover
letter mentioned, the MR notifiers would not be registered for VT-d
(virt-svm) in region_add(), so they would have to be registered outside of
region_add() (also pasted below). I think that more or less breaks the
consistency of MR notifiers. Besides, I think the existing MR notifiers are
more related to IOVA address translation (on VT-d it is the 2nd-level
translation; for ARM is it stage 2?), and some existing code relies heavily
on this assumption, e.g. the memory replay introduced by Peter: the notifier
node would affect the replay, and adding a non-MAP/UNMAP notifier would
break that logic. So that is another reason to add a separate framework
instead of just adding a new flag to the existing MR notifier framework.

https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg04931.html

+    /* Check if vIOMMU exists */
+    QTAILQ_FOREACH(subregion, &as->root->subregions, subregions_link) {
+        if (memory_region_is_iommu(subregion)) {
+            IOMMUNotifier n1;
+
+            /*
+             FIXME: current iommu notifier is actually designed for
+             IOMMUTLB MAP/UNMAP. However, vIOMMU emulator may need
+             notifiers other than MAP/UNMAP, so it'll be better to
+             split the non-IOMMUTLB notifier from the current IOMMUTLB
+             notifier framewrok.
+             */
+            iommu_notifier_init(&n1, vfio_iommu_bind_pasid_tbl_notify,
+                                IOMMU_NOTIFIER_SVM_PASIDT_BIND,
+                                0,
+                                0);
+            vfio_register_notifier(group->container,
+                                   subregion,
+                                   0,
+                                   &n1);
+        }
+    }
+


> > 
> > Also, for the gVA iotlb flushing, only guest knows it. So hypervisor
> > needs to propagate it to host. Here, MAP/UNMAP is not suitable since
> > this gVA iotlb flush here doesn’t require to modify host iommu
> > translation table.
> I don't really get this argument. IOMMUNotifier just is a notifier that
> is attached to an IOMMU MR and calls a an IOMMUNotify function, right?

yes, it is.

> Then the role of the function currently is attached to the currently
> existing flags, MAP, UNMAP. This is not linked to an action on the
> physical IOMMU, right?

The MAP/UNMAP notifier finally talks to the physical IOMMU, doesn't it? My
point here is that MAP/UNMAP ultimately talks to the physical IOMMU and
changes the translation page table in memory. However, in the virt-svm case,
the translation page table is the process vaddr page table (the I/O page
table is also used, but we don't need to discuss it since the hypervisor
owns it). The process vaddr page table is owned by the guest, and changes to
that translation page table are made by the guest. So for such a cache, we
only need to flush the cache on the iommu side; there is no need to modify
the translation page table.
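To make it concrete, below is a minimal sketch of how the vIOMMU emulator
could propagate such a guest 1st-level TLB flush through the proposed
framework. Note that IOMMU_EVENT_TLB_INVALIDATE and the tlb_invalidate_info
layout are only assumptions for illustration; this patchset only defines
IOMMU_EVENT_BIND_PASIDT so far.

/* Sketch only: the event name and payload struct below are hypothetical. */
struct tlb_invalidate_info {
    uint32_t pasid;
    uint64_t addr;    /* page-selective invalidation address */
    uint64_t npages;
};

static void vtd_propagate_1st_level_flush(IOMMUObject *iommu,
                                          struct tlb_invalidate_info *info)
{
    IOMMUEventData event_data = {
        .event  = IOMMU_EVENT_TLB_INVALIDATE, /* hypothetical new event */
        .length = sizeof(*info),
        .data   = info, /* no host translation page table change needed */
    };

    /* fan the flush out to whoever registered for this event (e.g. VFIO) */
    iommu_notify(iommu, &event_data);
}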

> 
>  At the time gVA iotlb flush is issued, the gVA->gPA
> > mapping has already modified. Host iommu only needs to reference it when
> > performing address translation. But before host iommu perform translation,
> > it needs to flush the old gVA cache. In VT-d, it is called 1st level cache
> > flushing.
> 
> The fact MR notifiers may not be relevant could be linked to the nature
> of the tagging of the structures you want to flush. My understanding is
> if you want to flush by source-id, MR granularity could be fine. Please
> could you clarify why do you need an iommu wide operation in that case.

The flush is not limited to source-id granularity; it can be page-selective
and other granularities. As I mentioned, it does not require modifying the
translation page table, so it is more like a behaviour of the translation unit.

> > 
> > Both of the two notifiers(operations) has no direct relationship with MR,
> > instead they highly depends on virt-iommu itself.
> As described above this is not obvious to me. Please could you clarify
> why source-id granularity (which I understand has a 1-1 granularity with
> MR/AS is not the correct granularity). Of course, please correct me if
> my understanding of MR mapping is not correct.

It's correct that for a PCIe device, the iova address space (aka the PCI
address space) has roughly a 1-1 relationship with an MR. But for virt-SVM
the address space is not limited to the iova address space; it also includes
process address spaces, and how can such an address space relate to an MR...

Thanks,
Yi L
> 
> Thanks
> 
> Eric
> 
>  If virt-iommu exists,
> > then the two notfiers are needed, if not, then it's not.
> >  
> >> Even understanding the SVM stuff better than I did, I don't really see
> >> why an AddressSpace is an obvious unit to have an IOMMUObject
> >> associated with it.
> > 
> > This will benefit the notifier registration. As my comments above, the
> > IOMMUObject is to represent iommu. Associate an AddressSpace with an
> > IOMMUObject makes it easy to check if it is necessary to register the
> > notifiers.
>  If no IOMMUObject, means no virt-iommu exposed to guest, then
> > no need to register notifiers. For this, I also considered to use the
> > MemoryRegion.iommu_ops. e.g. for VT-d, it can be a loop to check all the
> > subregions and register notfiers if it is an iommu MemoryRegion. Peter
> > mentioned it may not work for SPAR. So he proposed associating an
> > AddressSpace with an IOMMUObject. I think it wroks and easier, so I
> > didn’t object it.
> > 
> > https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg04931.html
> > 
> > +    /* Check if vIOMMU exists */
> > +    QTAILQ_FOREACH(subregion, &as->root->subregions, subregions_link) {
> > +        if (memory_region_is_iommu(subregion)) {
> > +            IOMMUNotifier n1;
> > +
> > +            /*
> > +             FIXME: current iommu notifier is actually designed for
> > +             IOMMUTLB MAP/UNMAP. However, vIOMMU emulator may need
> > +             notifiers other than MAP/UNMAP, so it'll be better to
> > +             split the non-IOMMUTLB notifier from the current IOMMUTLB
> > +             notifier framewrok.
> > +             */
> > +            iommu_notifier_init(&n1, vfio_iommu_bind_pasid_tbl_notify,
> > +                                IOMMU_NOTIFIER_SVM_PASIDT_BIND,
> > +                                0,
> > +                                0);
> > +            vfio_register_notifier(group->container,
> > +                                   subregion,
> > +                                   0,
> > +                                   &n1);
> > +        }
> > +    }
> > +
> > 
> > Thanks,
> > Yi L
> > 
> >>> ---
> >>>  hw/core/Makefile.objs   |  1 +
> >>>  hw/core/iommu.c         | 58 +++++++++++++++++++++++++++++++++++++++
> >>>  include/exec/memory.h   | 22 +++++++++++++++
> >>>  include/hw/core/iommu.h | 73 +++++++++++++++++++++++++++++++++++++++++++++++++
> >>>  memory.c                | 10 +++++--
> >>>  5 files changed, 162 insertions(+), 2 deletions(-)
> >>>  create mode 100644 hw/core/iommu.c
> >>>  create mode 100644 include/hw/core/iommu.h
> >>>
> >>> diff --git a/hw/core/Makefile.objs b/hw/core/Makefile.objs
> >>> index f8d7a4a..d688412 100644
> >>> --- a/hw/core/Makefile.objs
> >>> +++ b/hw/core/Makefile.objs
> >>> @@ -5,6 +5,7 @@ common-obj-y += fw-path-provider.o
> >>>  # irq.o needed for qdev GPIO handling:
> >>>  common-obj-y += irq.o
> >>>  common-obj-y += hotplug.o
> >>> +common-obj-y += iommu.o
> >>>  common-obj-y += nmi.o
> >>>  
> >>>  common-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o
> >>> diff --git a/hw/core/iommu.c b/hw/core/iommu.c
> >>> new file mode 100644
> >>> index 0000000..7c4fcfe
> >>> --- /dev/null
> >>> +++ b/hw/core/iommu.c
> >>> @@ -0,0 +1,58 @@
> >>> +/*
> >>> + * QEMU emulation of IOMMU logic
> >>> + *
> >>> + * Copyright (C) 2017 Red Hat Inc.
> >>> + *
> >>> + * Authors: Peter Xu <peterx@redhat.com>,
> >>> + *
> >>> + * This program is free software; you can redistribute it and/or modify
> >>> + * it under the terms of the GNU General Public License as published by
> >>> + * the Free Software Foundation; either version 2 of the License, or
> >>> + * (at your option) any later version.
> >>> +
> >>> + * This program is distributed in the hope that it will be useful,
> >>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> >>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> >>> + * GNU General Public License for more details.
> >>> +
> >>> + * You should have received a copy of the GNU General Public License along
> >>> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> >>> + */
> >>> +
> >>> +#include "qemu/osdep.h"
> >>> +#include "hw/core/iommu.h"
> >>> +
> >>> +void iommu_notifier_register(IOMMUObject *iommu,
> >>> +                             IOMMUNotifier *n,
> >>> +                             IOMMUNotifyFn fn,
> >>> +                             IOMMUEvent event)
> >>> +{
> >>> +    n->event = event;
> >>> +    n->iommu_notify = fn;
> >>> +    QLIST_INSERT_HEAD(&iommu->iommu_notifiers, n, node);
> >>> +    return;
> >>> +}
> >>> +
> >>> +void iommu_notifier_unregister(IOMMUObject *iommu,
> >>> +                               IOMMUNotifier *notifier)
> >>> +{
> >>> +    IOMMUNotifier *cur, *next;
> >>> +
> >>> +    QLIST_FOREACH_SAFE(cur, &iommu->iommu_notifiers, node, next) {
> >>> +        if (cur == notifier) {
> >>> +            QLIST_REMOVE(cur, node);
> >>> +            break;
> >>> +        }
> >>> +    }
> >>> +}
> >>> +
> >>> +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data)
> >>> +{
> >>> +    IOMMUNotifier *cur;
> >>> +
> >>> +    QLIST_FOREACH(cur, &iommu->iommu_notifiers, node) {
> >>> +        if ((cur->event == event_data->event) && cur->iommu_notify) {
> >>> +            cur->iommu_notify(cur, event_data);
> >>> +        }
> >>> +    }
> >>> +}
> >>> diff --git a/include/exec/memory.h b/include/exec/memory.h
> >>> index 03595e3..8350973 100644
> >>> --- a/include/exec/memory.h
> >>> +++ b/include/exec/memory.h
> >>> @@ -26,6 +26,7 @@
> >>>  #include "qom/object.h"
> >>>  #include "qemu/rcu.h"
> >>>  #include "hw/qdev-core.h"
> >>> +#include "hw/core/iommu.h"
> >>>  
> >>>  #define RAM_ADDR_INVALID (~(ram_addr_t)0)
> >>>  
> >>> @@ -301,6 +302,19 @@ struct MemoryListener {
> >>>  };
> >>>  
> >>>  /**
> >>> + * AddressSpaceOps: callbacks structure for address space specific operations
> >>> + *
> >>> + * @iommu_get: returns an IOMMU object that backs the address space.
> >>> + *             Normally this should be NULL for generic address
> >>> + *             spaces, and it's only used when there is one
> >>> + *             translation unit behind this address space.
> >>> + */
> >>> +struct AddressSpaceOps {
> >>> +    IOMMUObject *(*iommu_get)(AddressSpace *as);
> >>> +};
> >>> +typedef struct AddressSpaceOps AddressSpaceOps;
> >>> +
> >>> +/**
> >>>   * AddressSpace: describes a mapping of addresses to #MemoryRegion objects
> >>>   */
> >>>  struct AddressSpace {
> >>> @@ -316,6 +330,7 @@ struct AddressSpace {
> >>>      struct MemoryRegionIoeventfd *ioeventfds;
> >>>      QTAILQ_HEAD(memory_listeners_as, MemoryListener) listeners;
> >>>      QTAILQ_ENTRY(AddressSpace) address_spaces_link;
> >>> +    AddressSpaceOps as_ops;
> >>>  };
> >>>  
> >>>  FlatView *address_space_to_flatview(AddressSpace *as);
> >>> @@ -1988,6 +2003,13 @@ address_space_write_cached(MemoryRegionCache *cache, hwaddr addr,
> >>>      address_space_write(cache->as, cache->xlat + addr, MEMTXATTRS_UNSPECIFIED, buf, len);
> >>>  }
> >>>  
> >>> +/**
> >>> + * address_space_iommu_get: Get the backend IOMMU for the address space
> >>> + *
> >>> + * @as: the address space to fetch IOMMU from
> >>> + */
> >>> +IOMMUObject *address_space_iommu_get(AddressSpace *as);
> >>> +
> >>>  #endif
> >>>  
> >>>  #endif
> >>> diff --git a/include/hw/core/iommu.h b/include/hw/core/iommu.h
> >>> new file mode 100644
> >>> index 0000000..34387c0
> >>> --- /dev/null
> >>> +++ b/include/hw/core/iommu.h
> >>> @@ -0,0 +1,73 @@
> >>> +/*
> >>> + * QEMU emulation of IOMMU logic
> >>> + *
> >>> + * Copyright (C) 2017 Red Hat Inc.
> >>> + *
> >>> + * Authors: Peter Xu <peterx@redhat.com>,
> >>> + *
> >>> + * This program is free software; you can redistribute it and/or modify
> >>> + * it under the terms of the GNU General Public License as published by
> >>> + * the Free Software Foundation; either version 2 of the License, or
> >>> + * (at your option) any later version.
> >>> +
> >>> + * This program is distributed in the hope that it will be useful,
> >>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> >>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> >>> + * GNU General Public License for more details.
> >>> +
> >>> + * You should have received a copy of the GNU General Public License along
> >>> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> >>> + */
> >>> +
> >>> +#ifndef HW_CORE_IOMMU_H
> >>> +#define HW_CORE_IOMMU_H
> >>> +
> >>> +#include "qemu/queue.h"
> >>> +
> >>> +enum IOMMUEvent {
> >>> +    IOMMU_EVENT_BIND_PASIDT,
> >>> +};
> >>> +typedef enum IOMMUEvent IOMMUEvent;
> >>> +
> >>> +struct IOMMUEventData {
> >>> +    IOMMUEvent event;
> >>> +    uint64_t length;
> >>> +    void *data;
> >>> +};
> >>> +typedef struct IOMMUEventData IOMMUEventData;
> >>> +
> >>> +typedef struct IOMMUNotifier IOMMUNotifier;
> >>> +
> >>> +typedef void (*IOMMUNotifyFn)(IOMMUNotifier *notifier,
> >>> +                              IOMMUEventData *event_data);
> >>> +
> >>> +struct IOMMUNotifier {
> >>> +    IOMMUNotifyFn iommu_notify;
> >>> +    /*
> >>> +     * What events we are listening to. Let's allow multiple event
> >>> +     * registrations from beginning.
> >>> +     */
> >>> +    IOMMUEvent event;
> >>> +    QLIST_ENTRY(IOMMUNotifier) node;
> >>> +};
> >>> +
> >>> +typedef struct IOMMUObject IOMMUObject;
> >>> +
> >>> +/*
> >>> + * This stands for an IOMMU unit. Any translation device should have
> >>> + * this struct inside its own structure to make sure it can leverage
> >>> + * common IOMMU functionalities.
> >>> + */
> >>> +struct IOMMUObject {
> >>> +    QLIST_HEAD(, IOMMUNotifier) iommu_notifiers;
> >>> +};
> >>> +
> >>> +void iommu_notifier_register(IOMMUObject *iommu,
> >>> +                             IOMMUNotifier *n,
> >>> +                             IOMMUNotifyFn fn,
> >>> +                             IOMMUEvent event);
> >>> +void iommu_notifier_unregister(IOMMUObject *iommu,
> >>> +                               IOMMUNotifier *notifier);
> >>> +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data);
> >>> +
> >>> +#endif
> >>> diff --git a/memory.c b/memory.c
> >>> index 77fb3ef..307f665 100644
> >>> --- a/memory.c
> >>> +++ b/memory.c
> >>> @@ -235,8 +235,6 @@ struct FlatView {
> >>>      MemoryRegion *root;
> >>>  };
> >>>  
> >>> -typedef struct AddressSpaceOps AddressSpaceOps;
> >>> -
> >>>  #define FOR_EACH_FLAT_RANGE(var, view)          \
> >>>      for (var = (view)->ranges; var < (view)->ranges + (view)->nr; ++var)
> >>>  
> >>> @@ -2793,6 +2791,14 @@ static void do_address_space_destroy(AddressSpace *as)
> >>>      memory_region_unref(as->root);
> >>>  }
> >>>  
> >>> +IOMMUObject *address_space_iommu_get(AddressSpace *as)
> >>> +{
> >>> +    if (!as->as_ops.iommu_get) {
> >>> +        return NULL;
> >>> +    }
> >>> +    return as->as_ops.iommu_get(as);
> >>> +}
> >>> +
> >>>  void address_space_destroy(AddressSpace *as)
> >>>  {
> >>>      MemoryRegion *root = as->root;
> >>
> >> -- 
> >> David Gibson			| I'll have my music baroque, and my code
> >> david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
> >> 				| _way_ _around_!
> >> http://www.ozlabs.org/~dgibson
> > 
> > 
> > 
>
Liu, Yi L Nov. 14, 2017, 2:20 p.m. UTC | #9
Hi Eric,

On Tue, Nov 14, 2017 at 11:21:59AM +0100, Auger Eric wrote:
> Hi Yi L,
> 
> On 03/11/2017 13:01, Liu, Yi L wrote:
> > From: Peter Xu <peterx@redhat.com>
> > 
> > AddressSpaceOps is similar to MemoryRegionOps, it's just for address
> > spaces to store arch-specific hooks.
> > 
> > The first hook I would like to introduce is iommu_get(). Return an
> > IOMMUObject behind the AddressSpace.
> 
> David had an objection in the past about this method, saying that
> several IOMMUs could translate a single AS?
> 
> https://lists.gnu.org/archive/html/qemu-devel/2017-05/msg01610.html
> 
> On ARM I think it works in general:
> In
> https://github.com/torvalds/linux/blob/master/Documentation/devicetree/bindings/pci/pci-iommu.txt,
> it is said
> "a given PCI device can only master through one IOMMU"
> > 
> > For systems that have IOMMUs, we will create a special address
> > space per device which is different from system default address
> > space for it (please refer to pci_device_iommu_address_space()).
> > Normally when that happens, there will be one specific IOMMU (or
> > say, translation unit) stands right behind that new address space.
> standing
> > 
> > This iommu_get() fetches that guy behind the address space. Here,
> > the guy is defined as IOMMUObject, which includes a notifier_list
> > so far, may extend in future. Along with IOMMUObject, a new iommu
> > notifier mechanism is introduced. It would be used for virt-svm.
> > Also IOMMUObject can further have a IOMMUObjectOps which is similar
> > to MemoryRegionOps. The difference is IOMMUObjectOps is not relied
> relying
> > on MemoryRegion.
> > 
> I think I would split this patch into a 1 first patch introducing the
> iommu object and a second adding the AS ops and address_space_iommu_get()

Good point.

> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>
> > ---
> >  hw/core/Makefile.objs   |  1 +
> >  hw/core/iommu.c         | 58 +++++++++++++++++++++++++++++++++++++++
> >  include/exec/memory.h   | 22 +++++++++++++++
> >  include/hw/core/iommu.h | 73 +++++++++++++++++++++++++++++++++++++++++++++++++
> >  memory.c                | 10 +++++--
> >  5 files changed, 162 insertions(+), 2 deletions(-)
> >  create mode 100644 hw/core/iommu.c
> >  create mode 100644 include/hw/core/iommu.h
> > 
> > diff --git a/hw/core/Makefile.objs b/hw/core/Makefile.objs
> > index f8d7a4a..d688412 100644
> > --- a/hw/core/Makefile.objs
> > +++ b/hw/core/Makefile.objs
> > @@ -5,6 +5,7 @@ common-obj-y += fw-path-provider.o
> >  # irq.o needed for qdev GPIO handling:
> >  common-obj-y += irq.o
> >  common-obj-y += hotplug.o
> > +common-obj-y += iommu.o
> >  common-obj-y += nmi.o
> >  
> >  common-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o
> > diff --git a/hw/core/iommu.c b/hw/core/iommu.c
> > new file mode 100644
> > index 0000000..7c4fcfe
> > --- /dev/null
> > +++ b/hw/core/iommu.c
> > @@ -0,0 +1,58 @@
> > +/*
> > + * QEMU emulation of IOMMU logic
> May be rephrased as it does not really explain what the iommu object
> exposes as an API

yes, it may need rephrasing to make it clearer.

> > + *
> > + * Copyright (C) 2017 Red Hat Inc.
> > + *
> > + * Authors: Peter Xu <peterx@redhat.com>,
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; either version 2 of the License, or
> > + * (at your option) any later version.
> > +
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > +
> > + * You should have received a copy of the GNU General Public License along
> > + * with this program; if not, see <http://www.gnu.org/licenses/>.
> > + */
> > +
> > +#include "qemu/osdep.h"
> > +#include "hw/core/iommu.h"
> 
> > +                             IOMMUNotifier *n)
> > +
> > +void iommu_notifier_register(IOMMUObject *iommu,
> > +                             IOMMUNotifier *n,
> > +                             IOMMUNotifyFn fn,
> > +                             IOMMUEvent event)
> > +{
> > +    n->event = event;
> > +    n->iommu_notify = fn;
> > +    QLIST_INSERT_HEAD(&iommu->iommu_notifiers, n, node);
> > +    return;
> > +}
> > +
> > +void iommu_notifier_unregister(IOMMUObject *iommu,
> > +                               IOMMUNotifier *notifier)
> > +{
> > +    IOMMUNotifier *cur, *next;
> > +
> > +    QLIST_FOREACH_SAFE(cur, &iommu->iommu_notifiers, node, next) {
> > +        if (cur == notifier) {
> > +            QLIST_REMOVE(cur, node);
> > +            break;
> > +        }
> > +    }
> > +}
> > +
> > +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data)
> > +{
> > +    IOMMUNotifier *cur;
> > +
> > +    QLIST_FOREACH(cur, &iommu->iommu_notifiers, node) {
> > +        if ((cur->event == event_data->event) && cur->iommu_notify) {
> can cur->iommu_notify be NULL if registered as above?

cur->event is actually initialized together with an iommu_notify callback in
iommu_notifier_register(). So if cur->event matches the requested event type,
the callback should be non-NULL.
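(If it helps, we could also make that explicit with an assert in
iommu_notifier_register(), something like the sketch below:)

void iommu_notifier_register(IOMMUObject *iommu,
                             IOMMUNotifier *n,
                             IOMMUNotifyFn fn,
                             IOMMUEvent event)
{
    assert(fn); /* a notifier without a callback makes no sense */
    n->event = event;
    n->iommu_notify = fn;
    QLIST_INSERT_HEAD(&iommu->iommu_notifiers, n, node);
}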

> > +            cur->iommu_notify(cur, event_data);
> > +        }
> > +    }
> > +}
> > diff --git a/include/exec/memory.h b/include/exec/memory.h
> > index 03595e3..8350973 100644
> > --- a/include/exec/memory.h
> > +++ b/include/exec/memory.h
> > @@ -26,6 +26,7 @@
> >  #include "qom/object.h"
> >  #include "qemu/rcu.h"
> >  #include "hw/qdev-core.h"
> > +#include "hw/core/iommu.h"
> >  
> >  #define RAM_ADDR_INVALID (~(ram_addr_t)0)
> >  
> > @@ -301,6 +302,19 @@ struct MemoryListener {
> >  };
> >  
> >  /**
> > + * AddressSpaceOps: callbacks structure for address space specific operations
> > + *
> > + * @iommu_get: returns an IOMMU object that backs the address space.
> > + *             Normally this should be NULL for generic address
> > + *             spaces, and it's only used when there is one
> > + *             translation unit behind this address space.
> > + */
> > +struct AddressSpaceOps {
> > +    IOMMUObject *(*iommu_get)(AddressSpace *as);
> > +};
> > +typedef struct AddressSpaceOps AddressSpaceOps;
> > +
> > +/**
> >   * AddressSpace: describes a mapping of addresses to #MemoryRegion objects
> >   */
> >  struct AddressSpace {
> > @@ -316,6 +330,7 @@ struct AddressSpace {
> >      struct MemoryRegionIoeventfd *ioeventfds;
> >      QTAILQ_HEAD(memory_listeners_as, MemoryListener) listeners;
> >      QTAILQ_ENTRY(AddressSpace) address_spaces_link;
> > +    AddressSpaceOps as_ops;
> >  };
> >  
> >  FlatView *address_space_to_flatview(AddressSpace *as);
> > @@ -1988,6 +2003,13 @@ address_space_write_cached(MemoryRegionCache *cache, hwaddr addr,
> >      address_space_write(cache->as, cache->xlat + addr, MEMTXATTRS_UNSPECIFIED, buf, len);
> >  }
> >  
> > +/**
> > + * address_space_iommu_get: Get the backend IOMMU for the address space
> > + *
> > + * @as: the address space to fetch IOMMU from
> > + */
> > +IOMMUObject *address_space_iommu_get(AddressSpace *as);
> > +
> >  #endif
> >  
> >  #endif
> > diff --git a/include/hw/core/iommu.h b/include/hw/core/iommu.h
> > new file mode 100644
> > index 0000000..34387c0
> > --- /dev/null
> > +++ b/include/hw/core/iommu.h
> > @@ -0,0 +1,73 @@
> > +/*
> > + * QEMU emulation of IOMMU logic
> > + *
> > + * Copyright (C) 2017 Red Hat Inc.
> > + *
> > + * Authors: Peter Xu <peterx@redhat.com>,
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; either version 2 of the License, or
> > + * (at your option) any later version.
> > +
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > +
> > + * You should have received a copy of the GNU General Public License along
> > + * with this program; if not, see <http://www.gnu.org/licenses/>.
> > + */
> > +
> > +#ifndef HW_CORE_IOMMU_H
> > +#define HW_CORE_IOMMU_H
> > +
> > +#include "qemu/queue.h"
> > +
> > +enum IOMMUEvent {
> > +    IOMMU_EVENT_BIND_PASIDT,
> > +};
> > +typedef enum IOMMUEvent IOMMUEvent;
> > +
> > +struct IOMMUEventData {
> > +    IOMMUEvent event;
> /* length and opaque data passed to notifiers */ ?

yes, it is. But the "void *data" would be replaced with a well-defined
structure; there is no plan to keep it opaque. Once this patchset is
accepted, a later patchset would define it as:

struct IOMMUEventData {
    IOMMUEvent event;
    uint64_t length;
    union {
        struct pasid_table_config pasidt_info;
        struct tlb_invalidate_info tlb_info;
    };
};
typedef struct IOMMUEventData IOMMUEventData;

This is in a later patchset. For now, we want to show the idea of
introducing this new notifier framework.
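For example, a consumer such as the VFIO notifier could then pick the
payload based on the event, roughly like below (only a sketch against the
future definition above; pasid_table_config is the yet-to-be-posted struct):

/* Sketch: how a registered notifier might consume the typed payload. */
static void vfio_iommu_event_notify(IOMMUNotifier *n,
                                    IOMMUEventData *event_data)
{
    switch (event_data->event) {
    case IOMMU_EVENT_BIND_PASIDT:
        /* hand event_data->pasidt_info down to the host IOMMU via VFIO */
        break;
    default:
        break;
    }
}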

> > +    uint64_t length;
> > +    void *data;
> > +};
> > +typedef struct IOMMUEventData IOMMUEventData;
> > +
> > +typedef struct IOMMUNotifier IOMMUNotifier;
> > +
> > +typedef void (*IOMMUNotifyFn)(IOMMUNotifier *notifier,
> > +                              IOMMUEventData *event_data);
> > +
> > +struct IOMMUNotifier {
> > +    IOMMUNotifyFn iommu_notify;
> > +    /*
> > +     * What events we are listening to. Let's allow multiple event
> > +     * registrations from beginning.
> > +     */
> > +    IOMMUEvent event; 
> /* the event the notifier is sensitive to ? */

events (aka operations) like binding the PASID table, flushing the 1st-level TLB, etc.

> > +    QLIST_ENTRY(IOMMUNotifier) node;
> > +};
> > +
> > +typedef struct IOMMUObject IOMMUObject;
> > +
> > +/*
> > + * This stands for an IOMMU unit. Any translation device should have
> > + * this struct inside its own structure to make sure it can leverage
> > + * common IOMMU functionalities.
> > + */
> > +struct IOMMUObject {
> > +    QLIST_HEAD(, IOMMUNotifier) iommu_notifiers;
> where is the QLIST_INIT supposed to be done?

yeah, I need to add it accordingly.
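Probably something as simple as the sketch below (iommu_object_init() is
just a name made up here; this patch does not add an init helper yet):

/* Sketch: an init helper the vIOMMU emulators would call when they
 * embed an IOMMUObject in their own state. */
void iommu_object_init(IOMMUObject *iommu)
{
    QLIST_INIT(&iommu->iommu_notifiers);
}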

Thanks,
Yi L

> Thanks
> 
> Eric
> > +};
> > +
> > +void iommu_notifier_register(IOMMUObject *iommu,
> > +                             IOMMUNotifier *n,
> > +                             IOMMUNotifyFn fn,
> > +                             IOMMUEvent event);
> > +void iommu_notifier_unregister(IOMMUObject *iommu,
> > +                               IOMMUNotifier *notifier);
> > +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data);
> > +
> > +#endif
> > diff --git a/memory.c b/memory.c
> > index 77fb3ef..307f665 100644
> > --- a/memory.c
> > +++ b/memory.c
> > @@ -235,8 +235,6 @@ struct FlatView {
> >      MemoryRegion *root;
> >  };
> >  
> > -typedef struct AddressSpaceOps AddressSpaceOps;
> > -
> >  #define FOR_EACH_FLAT_RANGE(var, view)          \
> >      for (var = (view)->ranges; var < (view)->ranges + (view)->nr; ++var)
> >  
> > @@ -2793,6 +2791,14 @@ static void do_address_space_destroy(AddressSpace *as)
> >      memory_region_unref(as->root);
> >  }
> >  
> > +IOMMUObject *address_space_iommu_get(AddressSpace *as)
> > +{
> > +    if (!as->as_ops.iommu_get) {
> > +        return NULL;
> > +    }
> > +    return as->as_ops.iommu_get(as);
> > +}
> > +
> >  void address_space_destroy(AddressSpace *as)
> >  {
> >      MemoryRegion *root = as->root;
> > 
>
Eric Auger Nov. 14, 2017, 9:52 p.m. UTC | #10
Hi Yi L,

On 14/11/2017 14:59, Liu, Yi L wrote:
> On Tue, Nov 14, 2017 at 09:53:07AM +0100, Auger Eric wrote:
> Hi Eric,
> 
>> Hi Yi L,
>>
>> On 13/11/2017 10:58, Liu, Yi L wrote:
>>> On Mon, Nov 13, 2017 at 04:56:01PM +1100, David Gibson wrote:
>>>> On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote:
>>>>> From: Peter Xu <peterx@redhat.com>
>>>>>
>>>>> AddressSpaceOps is similar to MemoryRegionOps, it's just for address
>>>>> spaces to store arch-specific hooks.
>>>>>
>>>>> The first hook I would like to introduce is iommu_get(). Return an
>>>>> IOMMUObject behind the AddressSpace.
>>>>>
>>>>> For systems that have IOMMUs, we will create a special address
>>>>> space per device which is different from system default address
>>>>> space for it (please refer to pci_device_iommu_address_space()).
>>>>> Normally when that happens, there will be one specific IOMMU (or
>>>>> say, translation unit) stands right behind that new address space.
>>>>>
>>>>> This iommu_get() fetches that guy behind the address space. Here,
>>>>> the guy is defined as IOMMUObject, which includes a notifier_list
>>>>> so far, may extend in future. Along with IOMMUObject, a new iommu
>>>>> notifier mechanism is introduced. It would be used for virt-svm.
>>>>> Also IOMMUObject can further have a IOMMUObjectOps which is similar
>>>>> to MemoryRegionOps. The difference is IOMMUObjectOps is not relied
>>>>> on MemoryRegion.
>>>>>
>>>>> Signed-off-by: Peter Xu <peterx@redhat.com>
>>>>> Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>
>>>>
>>>> Hi, sorry I didn't reply to the earlier postings of this after our
>>>> discussion in China.  I've been sick several times and very busy.
>>>
>>> Hi David,
>>>
>>> Fully understood. I'll try my best to address your question. Also,
>>> feel free to input further questions, anyhow, the more we discuss the
>>> better work we done.
>>>
>>>> I still don't feel like there's an adequate explanation of exactly
>>>> what an IOMMUObject represents.   Obviously it can represent more than
>>>
>>> IOMMUObject is aimed to represent the iommu itself. e.g. the iommu
>>> specific operations. One of the key purpose of IOMMUObject is to
>>> introduce a notifier framework to let iommu emulator to be able to
>>> do iommu operations other than MAP/UNMAP. As IOMMU grows more and
>>> more feature, MAP/UNMAP is not the only operation iommu emulator needs
>>> to deal. e.g. shared virtual memory. So far, as I know AMD/ARM also
>>> has it. may correct me on it. As my cover letter mentioned, MR based
>>> notifier framework doesn’t work for the newly added IOMMU operations.
>>> Like bind guest pasid table pointer to host and propagate guest's
>>> iotlb flush to host.
>>>
>>>> a single translation window - since that's represented by the
>>>> IOMMUMR.  But what exactly do all the MRs - or whatever else - that
>>>> are represented by the IOMMUObject have in common, from a functional
>>>> point of view.
>>>
>>> Let me take virt-SVM as an example. As far as I know, for virt-SVM,
>>> the implementation of different vendors are similar. The key design
>>> is to have a nested translation(aka. two stage translation). It is to
>>> have guest maintain gVA->gPA mapping and hypervisor builds gPA->hPA
>>> mapping. Similar to EPT based virt-MMU solution.
>>>
>>> In Qemu, gPA->hPA mapping is done through MAP/UNMAP notifier, it can
>>> keep going. But for gVA->gPA mapping, only guest knows it, so hypervisor
>>> needs to trap specific guest iommu operation and pass the gVA->gPA
>>> mapping knowledge to host through a notifier(newly added one). In VT-d,
>>> it is called bind guest pasid table to host.
>>
>> What I don't get is the PASID table is per extended context entry. I
>> understand this latter is indexed by PCI device function. And today MR
>> are created per PCIe device if I am not wrong. 
> 
> In my understanding, MR is more related to AddressSpace not exactly tagged
> with PCIe device.
I meant, in the current intel_iommu code, vtd_find_add_as() creates 1
IOMMU MR and 1 AS per PCIe device, right?
> 
>> So why can't we have 1
>> new MR notifier dedicated to PASID table passing? My understanding is
>> the MR, having a 1-1 correspondence with a PCIe device and thus a
>> context could be of right granularity. Then I understand the only flags
> 
> I didn't quite get your point regards to the "granlarity" here. May talk
> a little bit more here?
The PASID table is per device (contained by the extended context, which is
dev/fn indexed). The "record_device" notifier is also attached to a
specific PCIe device. So we can't really say they have an iommu-wide
scope (PCIe device granularity would fit). However I understand from the
explanation below that the TLB invalidate notifier is not especially tied
to a given source-id, as we are going to invalidate by PASID/page.

I think the main justification for introducing this new framework is
that PT is set along with SVM, and in this case the IOMMU MR notifiers
are not registered since the IOMMU MR is disabled.

So the new notifiers do not fit nicely in the current framework.

> 
>> we currently have are NONE, MAP and UNMAP but couldn't we add a new one
>> for PASID TABLE passing? So this is not crystal clear to me why MR
>> notifiers are not adapted to PASID table passing.
> 
> This is also my initial plan. You may get some detail in the link below.
> https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg05295.html
> 
> In brief, the new notifier I want to add is not really MR related and
> just more like a behaviour of a translation unit. Also, as my cover letter
> mentioned that the MR notifiers would not be registered for VT-d(virt-svm)
> in region_add(). Then it requires to register MR notifiers out of the
> region_add().

Yes, to me this is the main issue: IOMMU MRs are not registered in PT
mode. The commit message of "[RFC PATCH 06/20] VFIO: add new notifier for
binding PASID table" helped introduce me to the problem, although I did
not quite understand what you call "vtd_address_space".

 Also paste below. I think it more or less breaks the
> consistency of MR notifiers. Also, I think existing MR notifiers are more
> related IOVA address translation(on VT-d it is 2nd level translation, for ARM
> is it stage 2?),
Well, level 1 input address is IOVA as well.

On ARM, IOVA -> IPA*=GPA is stage1 and GPA -> HPA is stage2. So I would
rather say that for ARM, MR relates to stage1 which is the one emulated
by vSMMU along with VFIO.
*IPA = intermediate physical address.

 and there is some existing codes highly rely on this
> assumption. e.g. the memory_replay introduced by Peter, the notifier node
> would affect the replay. If adding a non MAP/UNMAP notifier, it would break
> the logic. So it's also a reason to add a separate framework instead of
> just adding a new flag to the existing MR notifier framework.


> 
> https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg04931.html
> 
> +    /* Check if vIOMMU exists */
> +    QTAILQ_FOREACH(subregion, &as->root->subregions, subregions_link) {
> +        if (memory_region_is_iommu(subregion)) {
> +            IOMMUNotifier n1;
> +
> +            /*
> +             FIXME: current iommu notifier is actually designed for
> +             IOMMUTLB MAP/UNMAP. However, vIOMMU emulator may need
> +             notifiers other than MAP/UNMAP, so it'll be better to
> +             split the non-IOMMUTLB notifier from the current IOMMUTLB
> +             notifier framewrok.
> +             */
> +            iommu_notifier_init(&n1, vfio_iommu_bind_pasid_tbl_notify,
> +                                IOMMU_NOTIFIER_SVM_PASIDT_BIND,
> +                                0,
> +                                0);
> +            vfio_register_notifier(group->container,
> +                                   subregion,
> +                                   0,
> +                                   &n1);
> +        }
> +    }
> +
> 
> 
>>>
>>> Also, for the gVA iotlb flushing, only guest knows it. So hypervisor
>>> needs to propagate it to host. Here, MAP/UNMAP is not suitable since
>>> this gVA iotlb flush here doesn’t require to modify host iommu
>>> translation table.
>> I don't really get this argument. IOMMUNotifier just is a notifier that
>> is attached to an IOMMU MR and calls a an IOMMUNotify function, right?
> 
> yes, it is.
> 
>> Then the role of the function currently is attached to the currently
>> existing flags, MAP, UNMAP. This is not linked to an action on the
>> physical IOMMU, right?
> 
> The MAP/UNMAP notifier finally talks to physical IOMMU. is it? My point
> here is MAP/UNMAP finally would talk to physical IOMMU change the translation
> page table in memory.
Yes, it programs the pIOMMU with the combined translation stages.
 However, for virt-svm case, the translation page
> table is the process vaddr page table(though the I/O page table is also used
> we don't need to talk it since hypervisor owns it). process vaddr page table
> is owned by guest, changes to the translation page table is by guest. So for
> such cache, just need to flush the cache in iommu side. no need to modify
> translation page table.
Agreed, but the MR notifier is not necessarily supposed to touch the table,
right?
> 
>>
>>  At the time gVA iotlb flush is issued, the gVA->gPA
>>> mapping has already modified. Host iommu only needs to reference it when
>>> performing address translation. But before host iommu perform translation,
>>> it needs to flush the old gVA cache. In VT-d, it is called 1st level cache
>>> flushing.
>>
>> The fact MR notifiers may not be relevant could be linked to the nature
>> of the tagging of the structures you want to flush. My understanding is
>> if you want to flush by source-id, MR granularity could be fine. Please
>> could you clarify why do you need an iommu wide operation in that case.
> 
> The flush is not limited to source-id granularity, it would be page selected
> and others. As I mentioned, it has no requirement to modify the translation
> page table, so it is more like a translation unit behavior.
we could also enumerate all MR notifiers - assuming they are registered!
- and call notifiers for each of them, right? Maybe this is not the
strongest argument.
> 
>>>
>>> Both of the two notifiers(operations) has no direct relationship with MR,
>>> instead they highly depends on virt-iommu itself.
>> As described above this is not obvious to me. Please could you clarify
>> why source-id granularity (which I understand has a 1-1 granularity with
>> MR/AS is not the correct granularity). Of course, please correct me if
>> my understanding of MR mapping is not correct.
> 
> It's correct that for the PCIe device, the iova address space(aka PCI address
> space) has kind of 1-1 relationship with MR. But, for virt-SVM, the address
> space is not limted to iova address space, it has process address space, how
> can such an address space relate to a MR...

So, this discussion, reading again the cover letter, and digging into
your original series helped me understand the need for those new
notifiers. What confused me originally was the "iommu" wide notifiers.
Maybe you should clarify your cover letter by explicitly saying that
- SVM works along with PT = 1
- if PT = 1 IOMMU MR are disabled so MR notifier are not registered
- new notifiers do not fit nicely in this framework as they need to be
registered even if PT = 1
- having those notifiers attached to the whole IOMMU object abstraction
is not an issue

Please correct me if I said anything wrong, and my apologies if I
brought any further confusion.

Thanks

Eric
> 
> Thanks,
> Yi L
>>
>> Thanks
>>
>> Eric
>>
>>  If virt-iommu exists,
>>> then the two notfiers are needed, if not, then it's not.
>>>  
>>>> Even understanding the SVM stuff better than I did, I don't really see
>>>> why an AddressSpace is an obvious unit to have an IOMMUObject
>>>> associated with it.
>>>
>>> This will benefit the notifier registration. As my comments above, the
>>> IOMMUObject is to represent iommu. Associate an AddressSpace with an
>>> IOMMUObject makes it easy to check if it is necessary to register the
>>> notifiers.
>>  If no IOMMUObject, means no virt-iommu exposed to guest, then
>>> no need to register notifiers. For this, I also considered to use the
>>> MemoryRegion.iommu_ops. e.g. for VT-d, it can be a loop to check all the
>>> subregions and register notfiers if it is an iommu MemoryRegion. Peter
>>> mentioned it may not work for SPAR. So he proposed associating an
>>> AddressSpace with an IOMMUObject. I think it wroks and easier, so I
>>> didn’t object it.
>>>
>>> https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg04931.html
>>>
>>> +    /* Check if vIOMMU exists */
>>> +    QTAILQ_FOREACH(subregion, &as->root->subregions, subregions_link) {
>>> +        if (memory_region_is_iommu(subregion)) {
>>> +            IOMMUNotifier n1;
>>> +
>>> +            /*
>>> +             FIXME: current iommu notifier is actually designed for
>>> +             IOMMUTLB MAP/UNMAP. However, vIOMMU emulator may need
>>> +             notifiers other than MAP/UNMAP, so it'll be better to
>>> +             split the non-IOMMUTLB notifier from the current IOMMUTLB
>>> +             notifier framewrok.
>>> +             */
>>> +            iommu_notifier_init(&n1, vfio_iommu_bind_pasid_tbl_notify,
>>> +                                IOMMU_NOTIFIER_SVM_PASIDT_BIND,
>>> +                                0,
>>> +                                0);
>>> +            vfio_register_notifier(group->container,
>>> +                                   subregion,
>>> +                                   0,
>>> +                                   &n1);
>>> +        }
>>> +    }
>>> +
>>>
>>> Thanks,
>>> Yi L
>>>
>>>>> ---
>>>>>  hw/core/Makefile.objs   |  1 +
>>>>>  hw/core/iommu.c         | 58 +++++++++++++++++++++++++++++++++++++++
>>>>>  include/exec/memory.h   | 22 +++++++++++++++
>>>>>  include/hw/core/iommu.h | 73 +++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>  memory.c                | 10 +++++--
>>>>>  5 files changed, 162 insertions(+), 2 deletions(-)
>>>>>  create mode 100644 hw/core/iommu.c
>>>>>  create mode 100644 include/hw/core/iommu.h
>>>>>
>>>>> diff --git a/hw/core/Makefile.objs b/hw/core/Makefile.objs
>>>>> index f8d7a4a..d688412 100644
>>>>> --- a/hw/core/Makefile.objs
>>>>> +++ b/hw/core/Makefile.objs
>>>>> @@ -5,6 +5,7 @@ common-obj-y += fw-path-provider.o
>>>>>  # irq.o needed for qdev GPIO handling:
>>>>>  common-obj-y += irq.o
>>>>>  common-obj-y += hotplug.o
>>>>> +common-obj-y += iommu.o
>>>>>  common-obj-y += nmi.o
>>>>>  
>>>>>  common-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o
>>>>> diff --git a/hw/core/iommu.c b/hw/core/iommu.c
>>>>> new file mode 100644
>>>>> index 0000000..7c4fcfe
>>>>> --- /dev/null
>>>>> +++ b/hw/core/iommu.c
>>>>> @@ -0,0 +1,58 @@
>>>>> +/*
>>>>> + * QEMU emulation of IOMMU logic
>>>>> + *
>>>>> + * Copyright (C) 2017 Red Hat Inc.
>>>>> + *
>>>>> + * Authors: Peter Xu <peterx@redhat.com>,
>>>>> + *
>>>>> + * This program is free software; you can redistribute it and/or modify
>>>>> + * it under the terms of the GNU General Public License as published by
>>>>> + * the Free Software Foundation; either version 2 of the License, or
>>>>> + * (at your option) any later version.
>>>>> +
>>>>> + * This program is distributed in the hope that it will be useful,
>>>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>>>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>>>>> + * GNU General Public License for more details.
>>>>> +
>>>>> + * You should have received a copy of the GNU General Public License along
>>>>> + * with this program; if not, see <http://www.gnu.org/licenses/>.
>>>>> + */
>>>>> +
>>>>> +#include "qemu/osdep.h"
>>>>> +#include "hw/core/iommu.h"
>>>>> +
>>>>> +void iommu_notifier_register(IOMMUObject *iommu,
>>>>> +                             IOMMUNotifier *n,
>>>>> +                             IOMMUNotifyFn fn,
>>>>> +                             IOMMUEvent event)
>>>>> +{
>>>>> +    n->event = event;
>>>>> +    n->iommu_notify = fn;
>>>>> +    QLIST_INSERT_HEAD(&iommu->iommu_notifiers, n, node);
>>>>> +    return;
>>>>> +}
>>>>> +
>>>>> +void iommu_notifier_unregister(IOMMUObject *iommu,
>>>>> +                               IOMMUNotifier *notifier)
>>>>> +{
>>>>> +    IOMMUNotifier *cur, *next;
>>>>> +
>>>>> +    QLIST_FOREACH_SAFE(cur, &iommu->iommu_notifiers, node, next) {
>>>>> +        if (cur == notifier) {
>>>>> +            QLIST_REMOVE(cur, node);
>>>>> +            break;
>>>>> +        }
>>>>> +    }
>>>>> +}
>>>>> +
>>>>> +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data)
>>>>> +{
>>>>> +    IOMMUNotifier *cur;
>>>>> +
>>>>> +    QLIST_FOREACH(cur, &iommu->iommu_notifiers, node) {
>>>>> +        if ((cur->event == event_data->event) && cur->iommu_notify) {
>>>>> +            cur->iommu_notify(cur, event_data);
>>>>> +        }
>>>>> +    }
>>>>> +}
>>>>> diff --git a/include/exec/memory.h b/include/exec/memory.h
>>>>> index 03595e3..8350973 100644
>>>>> --- a/include/exec/memory.h
>>>>> +++ b/include/exec/memory.h
>>>>> @@ -26,6 +26,7 @@
>>>>>  #include "qom/object.h"
>>>>>  #include "qemu/rcu.h"
>>>>>  #include "hw/qdev-core.h"
>>>>> +#include "hw/core/iommu.h"
>>>>>  
>>>>>  #define RAM_ADDR_INVALID (~(ram_addr_t)0)
>>>>>  
>>>>> @@ -301,6 +302,19 @@ struct MemoryListener {
>>>>>  };
>>>>>  
>>>>>  /**
>>>>> + * AddressSpaceOps: callbacks structure for address space specific operations
>>>>> + *
>>>>> + * @iommu_get: returns an IOMMU object that backs the address space.
>>>>> + *             Normally this should be NULL for generic address
>>>>> + *             spaces, and it's only used when there is one
>>>>> + *             translation unit behind this address space.
>>>>> + */
>>>>> +struct AddressSpaceOps {
>>>>> +    IOMMUObject *(*iommu_get)(AddressSpace *as);
>>>>> +};
>>>>> +typedef struct AddressSpaceOps AddressSpaceOps;
>>>>> +
>>>>> +/**
>>>>>   * AddressSpace: describes a mapping of addresses to #MemoryRegion objects
>>>>>   */
>>>>>  struct AddressSpace {
>>>>> @@ -316,6 +330,7 @@ struct AddressSpace {
>>>>>      struct MemoryRegionIoeventfd *ioeventfds;
>>>>>      QTAILQ_HEAD(memory_listeners_as, MemoryListener) listeners;
>>>>>      QTAILQ_ENTRY(AddressSpace) address_spaces_link;
>>>>> +    AddressSpaceOps as_ops;
>>>>>  };
>>>>>  
>>>>>  FlatView *address_space_to_flatview(AddressSpace *as);
>>>>> @@ -1988,6 +2003,13 @@ address_space_write_cached(MemoryRegionCache *cache, hwaddr addr,
>>>>>      address_space_write(cache->as, cache->xlat + addr, MEMTXATTRS_UNSPECIFIED, buf, len);
>>>>>  }
>>>>>  
>>>>> +/**
>>>>> + * address_space_iommu_get: Get the backend IOMMU for the address space
>>>>> + *
>>>>> + * @as: the address space to fetch IOMMU from
>>>>> + */
>>>>> +IOMMUObject *address_space_iommu_get(AddressSpace *as);
>>>>> +
>>>>>  #endif
>>>>>  
>>>>>  #endif
>>>>> diff --git a/include/hw/core/iommu.h b/include/hw/core/iommu.h
>>>>> new file mode 100644
>>>>> index 0000000..34387c0
>>>>> --- /dev/null
>>>>> +++ b/include/hw/core/iommu.h
>>>>> @@ -0,0 +1,73 @@
>>>>> +/*
>>>>> + * QEMU emulation of IOMMU logic
>>>>> + *
>>>>> + * Copyright (C) 2017 Red Hat Inc.
>>>>> + *
>>>>> + * Authors: Peter Xu <peterx@redhat.com>,
>>>>> + *
>>>>> + * This program is free software; you can redistribute it and/or modify
>>>>> + * it under the terms of the GNU General Public License as published by
>>>>> + * the Free Software Foundation; either version 2 of the License, or
>>>>> + * (at your option) any later version.
>>>>> +
>>>>> + * This program is distributed in the hope that it will be useful,
>>>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>>>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>>>>> + * GNU General Public License for more details.
>>>>> +
>>>>> + * You should have received a copy of the GNU General Public License along
>>>>> + * with this program; if not, see <http://www.gnu.org/licenses/>.
>>>>> + */
>>>>> +
>>>>> +#ifndef HW_CORE_IOMMU_H
>>>>> +#define HW_CORE_IOMMU_H
>>>>> +
>>>>> +#include "qemu/queue.h"
>>>>> +
>>>>> +enum IOMMUEvent {
>>>>> +    IOMMU_EVENT_BIND_PASIDT,
>>>>> +};
>>>>> +typedef enum IOMMUEvent IOMMUEvent;
>>>>> +
>>>>> +struct IOMMUEventData {
>>>>> +    IOMMUEvent event;
>>>>> +    uint64_t length;
>>>>> +    void *data;
>>>>> +};
>>>>> +typedef struct IOMMUEventData IOMMUEventData;
>>>>> +
>>>>> +typedef struct IOMMUNotifier IOMMUNotifier;
>>>>> +
>>>>> +typedef void (*IOMMUNotifyFn)(IOMMUNotifier *notifier,
>>>>> +                              IOMMUEventData *event_data);
>>>>> +
>>>>> +struct IOMMUNotifier {
>>>>> +    IOMMUNotifyFn iommu_notify;
>>>>> +    /*
>>>>> +     * What events we are listening to. Let's allow multiple event
>>>>> +     * registrations from beginning.
>>>>> +     */
>>>>> +    IOMMUEvent event;
>>>>> +    QLIST_ENTRY(IOMMUNotifier) node;
>>>>> +};
>>>>> +
>>>>> +typedef struct IOMMUObject IOMMUObject;
>>>>> +
>>>>> +/*
>>>>> + * This stands for an IOMMU unit. Any translation device should have
>>>>> + * this struct inside its own structure to make sure it can leverage
>>>>> + * common IOMMU functionalities.
>>>>> + */
>>>>> +struct IOMMUObject {
>>>>> +    QLIST_HEAD(, IOMMUNotifier) iommu_notifiers;
>>>>> +};
>>>>> +
>>>>> +void iommu_notifier_register(IOMMUObject *iommu,
>>>>> +                             IOMMUNotifier *n,
>>>>> +                             IOMMUNotifyFn fn,
>>>>> +                             IOMMUEvent event);
>>>>> +void iommu_notifier_unregister(IOMMUObject *iommu,
>>>>> +                               IOMMUNotifier *notifier);
>>>>> +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data);
>>>>> +
>>>>> +#endif
>>>>> diff --git a/memory.c b/memory.c
>>>>> index 77fb3ef..307f665 100644
>>>>> --- a/memory.c
>>>>> +++ b/memory.c
>>>>> @@ -235,8 +235,6 @@ struct FlatView {
>>>>>      MemoryRegion *root;
>>>>>  };
>>>>>  
>>>>> -typedef struct AddressSpaceOps AddressSpaceOps;
>>>>> -
>>>>>  #define FOR_EACH_FLAT_RANGE(var, view)          \
>>>>>      for (var = (view)->ranges; var < (view)->ranges + (view)->nr; ++var)
>>>>>  
>>>>> @@ -2793,6 +2791,14 @@ static void do_address_space_destroy(AddressSpace *as)
>>>>>      memory_region_unref(as->root);
>>>>>  }
>>>>>  
>>>>> +IOMMUObject *address_space_iommu_get(AddressSpace *as)
>>>>> +{
>>>>> +    if (!as->as_ops.iommu_get) {
>>>>> +        return NULL;
>>>>> +    }
>>>>> +    return as->as_ops.iommu_get(as);
>>>>> +}
>>>>> +
>>>>>  void address_space_destroy(AddressSpace *as)
>>>>>  {
>>>>>      MemoryRegion *root = as->root;
>>>>
>>>> -- 
>>>> David Gibson			| I'll have my music baroque, and my code
>>>> david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
>>>> 				| _way_ _around_!
>>>> http://www.ozlabs.org/~dgibson
>>>
>>>
>>>
>>
>
Liu, Yi L Nov. 15, 2017, 2:36 a.m. UTC | #11
Hi Eric,

On Tue, Nov 14, 2017 at 10:52:54PM +0100, Auger Eric wrote:
> Hi Yi L,
> 
> On 14/11/2017 14:59, Liu, Yi L wrote:
> > On Tue, Nov 14, 2017 at 09:53:07AM +0100, Auger Eric wrote:
> > Hi Eric,
> > 
> >> Hi Yi L,
> >>
> >> On 13/11/2017 10:58, Liu, Yi L wrote:
> >>> On Mon, Nov 13, 2017 at 04:56:01PM +1100, David Gibson wrote:
> >>>> On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote:
> >>>>> From: Peter Xu <peterx@redhat.com>
> >>>>>
> >>>>> AddressSpaceOps is similar to MemoryRegionOps, it's just for address
> >>>>> spaces to store arch-specific hooks.
> >>>>>
> >>>>> The first hook I would like to introduce is iommu_get(). Return an
> >>>>> IOMMUObject behind the AddressSpace.
> >>>>>
> >>>>> For systems that have IOMMUs, we will create a special address
> >>>>> space per device which is different from system default address
> >>>>> space for it (please refer to pci_device_iommu_address_space()).
> >>>>> Normally when that happens, there will be one specific IOMMU (or
> >>>>> say, translation unit) stands right behind that new address space.
> >>>>>
> >>>>> This iommu_get() fetches that guy behind the address space. Here,
> >>>>> the guy is defined as IOMMUObject, which includes a notifier_list
> >>>>> so far, may extend in future. Along with IOMMUObject, a new iommu
> >>>>> notifier mechanism is introduced. It would be used for virt-svm.
> >>>>> Also IOMMUObject can further have a IOMMUObjectOps which is similar
> >>>>> to MemoryRegionOps. The difference is IOMMUObjectOps is not relied
> >>>>> on MemoryRegion.
> >>>>>
> >>>>> Signed-off-by: Peter Xu <peterx@redhat.com>
> >>>>> Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>
> >>>>
> >>>> Hi, sorry I didn't reply to the earlier postings of this after our
> >>>> discussion in China.  I've been sick several times and very busy.
> >>>
> >>> Hi David,
> >>>
> >>> Fully understood. I'll try my best to address your question. Also,
> >>> feel free to input further questions, anyhow, the more we discuss the
> >>> better work we done.
> >>>
> >>>> I still don't feel like there's an adequate explanation of exactly
> >>>> what an IOMMUObject represents.   Obviously it can represent more than
> >>>
> >>> IOMMUObject is aimed to represent the iommu itself. e.g. the iommu
> >>> specific operations. One of the key purpose of IOMMUObject is to
> >>> introduce a notifier framework to let iommu emulator to be able to
> >>> do iommu operations other than MAP/UNMAP. As IOMMU grows more and
> >>> more feature, MAP/UNMAP is not the only operation iommu emulator needs
> >>> to deal. e.g. shared virtual memory. So far, as I know AMD/ARM also
> >>> has it. may correct me on it. As my cover letter mentioned, MR based
> >>> notifier framework doesn’t work for the newly added IOMMU operations.
> >>> Like bind guest pasid table pointer to host and propagate guest's
> >>> iotlb flush to host.
> >>>
> >>>> a single translation window - since that's represented by the
> >>>> IOMMUMR.  But what exactly do all the MRs - or whatever else - that
> >>>> are represented by the IOMMUObject have in common, from a functional
> >>>> point of view.
> >>>
> >>> Let me take virt-SVM as an example. As far as I know, for virt-SVM,
> >>> the implementation of different vendors are similar. The key design
> >>> is to have a nested translation(aka. two stage translation). It is to
> >>> have guest maintain gVA->gPA mapping and hypervisor builds gPA->hPA
> >>> mapping. Similar to EPT based virt-MMU solution.
> >>>
> >>> In Qemu, gPA->hPA mapping is done through MAP/UNMAP notifier, it can
> >>> keep going. But for gVA->gPA mapping, only guest knows it, so hypervisor
> >>> needs to trap specific guest iommu operation and pass the gVA->gPA
> >>> mapping knowledge to host through a notifier(newly added one). In VT-d,
> >>> it is called bind guest pasid table to host.
> >>
> >> What I don't get is the PASID table is per extended context entry. I
> >> understand this latter is indexed by PCI device function. And today MR
> >> are created per PCIe device if I am not wrong. 
> > 
> > In my understanding, MR is more related to AddressSpace not exactly tagged
> > with PCIe device.
> I meant, in the current intel_iommu code, vtd_find_add_as() creates 1
> IOMMU MR and 1 AS per PCIe device, right?

Yes, it is. This is the PCIe device address space; it can be the guest's
physical address space if no vIOMMU is exposed, or the guest IOVA address
space if a vIOMMU is exposed. Both address spaces are handled by the
2nd-level translation in VT-d, which is different from the 1st-level
translation used for the process vaddr space.
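
To illustrate how the hook proposed in this patch could be wired up on the
VT-d side, here is a rough sketch (the iommu_object field embedded in
IntelIOMMUState is hypothetical; only the AddressSpaceOps/iommu_get pieces
come from this series):

static IOMMUObject *vtd_as_iommu_get(AddressSpace *as)
{
    VTDAddressSpace *vtd_dev_as = container_of(as, VTDAddressSpace, as);

    /* every device behind this vIOMMU shares the same translation unit */
    return &vtd_dev_as->iommu_state->iommu_object;
}

/* in vtd_find_add_as(), after the per-device AddressSpace is set up: */
vtd_dev_as->as.as_ops.iommu_get = vtd_as_iommu_get;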

> > 
> >> So why can't we have 1
> >> new MR notifier dedicated to PASID table passing? My understanding is
> >> the MR, having a 1-1 correspondence with a PCIe device and thus a
> >> context could be of right granularity. Then I understand the only flags
> > 
> > I didn't quite get your point regards to the "granlarity" here. May talk
> > a little bit more here?
> The PASID table is per device (contained by extended context which is
> dev/fn indexed). The "record_device" notifier also is attached to a
> specific PCIe device. So we can't really say they have an iommu wide
> scope (PCIe device granularity would fit). However I understand from
> below explanation that TLB invalidate notifier is not especially tight
> to a given source-id as we are going to invalidate by PASID/page.
> 

Correct, it has no tight relationship to the "granularity" here as far as
the discussion about introducing this new notifier framework goes.

> I think the main justification behind introducing this new framework is
> that PT is set along with SVM and in this case the IOMMU MR notifiers
> are not registered since the IOMMU MR is disabled.
> 
> So new notifiers do not fit nicely in current framework.

Exactly, that's what I mean. Although we have a trick to fix it, a trick
is still a trick. Better to have a new and dedicated framework.

> > 
> >> we currently have are NONE, MAP and UNMAP but couldn't we add a new one
> >> for PASID TABLE passing? So this is not crystal clear to me why MR
> >> notifiers are not adapted to PASID table passing.
> > 
> > This is also my initial plan. You may get some detail in the link below.
> > https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg05295.html
> > 
> > In brief, the new notifier I want to add is not really MR related and
> > just more like a behaviour of a translation unit. Also, as my cover letter
> > mentioned that the MR notifiers would not be registered for VT-d(virt-svm)
> > in region_add(). Then it requires to register MR notifiers out of the
> > region_add().
> 
> Yes to me this is the main issue: IOMMU MR are not registered in PT
> mode. Commit message of "[RFC PATCH 06/20] VFIO: add new notifier for
> binding PASID table" helped me to introduce the problem, although I did
> not quite understand what you call "vtd_address_space".

"vtd_address_space" is a wrapper of the DMA AddressSpace of a device behind
a virtual VT-d.
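
For reference, the wrapper in include/hw/i386/intel_iommu.h looks roughly
like this (abridged from memory, so the exact field list may differ):

struct VTDAddressSpace {
    PCIBus *bus;
    uint8_t devfn;
    AddressSpace as;          /* the per-device DMA address space */
    IOMMUMemoryRegion iommu;  /* the 2nd-level (IOVA) translation MR */
    IntelIOMMUState *iommu_state;
    /* ... */
};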

>  Also paste below. I think it more or less breaks the
> > consistency of MR notifiers. Also, I think existing MR notifiers are more
> > related IOVA address translation(on VT-d it is 2nd level translation, for ARM
> > is it stage 2?),
> Well, level 1 input address is IOVA as well.
> 
> On ARM, IOVA -> IPA*=GPA is stage1 and GPA -> HPA is stage2. So I would
> rather say that for ARM, MR relates to stage1 which is the one emulated
> by vSMMU along with VFIO.
> *IPA = intermediate physical address.

Thanks for the correction. BTW, if you have virt-SVM on ARM, will stage 1
cover the gVA->gPA/IPA translation?

>  and there is some existing codes highly rely on this
> > assumption. e.g. the memory_replay introduced by Peter, the notifier node
> > would affect the replay. If adding a non MAP/UNMAP notifier, it would break
> > the logic. So it's also a reason to add a separate framework instead of
> > just adding a new flag to the existing MR notifier framework.
> 
> 
> > 
> > https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg04931.html
> > 
> > +    /* Check if vIOMMU exists */
> > +    QTAILQ_FOREACH(subregion, &as->root->subregions, subregions_link) {
> > +        if (memory_region_is_iommu(subregion)) {
> > +            IOMMUNotifier n1;
> > +
> > +            /*
> > +             FIXME: current iommu notifier is actually designed for
> > +             IOMMUTLB MAP/UNMAP. However, vIOMMU emulator may need
> > +             notifiers other than MAP/UNMAP, so it'll be better to
> > +             split the non-IOMMUTLB notifier from the current IOMMUTLB
> > +             notifier framewrok.
> > +             */
> > +            iommu_notifier_init(&n1, vfio_iommu_bind_pasid_tbl_notify,
> > +                                IOMMU_NOTIFIER_SVM_PASIDT_BIND,
> > +                                0,
> > +                                0);
> > +            vfio_register_notifier(group->container,
> > +                                   subregion,
> > +                                   0,
> > +                                   &n1);
> > +        }
> > +    }
> > +
> > 
> > 
> >>>
> >>> Also, for the gVA iotlb flushing, only guest knows it. So hypervisor
> >>> needs to propagate it to host. Here, MAP/UNMAP is not suitable since
> >>> this gVA iotlb flush here doesn’t require to modify host iommu
> >>> translation table.
> >> I don't really get this argument. IOMMUNotifier just is a notifier that
> >> is attached to an IOMMU MR and calls a an IOMMUNotify function, right?
> > 
> > yes, it is.
> > 
> >> Then the role of the function currently is attached to the currently
> >> existing flags, MAP, UNMAP. This is not linked to an action on the
> >> physical IOMMU, right?
> > 
> > The MAP/UNMAP notifier finally talks to physical IOMMU. is it? My point
> > here is MAP/UNMAP finally would talk to physical IOMMU change the translation
> > page table in memory.
> yes it programs the pIOMMU with combined translation stages.
>  However, for virt-svm case, the translation page
> > table is the process vaddr page table(though the I/O page table is also used
> > we don't need to talk it since hypervisor owns it). process vaddr page table
> > is owned by guest, changes to the translation page table is by guest. So for
> > such cache, just need to flush the cache in iommu side. no need to modify
> > translation page table.
> agreed but the MR notifier is not necessarily supposed to touch table,
> right?

Not sure about other platforms. For VT-d, the existing MAP/UNMAP notifiers
would not touch the page table in QEMU directly, but they do touch it
through the VFIO APIs. And I think that is necessary once they are
registered, e.g. when a virtual VT-d is exposed the guest can use guest
IOVA, which requires the MAP/UNMAP APIs to shadow the mappings the guest
iommu driver builds.
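
Roughly, that shadowing path looks like the sketch below (modelled on
vfio_iommu_map_notify() in hw/vfio/common.c; the body is heavily simplified
and the host vaddr lookup is elided):

static void map_notify_sketch(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
    VFIOContainer *container = giommu->container;
    hwaddr len = iotlb->addr_mask + 1;
    void *vaddr = NULL;   /* the real code resolves iotlb->translated_addr
                           * to a host virtual address here */

    if (iotlb->perm != IOMMU_NONE) {
        /* guest created a gIOVA->GPA mapping: shadow it into the pIOMMU */
        vfio_dma_map(container, iotlb->iova, len, vaddr,
                     !(iotlb->perm & IOMMU_WO));
    } else {
        /* guest tore the mapping down: remove it from the pIOMMU */
        vfio_dma_unmap(container, iotlb->iova, len);
    }
}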

> > 
> >>
> >>  At the time gVA iotlb flush is issued, the gVA->gPA
> >>> mapping has already modified. Host iommu only needs to reference it when
> >>> performing address translation. But before host iommu perform translation,
> >>> it needs to flush the old gVA cache. In VT-d, it is called 1st level cache
> >>> flushing.
> >>
> >> The fact MR notifiers may not be relevant could be linked to the nature
> >> of the tagging of the structures you want to flush. My understanding is
> >> if you want to flush by source-id, MR granularity could be fine. Please
> >> could you clarify why do you need an iommu wide operation in that case.
> > 
> > The flush is not limited to source-id granularity, it would be page selected
> > and others. As I mentioned, it has no requirement to modify the translation
> > page table, so it is more like a translation unit behavior.
> we could also enumerate all MR notifiers - assuming they are registered!
> - and call notifiers for each of them, right? Maybe this is not the
> strongest argument.

Yes, it's not the strongest argument for the implementation. But for the
design, it is a strong argument to address the necessity across different
platforms.

> > 
> >>>
> >>> Both of the two notifiers(operations) has no direct relationship with MR,
> >>> instead they highly depends on virt-iommu itself.
> >> As described above this is not obvious to me. Please could you clarify
> >> why source-id granularity (which I understand has a 1-1 granularity with
> >> MR/AS is not the correct granularity). Of course, please correct me if
> >> my understanding of MR mapping is not correct.
> > 
> > It's correct that for the PCIe device, the iova address space(aka PCI address
> > space) has kind of 1-1 relationship with MR. But, for virt-SVM, the address
> > space is not limted to iova address space, it has process address space, how
> > can such an address space relate to a MR...
> 
> So, this discussion, reading again the cover letter, and digging into
> your original series helped me understand the need for those new
> notifiers. What confused me originally was the "iommu" wide notifiers.
> Maybe you should clarify your cover letter by explicitly saying that
> - SVM works along with PT = 1
> - if PT = 1 IOMMU MR are disabled so MR notifier are not registered
> - new notifiers do not fit nicely in this framework as they need to be
> registered even if PT = 1
> - having those notifiers attached to the whole IOMMU object abstraction
> is not an issue
> 
> Please correct me if I said anything wrong and please apologize if I
> brought any further confusion.

Yes, you're right. I should have written it in a better manner and will
update the cover letter accordingly in the next version. Also, feel free to
let me know if it meets the requirements of the pv work you're doing.

Thanks,
Yi L

> 
> Thanks
> 
> Eric
> > 
> > Thanks,
> > Yi L
> >>
> >> Thanks
> >>
> >> Eric
> >>
> >>  If virt-iommu exists,
> >>> then the two notfiers are needed, if not, then it's not.
> >>>  
> >>>> Even understanding the SVM stuff better than I did, I don't really see
> >>>> why an AddressSpace is an obvious unit to have an IOMMUObject
> >>>> associated with it.
> >>>
> >>> This will benefit the notifier registration. As my comments above, the
> >>> IOMMUObject is to represent iommu. Associate an AddressSpace with an
> >>> IOMMUObject makes it easy to check if it is necessary to register the
> >>> notifiers.
> >>  If no IOMMUObject, means no virt-iommu exposed to guest, then
> >>> no need to register notifiers. For this, I also considered to use the
> >>> MemoryRegion.iommu_ops. e.g. for VT-d, it can be a loop to check all the
> >>> subregions and register notfiers if it is an iommu MemoryRegion. Peter
> >>> mentioned it may not work for SPAR. So he proposed associating an
> >>> AddressSpace with an IOMMUObject. I think it wroks and easier, so I
> >>> didn’t object it.
> >>>
> >>> https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg04931.html
> >>>
> >>> +    /* Check if vIOMMU exists */
> >>> +    QTAILQ_FOREACH(subregion, &as->root->subregions, subregions_link) {
> >>> +        if (memory_region_is_iommu(subregion)) {
> >>> +            IOMMUNotifier n1;
> >>> +
> >>> +            /*
> >>> +             FIXME: current iommu notifier is actually designed for
> >>> +             IOMMUTLB MAP/UNMAP. However, vIOMMU emulator may need
> >>> +             notifiers other than MAP/UNMAP, so it'll be better to
> >>> +             split the non-IOMMUTLB notifier from the current IOMMUTLB
> >>> +             notifier framewrok.
> >>> +             */
> >>> +            iommu_notifier_init(&n1, vfio_iommu_bind_pasid_tbl_notify,
> >>> +                                IOMMU_NOTIFIER_SVM_PASIDT_BIND,
> >>> +                                0,
> >>> +                                0);
> >>> +            vfio_register_notifier(group->container,
> >>> +                                   subregion,
> >>> +                                   0,
> >>> +                                   &n1);
> >>> +        }
> >>> +    }
> >>> +
> >>>
> >>> Thanks,
> >>> Yi L
> >>>
> >>>>> ---
> >>>>>  hw/core/Makefile.objs   |  1 +
> >>>>>  hw/core/iommu.c         | 58 +++++++++++++++++++++++++++++++++++++++
> >>>>>  include/exec/memory.h   | 22 +++++++++++++++
> >>>>>  include/hw/core/iommu.h | 73 +++++++++++++++++++++++++++++++++++++++++++++++++
> >>>>>  memory.c                | 10 +++++--
> >>>>>  5 files changed, 162 insertions(+), 2 deletions(-)
> >>>>>  create mode 100644 hw/core/iommu.c
> >>>>>  create mode 100644 include/hw/core/iommu.h
> >>>>>
> >>>>> diff --git a/hw/core/Makefile.objs b/hw/core/Makefile.objs
> >>>>> index f8d7a4a..d688412 100644
> >>>>> --- a/hw/core/Makefile.objs
> >>>>> +++ b/hw/core/Makefile.objs
> >>>>> @@ -5,6 +5,7 @@ common-obj-y += fw-path-provider.o
> >>>>>  # irq.o needed for qdev GPIO handling:
> >>>>>  common-obj-y += irq.o
> >>>>>  common-obj-y += hotplug.o
> >>>>> +common-obj-y += iommu.o
> >>>>>  common-obj-y += nmi.o
> >>>>>  
> >>>>>  common-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o
> >>>>> diff --git a/hw/core/iommu.c b/hw/core/iommu.c
> >>>>> new file mode 100644
> >>>>> index 0000000..7c4fcfe
> >>>>> --- /dev/null
> >>>>> +++ b/hw/core/iommu.c
> >>>>> @@ -0,0 +1,58 @@
> >>>>> +/*
> >>>>> + * QEMU emulation of IOMMU logic
> >>>>> + *
> >>>>> + * Copyright (C) 2017 Red Hat Inc.
> >>>>> + *
> >>>>> + * Authors: Peter Xu <peterx@redhat.com>,
> >>>>> + *
> >>>>> + * This program is free software; you can redistribute it and/or modify
> >>>>> + * it under the terms of the GNU General Public License as published by
> >>>>> + * the Free Software Foundation; either version 2 of the License, or
> >>>>> + * (at your option) any later version.
> >>>>> +
> >>>>> + * This program is distributed in the hope that it will be useful,
> >>>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> >>>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> >>>>> + * GNU General Public License for more details.
> >>>>> +
> >>>>> + * You should have received a copy of the GNU General Public License along
> >>>>> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> >>>>> + */
> >>>>> +
> >>>>> +#include "qemu/osdep.h"
> >>>>> +#include "hw/core/iommu.h"
> >>>>> +
> >>>>> +void iommu_notifier_register(IOMMUObject *iommu,
> >>>>> +                             IOMMUNotifier *n,
> >>>>> +                             IOMMUNotifyFn fn,
> >>>>> +                             IOMMUEvent event)
> >>>>> +{
> >>>>> +    n->event = event;
> >>>>> +    n->iommu_notify = fn;
> >>>>> +    QLIST_INSERT_HEAD(&iommu->iommu_notifiers, n, node);
> >>>>> +    return;
> >>>>> +}
> >>>>> +
> >>>>> +void iommu_notifier_unregister(IOMMUObject *iommu,
> >>>>> +                               IOMMUNotifier *notifier)
> >>>>> +{
> >>>>> +    IOMMUNotifier *cur, *next;
> >>>>> +
> >>>>> +    QLIST_FOREACH_SAFE(cur, &iommu->iommu_notifiers, node, next) {
> >>>>> +        if (cur == notifier) {
> >>>>> +            QLIST_REMOVE(cur, node);
> >>>>> +            break;
> >>>>> +        }
> >>>>> +    }
> >>>>> +}
> >>>>> +
> >>>>> +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data)
> >>>>> +{
> >>>>> +    IOMMUNotifier *cur;
> >>>>> +
> >>>>> +    QLIST_FOREACH(cur, &iommu->iommu_notifiers, node) {
> >>>>> +        if ((cur->event == event_data->event) && cur->iommu_notify) {
> >>>>> +            cur->iommu_notify(cur, event_data);
> >>>>> +        }
> >>>>> +    }
> >>>>> +}
> >>>>> diff --git a/include/exec/memory.h b/include/exec/memory.h
> >>>>> index 03595e3..8350973 100644
> >>>>> --- a/include/exec/memory.h
> >>>>> +++ b/include/exec/memory.h
> >>>>> @@ -26,6 +26,7 @@
> >>>>>  #include "qom/object.h"
> >>>>>  #include "qemu/rcu.h"
> >>>>>  #include "hw/qdev-core.h"
> >>>>> +#include "hw/core/iommu.h"
> >>>>>  
> >>>>>  #define RAM_ADDR_INVALID (~(ram_addr_t)0)
> >>>>>  
> >>>>> @@ -301,6 +302,19 @@ struct MemoryListener {
> >>>>>  };
> >>>>>  
> >>>>>  /**
> >>>>> + * AddressSpaceOps: callbacks structure for address space specific operations
> >>>>> + *
> >>>>> + * @iommu_get: returns an IOMMU object that backs the address space.
> >>>>> + *             Normally this should be NULL for generic address
> >>>>> + *             spaces, and it's only used when there is one
> >>>>> + *             translation unit behind this address space.
> >>>>> + */
> >>>>> +struct AddressSpaceOps {
> >>>>> +    IOMMUObject *(*iommu_get)(AddressSpace *as);
> >>>>> +};
> >>>>> +typedef struct AddressSpaceOps AddressSpaceOps;
> >>>>> +
> >>>>> +/**
> >>>>>   * AddressSpace: describes a mapping of addresses to #MemoryRegion objects
> >>>>>   */
> >>>>>  struct AddressSpace {
> >>>>> @@ -316,6 +330,7 @@ struct AddressSpace {
> >>>>>      struct MemoryRegionIoeventfd *ioeventfds;
> >>>>>      QTAILQ_HEAD(memory_listeners_as, MemoryListener) listeners;
> >>>>>      QTAILQ_ENTRY(AddressSpace) address_spaces_link;
> >>>>> +    AddressSpaceOps as_ops;
> >>>>>  };
> >>>>>  
> >>>>>  FlatView *address_space_to_flatview(AddressSpace *as);
> >>>>> @@ -1988,6 +2003,13 @@ address_space_write_cached(MemoryRegionCache *cache, hwaddr addr,
> >>>>>      address_space_write(cache->as, cache->xlat + addr, MEMTXATTRS_UNSPECIFIED, buf, len);
> >>>>>  }
> >>>>>  
> >>>>> +/**
> >>>>> + * address_space_iommu_get: Get the backend IOMMU for the address space
> >>>>> + *
> >>>>> + * @as: the address space to fetch IOMMU from
> >>>>> + */
> >>>>> +IOMMUObject *address_space_iommu_get(AddressSpace *as);
> >>>>> +
> >>>>>  #endif
> >>>>>  
> >>>>>  #endif
> >>>>> diff --git a/include/hw/core/iommu.h b/include/hw/core/iommu.h
> >>>>> new file mode 100644
> >>>>> index 0000000..34387c0
> >>>>> --- /dev/null
> >>>>> +++ b/include/hw/core/iommu.h
> >>>>> @@ -0,0 +1,73 @@
> >>>>> +/*
> >>>>> + * QEMU emulation of IOMMU logic
> >>>>> + *
> >>>>> + * Copyright (C) 2017 Red Hat Inc.
> >>>>> + *
> >>>>> + * Authors: Peter Xu <peterx@redhat.com>,
> >>>>> + *
> >>>>> + * This program is free software; you can redistribute it and/or modify
> >>>>> + * it under the terms of the GNU General Public License as published by
> >>>>> + * the Free Software Foundation; either version 2 of the License, or
> >>>>> + * (at your option) any later version.
> >>>>> +
> >>>>> + * This program is distributed in the hope that it will be useful,
> >>>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> >>>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> >>>>> + * GNU General Public License for more details.
> >>>>> +
> >>>>> + * You should have received a copy of the GNU General Public License along
> >>>>> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> >>>>> + */
> >>>>> +
> >>>>> +#ifndef HW_CORE_IOMMU_H
> >>>>> +#define HW_CORE_IOMMU_H
> >>>>> +
> >>>>> +#include "qemu/queue.h"
> >>>>> +
> >>>>> +enum IOMMUEvent {
> >>>>> +    IOMMU_EVENT_BIND_PASIDT,
> >>>>> +};
> >>>>> +typedef enum IOMMUEvent IOMMUEvent;
> >>>>> +
> >>>>> +struct IOMMUEventData {
> >>>>> +    IOMMUEvent event;
> >>>>> +    uint64_t length;
> >>>>> +    void *data;
> >>>>> +};
> >>>>> +typedef struct IOMMUEventData IOMMUEventData;
> >>>>> +
> >>>>> +typedef struct IOMMUNotifier IOMMUNotifier;
> >>>>> +
> >>>>> +typedef void (*IOMMUNotifyFn)(IOMMUNotifier *notifier,
> >>>>> +                              IOMMUEventData *event_data);
> >>>>> +
> >>>>> +struct IOMMUNotifier {
> >>>>> +    IOMMUNotifyFn iommu_notify;
> >>>>> +    /*
> >>>>> +     * What events we are listening to. Let's allow multiple event
> >>>>> +     * registrations from beginning.
> >>>>> +     */
> >>>>> +    IOMMUEvent event;
> >>>>> +    QLIST_ENTRY(IOMMUNotifier) node;
> >>>>> +};
> >>>>> +
> >>>>> +typedef struct IOMMUObject IOMMUObject;
> >>>>> +
> >>>>> +/*
> >>>>> + * This stands for an IOMMU unit. Any translation device should have
> >>>>> + * this struct inside its own structure to make sure it can leverage
> >>>>> + * common IOMMU functionalities.
> >>>>> + */
> >>>>> +struct IOMMUObject {
> >>>>> +    QLIST_HEAD(, IOMMUNotifier) iommu_notifiers;
> >>>>> +};
> >>>>> +
> >>>>> +void iommu_notifier_register(IOMMUObject *iommu,
> >>>>> +                             IOMMUNotifier *n,
> >>>>> +                             IOMMUNotifyFn fn,
> >>>>> +                             IOMMUEvent event);
> >>>>> +void iommu_notifier_unregister(IOMMUObject *iommu,
> >>>>> +                               IOMMUNotifier *notifier);
> >>>>> +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data);
> >>>>> +
> >>>>> +#endif
> >>>>> diff --git a/memory.c b/memory.c
> >>>>> index 77fb3ef..307f665 100644
> >>>>> --- a/memory.c
> >>>>> +++ b/memory.c
> >>>>> @@ -235,8 +235,6 @@ struct FlatView {
> >>>>>      MemoryRegion *root;
> >>>>>  };
> >>>>>  
> >>>>> -typedef struct AddressSpaceOps AddressSpaceOps;
> >>>>> -
> >>>>>  #define FOR_EACH_FLAT_RANGE(var, view)          \
> >>>>>      for (var = (view)->ranges; var < (view)->ranges + (view)->nr; ++var)
> >>>>>  
> >>>>> @@ -2793,6 +2791,14 @@ static void do_address_space_destroy(AddressSpace *as)
> >>>>>      memory_region_unref(as->root);
> >>>>>  }
> >>>>>  
> >>>>> +IOMMUObject *address_space_iommu_get(AddressSpace *as)
> >>>>> +{
> >>>>> +    if (!as->as_ops.iommu_get) {
> >>>>> +        return NULL;
> >>>>> +    }
> >>>>> +    return as->as_ops.iommu_get(as);
> >>>>> +}
> >>>>> +
> >>>>>  void address_space_destroy(AddressSpace *as)
> >>>>>  {
> >>>>>      MemoryRegion *root = as->root;
> >>>>
> >>>> -- 
> >>>> David Gibson			| I'll have my music baroque, and my code
> >>>> david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
> >>>> 				| _way_ _around_!
> >>>> http://www.ozlabs.org/~dgibson
> >>>
> >>>
> >>>
> >>
> > 
>
Peter Xu Nov. 15, 2017, 7:16 a.m. UTC | #12
On Tue, Nov 14, 2017 at 10:52:54PM +0100, Auger Eric wrote:

[...]

> I meant, in the current intel_iommu code, vtd_find_add_as() creates 1
> IOMMU MR and 1 AS per PCIe device, right?

I think this is the most tricky point - in QEMU an IOMMU MR does not
really have a 1:1 relationship to devices.  For Intel, it's true; for
Power, it's not.  On Power guests, one device's DMA address space can be
split into different translation windows, and each window corresponds to
one IOMMU MR.

So IMHO the real 1:1 mapping is between the device and its DMA address
space, rather than with MRs.

It's been a long time since I drafted the patches.  I think at least
this should be a more general notifier mechanism compared to the current
IOMMUNotifier, which is bound to IOTLB notifies only.  AFAICT if we want
to trap first-level translation changes, the current notifier is not
even close to that interface - just see the definition of IOMMUTLBEntry,
it is tailored only for MAP/UNMAP of translation addresses, not anything
else.  And IMHO that's why it's tightly bound to MemoryRegions, and
that's the root problem.  The dynamic IOMMU MR switching problem is
related to this issue as well.
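
For reference, the structure in include/exec/memory.h is essentially the
following, which only carries what a MAP/UNMAP event needs:

struct IOMMUTLBEntry {
    AddressSpace    *target_as;
    hwaddr           iova;
    hwaddr           translated_addr;
    hwaddr           addr_mask;  /* 0xfff = 4k translation */
    IOMMUAccessFlags perm;
};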

I am not sure the current "get IOMMU object from address space" solution
is the best one - maybe it is "too big a scope".  I think it depends on
whether in the future we'll have some requirement at such a bigger scope
(say, something we want to trap from the vIOMMU and deliver to the host
IOMMU which may not even be device-related?  I don't know).  Another
alternative I am thinking of is whether we can provide a per-device
notifier; then it would be bound to the PCIDevice rather than to
MemoryRegions, and would be in device scope.
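
Purely as a sketch of that alternative - none of the names below exist in
the tree, they are made up only to show the shape of the interface:

/* Hypothetical per-device variant of the same notifier idea */
typedef struct PCIIOMMUNotifier PCIIOMMUNotifier;

struct PCIIOMMUNotifier {
    void (*notify)(PCIIOMMUNotifier *n, IOMMUEventData *event_data);
    IOMMUEvent event;
    QLIST_ENTRY(PCIIOMMUNotifier) node;
};

/* registration would hang off the PCIDevice instead of an AddressSpace/MR */
void pci_device_iommu_notifier_register(PCIDevice *dev,
                                        PCIIOMMUNotifier *n,
                                        IOMMUEvent event);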

Thanks,
Liu, Yi L Nov. 16, 2017, 8:57 a.m. UTC | #13
Hi David,

On Tue, Nov 14, 2017 at 11:59:34AM +1100, David Gibson wrote:
> On Mon, Nov 13, 2017 at 04:28:45PM +0800, Peter Xu wrote:
> > On Mon, Nov 13, 2017 at 04:56:01PM +1100, David Gibson wrote:
> > > On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote:
> > > > From: Peter Xu <peterx@redhat.com>
> > > > 
> > > > AddressSpaceOps is similar to MemoryRegionOps, it's just for address
> > > > spaces to store arch-specific hooks.
> > > > 
> > > > The first hook I would like to introduce is iommu_get(). Return an
> > > > IOMMUObject behind the AddressSpace.
> > > > 
> > > > For systems that have IOMMUs, we will create a special address
> > > > space per device which is different from system default address
> > > > space for it (please refer to pci_device_iommu_address_space()).
> > > > Normally when that happens, there will be one specific IOMMU (or
> > > > say, translation unit) stands right behind that new address space.
> > > > 
> > > > This iommu_get() fetches that guy behind the address space. Here,
> > > > the guy is defined as IOMMUObject, which includes a notifier_list
> > > > so far, may extend in future. Along with IOMMUObject, a new iommu
> > > > notifier mechanism is introduced. It would be used for virt-svm.
> > > > Also IOMMUObject can further have a IOMMUObjectOps which is similar
> > > > to MemoryRegionOps. The difference is IOMMUObjectOps is not relied
> > > > on MemoryRegion.
> > > > 
> > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > > Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>
> > > 
> > > Hi, sorry I didn't reply to the earlier postings of this after our
> > > discussion in China.  I've been sick several times and very busy.
> > > 
> > > I still don't feel like there's an adequate explanation of exactly
> > > what an IOMMUObject represents.   Obviously it can represent more than
> > > a single translation window - since that's represented by the
> > > IOMMUMR.  But what exactly do all the MRs - or whatever else - that
> > > are represented by the IOMMUObject have in common, from a functional
> > > point of view.
> > > 
> > > Even understanding the SVM stuff better than I did, I don't really see
> > > why an AddressSpace is an obvious unit to have an IOMMUObject
> > > associated with it.
> > 
> > Here's what I thought about it: IOMMUObject was planned to be the
> > abstraction of the hardware translation unit, which is a higher level
> > of the translated address spaces.  Say, for each PCI device, it can
> > have its own translated address space.  However for multiple PCI
> > devices, they can be sharing the same translation unit that handles
> > the translation requests from different devices.  That's the case for
> > Intel platforms.  We introduced this IOMMUObject because sometimes we
> > want to do something with that translation unit rather than a specific
> > device, in which we need a general IOMMU device handle.
> 
> Ok, but what does "hardware translation unit" mean in practice.  The
> guest neither knows nor cares, which bits of IOMMU translation happen
> to be included in the same bundle of silicon.  It only cares what the
> behaviour is.  What behavioural characteristics does a single
> IOMMUObject have?
> 
> > IIRC one issue left over during last time's discussion was that there
> > could be more complicated IOMMU models. E.g., one device's DMA request
> > can be translated nestedly by two or multiple IOMMUs, and current
> > proposal cannot really handle that complicated hierachy.  I'm just
> > thinking whether we can start from a simple model (say, we don't allow
> > nested IOMMUs, and actually we don't even allow multiple IOMMUs so
> > far), then we can evolve from that point in the future.
> > 
> > Also, I thought there were something you mentioned that this approach
> > is not correct for Power systems, but I can't really remember the
> > details...  Anyways, I think this is not the only approach to solve
> > the problem, and I believe any new better idea would be greatly
> > welcomed as well. :)
> 
> So, some of my initial comments were based on a misunderstanding of
> what was proposed here - since discussing this with Yi at LinuxCon
> Beijing, I have a better idea of what's going on.
> 
> On POWER - or rather the "pseries" platform, which is paravirtualized.
> We can have multiple vIOMMU windows (usually 2) for a single virtual

On POWER, the DMA isolation is done by allocating different DMA windows
to different isolation domains? And a single isolation domain may include
multiple DMA windows? So with or without an IOMMU, there is only a single
DMA address space shared by all the devices in the system? The isolation
mechanism is as described above?

> PCI host bridge.  Because of the paravirtualization, the mapping to
> hardware is fuzzy, but for passthrough devices they will both be
> implemented by the IOMMU built into the physical host bridge.  That
> isn't importat to the guest, though - all operations happen at the
> window level.

On VT-d, with an IOMMU present, each isolation domain has its own address
space. That's why we talk more at the address space level, and the IOMMU is
what makes the difference. That's the behavioural characteristic a single
IOMMU translation unit has, and thus what an IOMMUObject is going to have.

> 
> The other thing that bothers me here is the way it's attached to an
> AddressSpace.

My consideration is that the IOMMU handles AddressSpaces. The DMA address
space is also an address space managed by the IOMMU. That's why we believe
it is fine to associate the DMA address space with an IOMMUObject.

>  IIUC how SVM works, the whole point is that the device
> no longer writes into a specific PCI address space.  Instead, it
> writes directly into a process address space.  So it seems to me more
> that SVM should operate at the PCI level, and disassociate the device
> from the normal PCI address space entirely, rather than hooking up
> something via that address space.

As Peter replied, we still need the PCI address space; it would be used to
build up the 2nd-level page table, which is then used in the nested
translation.

Thanks,
Yi L

> 
> -- 
> David Gibson			| I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
> 				| _way_ _around_!
> http://www.ozlabs.org/~dgibson
David Gibson Dec. 18, 2017, 5:41 a.m. UTC | #14
Sorry I've taken so long to reply, I've been super busy with other
things.

On Tue, Nov 14, 2017 at 11:31:00AM +0800, Peter Xu wrote:
> On Tue, Nov 14, 2017 at 11:59:34AM +1100, David Gibson wrote:
> > On Mon, Nov 13, 2017 at 04:28:45PM +0800, Peter Xu wrote:
> > > On Mon, Nov 13, 2017 at 04:56:01PM +1100, David Gibson wrote:
> > > > On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote:
> > > > > From: Peter Xu <peterx@redhat.com>
> > > > > 
> > > > > AddressSpaceOps is similar to MemoryRegionOps, it's just for address
> > > > > spaces to store arch-specific hooks.
> > > > > 
> > > > > The first hook I would like to introduce is iommu_get(). Return an
> > > > > IOMMUObject behind the AddressSpace.
> > > > > 
> > > > > For systems that have IOMMUs, we will create a special address
> > > > > space per device which is different from system default address
> > > > > space for it (please refer to pci_device_iommu_address_space()).
> > > > > Normally when that happens, there will be one specific IOMMU (or
> > > > > say, translation unit) stands right behind that new address space.
> > > > > 
> > > > > This iommu_get() fetches that guy behind the address space. Here,
> > > > > the guy is defined as IOMMUObject, which includes a notifier_list
> > > > > so far, may extend in future. Along with IOMMUObject, a new iommu
> > > > > notifier mechanism is introduced. It would be used for virt-svm.
> > > > > Also IOMMUObject can further have a IOMMUObjectOps which is similar
> > > > > to MemoryRegionOps. The difference is IOMMUObjectOps is not relied
> > > > > on MemoryRegion.
> > > > > 
> > > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > > > Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>
> > > > 
> > > > Hi, sorry I didn't reply to the earlier postings of this after our
> > > > discussion in China.  I've been sick several times and very busy.
> > > > 
> > > > I still don't feel like there's an adequate explanation of exactly
> > > > what an IOMMUObject represents.   Obviously it can represent more than
> > > > a single translation window - since that's represented by the
> > > > IOMMUMR.  But what exactly do all the MRs - or whatever else - that
> > > > are represented by the IOMMUObject have in common, from a functional
> > > > point of view.
> > > > 
> > > > Even understanding the SVM stuff better than I did, I don't really see
> > > > why an AddressSpace is an obvious unit to have an IOMMUObject
> > > > associated with it.
> > > 
> > > Here's what I thought about it: IOMMUObject was planned to be the
> > > abstraction of the hardware translation unit, which is a higher level
> > > of the translated address spaces.  Say, for each PCI device, it can
> > > have its own translated address space.  However for multiple PCI
> > > devices, they can be sharing the same translation unit that handles
> > > the translation requests from different devices.  That's the case for
> > > Intel platforms.  We introduced this IOMMUObject because sometimes we
> > > want to do something with that translation unit rather than a specific
> > > device, in which we need a general IOMMU device handle.
> > 
> > Ok, but what does "hardware translation unit" mean in practice.  The
> > guest neither knows nor cares, which bits of IOMMU translation happen
> > to be included in the same bundle of silicon.  It only cares what the
> > behaviour is.  What behavioural characteristics does a single
> > IOMMUObject have?
> 
> In VT-d (I believe the same to ARM SMMUs), IMHO the special thing is
> that the translation windows (and device address spaces in QEMU) are
> only talking about second level translations, but not first level,
> while virt-svm needs to play with first level translations.  Until
> now, AFAIU we don't really have a good interface for first level
> translations at all (aka. the process address space).

Ok, that explains why you need some kind of different model than the
existing one, but that wasn't really disputed.  It still doesn't say
why separating the unclearly defined IOMMUObject from the IOMMU MR is
a good way of modelling this.

If the issue is 1st vs. 2nd level translation, it really seems you
should be explicitly modelling the different 1st level vs. 2nd level
address spaces, rather than just splitting functions between the MR
and AS level with no clear rationale behind what goes where.

> > > IIRC one issue left over during last time's discussion was that there
> > > could be more complicated IOMMU models. E.g., one device's DMA request
> > > can be translated nestedly by two or multiple IOMMUs, and current
> > > proposal cannot really handle that complicated hierachy.  I'm just
> > > thinking whether we can start from a simple model (say, we don't allow
> > > nested IOMMUs, and actually we don't even allow multiple IOMMUs so
> > > far), then we can evolve from that point in the future.
> > > 
> > > Also, I thought there were something you mentioned that this approach
> > > is not correct for Power systems, but I can't really remember the
> > > details...  Anyways, I think this is not the only approach to solve
> > > the problem, and I believe any new better idea would be greatly
> > > welcomed as well. :)
> > 
> > So, some of my initial comments were based on a misunderstanding of
> > what was proposed here - since discussing this with Yi at LinuxCon
> > Beijing, I have a better idea of what's going on.
> > 
> > On POWER - or rather the "pseries" platform, which is paravirtualized.
> > We can have multiple vIOMMU windows (usually 2) for a single virtual
> > PCI host bridge.  Because of the paravirtualization, the mapping to
> > hardware is fuzzy, but for passthrough devices they will both be
> > implemented by the IOMMU built into the physical host bridge.  That
> > isn't important to the guest, though - all operations happen at the
> > window level.
> 
> Now I know that for Power it may not have anything like a "translation
> unit" but everything is defined as "translation windows" in the
> guests.  However the problem still exist for some other platforms.
> Say, for Intel we have emulated VT-d; for ARM, we have vSMMU.  AFAIU
> these platforms do have their translation units, and even for ARM it
> should need such an interface (or any better interfaces, which are
> always welcomed) for virt-svm to work.  Otherwise I don't know a way
> to configure the first level translation tables.
> 
> Meanwhile, IMO this abstraction should not really affect pseries - it
> should be only useful for those platforms who would like to use it.
> For pseries, we can just ignore that new interface if we don't really
> even have such a translation unit.

But it's _still_ not clear to me what a "translation unit" means.
What is common across a translation unit that is not common across
different translation units?

> 
> > 
> > The other thing that bothers me here is the way it's attached to an
> > AddressSpace.  IIUC how SVM works, the whole point is that the device
> > no longer writes into a specific PCI address space.  Instead, it
> > writes directly into a process address space.  So it seems to me more
> > that SVM should operate at the PCI level, and disassociate the device
> > from the normal PCI address space entirely, rather than hooking up
> > something via that address space.
> 
> IMO the PCI address space is still used.  For virt-svm, host IOMMU
> will be working in nested translation mode, so we should be having two
> mappings working in parallel:
> 
>   1. DPDK process (in guest) address space mapping (GVA -> GPA)
>   2. guest direct memory mappings (GPA -> HPA)
> 
> And here AFAIU the 2nd mapping is working exactly like general PCI
> devices, the only difference is that the 2nd level mapping is always
> static, just like when IOMMU passthrough is enabled for that device.

Ok.  IIUC the 2nd level mapping isn't visible to the guest, is that
right?

But this doesn't really affect my point - from the guest's point of
view, the "usual" address space for a PCI device is equal to the GPA
(i.e. no guest visible translation), but for SVM devices they instead
use an SVM address space depending on the process id, which is not the
same as the GPA space.

> So, IMHO virt-SVM is not really in parallel with PCI subsystem.

Well, it's not clear to me if it's in parallel to, or on top of.  But
the crucial point is that an SVM device does not access the "main" PCI
address space (whether or not that has a traditional IOMMU).  Instead
it has access to a bunch of different address spaces, one for each
process ID.

> For
> the SVM in guest, it may be different, since it should only be using
> first level translations.  However to implement virt-SVM, IMHO we not
> only need existing PCI address space translation logic, we also need
> an extra way to configure the first level mappings, as discussed.

Right.  I'm not disputing that a way to configure those first level
mappings is necessary.  The proposed model just doesn't seem a good
fit.  It still only represents a single "PCI address space", then
attaches things about the process table mapping to that, when in fact
those things affect the process table and therefore *different* address
spaces from the one the methods are attached to.
David Gibson Dec. 18, 2017, 6:14 a.m. UTC | #15
On Thu, Nov 16, 2017 at 04:57:09PM +0800, Liu, Yi L wrote:
> Hi David,
> 
> On Tue, Nov 14, 2017 at 11:59:34AM +1100, David Gibson wrote:
> > On Mon, Nov 13, 2017 at 04:28:45PM +0800, Peter Xu wrote:
> > > On Mon, Nov 13, 2017 at 04:56:01PM +1100, David Gibson wrote:
> > > > On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote:
> > > > > From: Peter Xu <peterx@redhat.com>
> > > > > 
> > > > > AddressSpaceOps is similar to MemoryRegionOps, it's just for address
> > > > > spaces to store arch-specific hooks.
> > > > > 
> > > > > The first hook I would like to introduce is iommu_get(). Return an
> > > > > IOMMUObject behind the AddressSpace.
> > > > > 
> > > > > For systems that have IOMMUs, we will create a special address
> > > > > space per device which is different from system default address
> > > > > space for it (please refer to pci_device_iommu_address_space()).
> > > > > Normally when that happens, there will be one specific IOMMU (or
> > > > > say, translation unit) stands right behind that new address space.
> > > > > 
> > > > > This iommu_get() fetches that guy behind the address space. Here,
> > > > > the guy is defined as IOMMUObject, which includes a notifier_list
> > > > > so far, may extend in future. Along with IOMMUObject, a new iommu
> > > > > notifier mechanism is introduced. It would be used for virt-svm.
> > > > > Also IOMMUObject can further have a IOMMUObjectOps which is similar
> > > > > to MemoryRegionOps. The difference is IOMMUObjectOps is not relied
> > > > > on MemoryRegion.
> > > > > 
> > > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > > > Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>
> > > > 
> > > > Hi, sorry I didn't reply to the earlier postings of this after our
> > > > discussion in China.  I've been sick several times and very busy.
> > > > 
> > > > I still don't feel like there's an adequate explanation of exactly
> > > > what an IOMMUObject represents.   Obviously it can represent more than
> > > > a single translation window - since that's represented by the
> > > > IOMMUMR.  But what exactly do all the MRs - or whatever else - that
> > > > are represented by the IOMMUObject have in common, from a functional
> > > > point of view.
> > > > 
> > > > Even understanding the SVM stuff better than I did, I don't really see
> > > > why an AddressSpace is an obvious unit to have an IOMMUObject
> > > > associated with it.
> > > 
> > > Here's what I thought about it: IOMMUObject was planned to be the
> > > abstraction of the hardware translation unit, which is a higher level
> > > of the translated address spaces.  Say, for each PCI device, it can
> > > have its own translated address space.  However for multiple PCI
> > > devices, they can be sharing the same translation unit that handles
> > > the translation requests from different devices.  That's the case for
> > > Intel platforms.  We introduced this IOMMUObject because sometimes we
> > > want to do something with that translation unit rather than a specific
> > > device, in which we need a general IOMMU device handle.
> > 
> > Ok, but what does "hardware translation unit" mean in practice.  The
> > guest neither knows nor cares, which bits of IOMMU translation happen
> > to be included in the same bundle of silicon.  It only cares what the
> > behaviour is.  What behavioural characteristics does a single
> > IOMMUObject have?
> > 
> > > IIRC one issue left over during last time's discussion was that there
> > > could be more complicated IOMMU models. E.g., one device's DMA request
> > > can be translated nestedly by two or multiple IOMMUs, and current
> > > proposal cannot really handle that complicated hierarchy.  I'm just
> > > thinking whether we can start from a simple model (say, we don't allow
> > > nested IOMMUs, and actually we don't even allow multiple IOMMUs so
> > > far), then we can evolve from that point in the future.
> > > 
> > > Also, I thought there was something you mentioned that this approach
> > > is not correct for Power systems, but I can't really remember the
> > > details...  Anyways, I think this is not the only approach to solve
> > > the problem, and I believe any new better idea would be greatly
> > > welcomed as well. :)
> > 
> > So, some of my initial comments were based on a misunderstanding of
> > what was proposed here - since discussing this with Yi at LinuxCon
> > Beijing, I have a better idea of what's going on.
> > 
> > On POWER - or rather the "pseries" platform, which is paravirtualized.
> > We can have multiple vIOMMU windows (usually 2) for a single virtual
> 
> On POWER, the DMA isolation is done by allocating different DMA window
> to different isolation domains? And a single isolation domain may include
> multiple dma windows? So with or without IOMMU, there is only a single
> DMA address shared by all the devices in the system? The isolation 
> mechanism is as what described above?

No, the multiple windows are completely unrelated to how things are
isolated.

Just like on x86, each IOMMU domain has independent IOMMU mappings.
The only difference is that IBM calls the domains "partitionable
endpoints" (PEs) and they tend to be statically created at boot time,
rather than runtime generated.

The windows are about what addresses in PCI space are translated by
the IOMMU.  If the device generates a PCI cycle, only certain
addresses will be mapped by the IOMMU to DMA - other addresses will
correspond to other devices' MMIOs, MSI vectors, maybe other things.

The set of addresses translated by the IOMMU need not be contiguous.
Or, there could be two IOMMUs on the bus, each accepting different
address ranges.  These two situations are not distinguishable from the
guest's point of view.

So for a typical PAPR setup, the device can access system RAM either
via DMA in the range 0..1GiB (the "32-bit window") or in the range
2^59..2^59+<something> (the "64-bit window").  Typically the 32-bit
window has mappings dynamically created by drivers, and the 64-bit
window has all of system RAM mapped 1:1, but that's entirely up to the
OS, it can map each window however it wants.

32-bit devices (or "64 bit" devices which don't actually implement
enough of the address bits) will only be able to use the 32-bit window,
of course.

MMIOs of other devices, the "magic" MSI-X addresses belonging to the
host bridge and other things exist outside those ranges.  Those are
just the ranges which are used to DMA to RAM.

Each PE (domain) can see a different version of what's in each
window.

In fact, if I understand the "IO hole" correctly, the situation on x86
isn't very different.  It has a window below the IO hole and a second
window above the IO hole.  The addresses within the IO hole go to
(32-bit) devices on the PCI bus, rather than being translated by the
IOMMU to RAM addresses.  Because the gap is smaller between the two
windows, I think we get away without really modelling this detail in
qemu though.

> > PCI host bridge.  Because of the paravirtualization, the mapping to
> > hardware is fuzzy, but for passthrough devices they will both be
> > implemented by the IOMMU built into the physical host bridge.  That
> > isn't important to the guest, though - all operations happen at the
> > window level.
> 
> On VT-d, with IOMMU presented, each isolation domain has its own address
> space. That's why we talked more on address space level. And iommu makes
> the difference. That's the behavioural characteristics a single iommu
> translation unit has. And thus an IOMMUObject going to have.

Right, that's the same on POWER.  But the IOMMU only translates *some*
addresses within the address space, not all of them.  The rest will go
to other PCI devices or be unmapped, but won't go to RAM.

That's why the IOMMU should really be associated with an MR (or
several MRs), not an AddressSpace, it only translates some addresses.
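To make that concrete, here is a minimal sketch (invented names, arbitrary sizes, and the real memory_region_init_iommu() call left out) of a PAPR-style layout: one PCI address space whose root MR holds two translated DMA windows, with everything in between left to BARs, MSI doorbells and so on:

#include "qemu/osdep.h"
#include "exec/memory.h"

/* Sketch only; in a real host bridge these MRs/AS would be fields of the
 * bridge's device state and the windows would be IOMMU memory regions. */
static MemoryRegion pci_root, dma_win_32, dma_win_64;
static AddressSpace pci_dma_as;

static void my_bridge_init_dma(Object *owner)
{
    memory_region_init(&pci_root, owner, "pci-root", UINT64_MAX);

    /* 32-bit window: IOVA 0 .. 1 GiB is translated by the IOMMU. */
    memory_region_init(&dma_win_32, owner, "dma-window-32", 0x40000000ULL);
    memory_region_add_subregion(&pci_root, 0, &dma_win_32);

    /* 64-bit window: IOVA 2^59 .. 2^59 + 64 GiB is also translated. */
    memory_region_init(&dma_win_64, owner, "dma-window-64", 0x1000000000ULL);
    memory_region_add_subregion(&pci_root, 1ULL << 59, &dma_win_64);

    /* Anything in pci_root not covered by a window (other devices' BARs,
     * MSI doorbells, ...) is reached without IOMMU translation. */
    address_space_init(&pci_dma_as, &pci_root, "pci-dma");
}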

> > The other thing that bothers me here is the way it's attached to an
> > AddressSpace.
> 
> My consideration is iommu handles AddressSpaces. dma address space is also
> an address space managed by iommu.

No, it's not.  It's a region (or several) within the overall PCI
address space.  Other things in the address space, such as other
devices' BARs, exist independently of the IOMMU.

It's not something that could really work with PCI-E, I think, but
with a more traditional PCI bus there's no reason you couldn't have
multiple IOMMUs listening on different regions of the PCI address
space.

> That's why we believe it is fine to
> associate dma address space with an IOMMUObject.

> >  IIUC how SVM works, the whole point is that the device
> > no longer writes into a specific PCI address space.  Instead, it
> > writes directly into a process address space.  So it seems to me more
> > that SVM should operate at the PCI level, and disassociate the device
> > from the normal PCI address space entirely, rather than hooking up
> > something via that address space.
> 
> As Peter replied, we still need the PCI address space, it would be used
> to build up the 2nd level page table which would be used in nested
> translation.
> 
> Thanks,
> Yi L
> 
> > 
>
David Gibson Dec. 18, 2017, 6:30 a.m. UTC | #16
On Tue, Nov 14, 2017 at 09:53:07AM +0100, Auger Eric wrote:
> Hi Yi L,
> 
> On 13/11/2017 10:58, Liu, Yi L wrote:
> > On Mon, Nov 13, 2017 at 04:56:01PM +1100, David Gibson wrote:
> >> On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote:
> >>> From: Peter Xu <peterx@redhat.com>
> >>>
> >>> AddressSpaceOps is similar to MemoryRegionOps, it's just for address
> >>> spaces to store arch-specific hooks.
> >>>
> >>> The first hook I would like to introduce is iommu_get(). Return an
> >>> IOMMUObject behind the AddressSpace.
> >>>
> >>> For systems that have IOMMUs, we will create a special address
> >>> space per device which is different from system default address
> >>> space for it (please refer to pci_device_iommu_address_space()).
> >>> Normally when that happens, there will be one specific IOMMU (or
> >>> say, translation unit) stands right behind that new address space.
> >>>
> >>> This iommu_get() fetches that guy behind the address space. Here,
> >>> the guy is defined as IOMMUObject, which includes a notifier_list
> >>> so far, may extend in future. Along with IOMMUObject, a new iommu
> >>> notifier mechanism is introduced. It would be used for virt-svm.
> >>> Also IOMMUObject can further have a IOMMUObjectOps which is similar
> >>> to MemoryRegionOps. The difference is IOMMUObjectOps is not relied
> >>> on MemoryRegion.
> >>>
> >>> Signed-off-by: Peter Xu <peterx@redhat.com>
> >>> Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>
> >>
> >> Hi, sorry I didn't reply to the earlier postings of this after our
> >> discussion in China.  I've been sick several times and very busy.
> > 
> > Hi David,
> > 
> > Fully understood. I'll try my best to address your question. Also,
> > feel free to input further questions, anyhow, the more we discuss the
> > better work we done.
> > 
> >> I still don't feel like there's an adequate explanation of exactly
> >> what an IOMMUObject represents.   Obviously it can represent more than
> > 
> > IOMMUObject is aimed to represent the iommu itself. e.g. the iommu
> > specific operations. One of the key purpose of IOMMUObject is to
> > introduce a notifier framework to let iommu emulator to be able to
> > do iommu operations other than MAP/UNMAP. As IOMMU grows more and
> > more feature, MAP/UNMAP is not the only operation iommu emulator needs
> > to deal. e.g. shared virtual memory. So far, as I know AMD/ARM also
> > has it. may correct me on it. As my cover letter mentioned, MR based
> > notifier framework doesn’t work for the newly added IOMMU operations.
> > Like bind guest pasid table pointer to host and propagate guest's
> > iotlb flush to host.
> > 
> >> a single translation window - since that's represented by the
> >> IOMMUMR.  But what exactly do all the MRs - or whatever else - that
> >> are represented by the IOMMUObject have in common, from a functional
> >> point of view.
> > 
> > Let me take virt-SVM as an example. As far as I know, for virt-SVM,
> > the implementation of different vendors are similar. The key design
> > is to have a nested translation(aka. two stage translation). It is to
> > have guest maintain gVA->gPA mapping and hypervisor builds gPA->hPA
> > mapping. Similar to EPT based virt-MMU solution.
> > 
> > In Qemu, gPA->hPA mapping is done through MAP/UNMAP notifier, it can
> > keep going. But for gVA->gPA mapping, only guest knows it, so hypervisor
> > needs to trap specific guest iommu operation and pass the gVA->gPA
> > mapping knowledge to host through a notifier(newly added one). In VT-d,
> > it is called bind guest pasid table to host.
> 
> What I don't get is the PASID table is per extended context entry. I
> understand this latter is indexed by PCI device function. And today MR
> are created per PCIe device if I am not wrong. So why can't we have 1
> new MR notifier dedicated to PASID table passing? My understanding is
> the MR, having a 1-1 correspondence with a PCIe device and thus a
> context could be of right granularity.

Not really.  The MR(s) and AS are created per group of devices which
will always see the same mappings.  On Intel that's the IOMMU domain.
On PAPR that's a partitionable endpoint (PE) - except that we choose to
only have one PE per guest host bridge (but multiple host bridges are
standard for POWER).

There's a qemu hook to get the right AS for a device, which takes the
devfn as a parameter.  Depending on the host bridge implementation,
though, it won't necessarily return a different AS for every device.
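That hook is roughly the following (pci_setup_iommu(), PCIIOMMUFunc and pci_device_iommu_address_space() are the existing QEMU interfaces; "MyHostBridgeState" and the one-shared-AS-per-bridge policy are made up for illustration):

#include "qemu/osdep.h"
#include "hw/pci/pci.h"
#include "exec/memory.h"

typedef struct MyHostBridgeState {
    AddressSpace dma_as;   /* shared by every device behind this bridge */
} MyHostBridgeState;

static AddressSpace *my_bridge_dma_as(PCIBus *bus, void *opaque, int devfn)
{
    MyHostBridgeState *s = opaque;

    /* A finer-grained implementation could return a different AS per
     * devfn (one per IOMMU domain / PE); this sketch does not. */
    return &s->dma_as;
}

static void my_bridge_realize_dma(PCIBus *bus, MyHostBridgeState *s)
{
    pci_setup_iommu(bus, my_bridge_dma_as, s);
}

Devices then pick the result up via pci_device_iommu_address_space(dev).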

> Then I understand the only flags
> we currently have are NONE, MAP and UNMAP but couldn't we add a new one
> for PASID TABLE passing? So this is not crystal clear to me why MR
> notifiers are not adapted to PASID table passing.

Right, it's not clear to me either.  Things get more complicated if both the 1st
level (per PASID) and 2nd level translations (per PCI RID) are visible
to the guest.  Having level 1 owned by the guest and 2nd level owned
by the host is the typical mode of operation, but if we want to model
bare metal machines we do need to handle the case of both.  Similarly,
implementing virt-SVM can't go and break our modelling of
"traditional" non-PASID aware IOMMUs.  Those are not usually present
in x86 guests, although they can be, and they are *always* present for
PAPR guests.

> > Also, for the gVA iotlb flushing, only guest knows it. So hypervisor
> > needs to propagate it to host. Here, MAP/UNMAP is not suitable since
> > this gVA iotlb flush here doesn’t require to modify host iommu
> > translation table.
> I don't really get this argument. IOMMUNotifier is just a notifier that
> is attached to an IOMMU MR and calls an IOMMUNotify function, right?
> Then the role of the function currently is attached to the currently
> existing flags, MAP, UNMAP. This is not linked to an action on the
> physical IOMMU, right?

Maybe, maybe not.  In the case of emulated devices, it need not touch
the host MMU.  However, for the case of VFIO devices, we need to
mirror mappings in the guest IOMMU to the host IOMMU.
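That mirroring is what the existing MAP/UNMAP notifier flags are for; very roughly (a sketch of the registration side only, not the actual vfio code -- "mirror_notify" and "guest_iommu_mr" are placeholders):

#include "qemu/osdep.h"
#include "exec/memory.h"

static void mirror_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
    if (iotlb->perm == IOMMU_NONE) {
        /* Guest unmapped iova .. iova + addr_mask: tear down the
         * corresponding host IOMMU mapping. */
    } else {
        /* Guest created a mapping: install iotlb->translated_addr at
         * iotlb->iova in the host IOMMU (for vfio, via the container). */
    }
}

static IOMMUNotifier mirror_notifier;

static void mirror_setup(MemoryRegion *guest_iommu_mr)
{
    /* Listen for both maps and unmaps over the whole translated range. */
    iommu_notifier_init(&mirror_notifier, mirror_notify, IOMMU_NOTIFIER_ALL,
                        0, UINT64_MAX);
    memory_region_register_iommu_notifier(guest_iommu_mr, &mirror_notifier);
}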
Liu, Yi L Dec. 18, 2017, 9:17 a.m. UTC | #17
On Mon, Dec 18, 2017 at 05:14:42PM +1100, David Gibson wrote:
> On Thu, Nov 16, 2017 at 04:57:09PM +0800, Liu, Yi L wrote:
> > Hi David,
> > 
> > On Tue, Nov 14, 2017 at 11:59:34AM +1100, David Gibson wrote:
> > > On Mon, Nov 13, 2017 at 04:28:45PM +0800, Peter Xu wrote:
> > > > On Mon, Nov 13, 2017 at 04:56:01PM +1100, David Gibson wrote:
> > > > > On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote:
> > > > > > From: Peter Xu <peterx@redhat.com>
> > > > > > 
> > > > > > AddressSpaceOps is similar to MemoryRegionOps, it's just for address
> > > > > > spaces to store arch-specific hooks.
> > > > > > 
> > > > > > The first hook I would like to introduce is iommu_get(). Return an
> > > > > > IOMMUObject behind the AddressSpace.
> > > > > > 
> > > > > > For systems that have IOMMUs, we will create a special address
> > > > > > space per device which is different from system default address
> > > > > > space for it (please refer to pci_device_iommu_address_space()).
> > > > > > Normally when that happens, there will be one specific IOMMU (or
> > > > > > say, translation unit) stands right behind that new address space.
> > > > > > 
> > > > > > This iommu_get() fetches that guy behind the address space. Here,
> > > > > > the guy is defined as IOMMUObject, which includes a notifier_list
> > > > > > so far, may extend in future. Along with IOMMUObject, a new iommu
> > > > > > notifier mechanism is introduced. It would be used for virt-svm.
> > > > > > Also IOMMUObject can further have a IOMMUObjectOps which is similar
> > > > > > to MemoryRegionOps. The difference is IOMMUObjectOps is not relied
> > > > > > on MemoryRegion.
> > > > > > 
> > > > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > > > > Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>
> > > > > 
> > > > > Hi, sorry I didn't reply to the earlier postings of this after our
> > > > > discussion in China.  I've been sick several times and very busy.
> > > > > 
> > > > > I still don't feel like there's an adequate explanation of exactly
> > > > > what an IOMMUObject represents.   Obviously it can represent more than
> > > > > a single translation window - since that's represented by the
> > > > > IOMMUMR.  But what exactly do all the MRs - or whatever else - that
> > > > > are represented by the IOMMUObject have in common, from a functional
> > > > > point of view.
> > > > > 
> > > > > Even understanding the SVM stuff better than I did, I don't really see
> > > > > why an AddressSpace is an obvious unit to have an IOMMUObject
> > > > > associated with it.
> > > > 
> > > > Here's what I thought about it: IOMMUObject was planned to be the
> > > > abstraction of the hardware translation unit, which is a higher level
> > > > of the translated address spaces.  Say, for each PCI device, it can
> > > > have its own translated address space.  However for multiple PCI
> > > > devices, they can be sharing the same translation unit that handles
> > > > the translation requests from different devices.  That's the case for
> > > > Intel platforms.  We introduced this IOMMUObject because sometimes we
> > > > want to do something with that translation unit rather than a specific
> > > > device, in which we need a general IOMMU device handle.
> > > 
> > > Ok, but what does "hardware translation unit" mean in practice.  The
> > > guest neither knows nor cares, which bits of IOMMU translation happen
> > > to be included in the same bundle of silicon.  It only cares what the
> > > behaviour is.  What behavioural characteristics does a single
> > > IOMMUObject have?
> > > 
> > > > IIRC one issue left over during last time's discussion was that there
> > > > could be more complicated IOMMU models. E.g., one device's DMA request
> > > > can be translated nestedly by two or multiple IOMMUs, and current
> > > > proposal cannot really handle that complicated hierarchy.  I'm just
> > > > thinking whether we can start from a simple model (say, we don't allow
> > > > nested IOMMUs, and actually we don't even allow multiple IOMMUs so
> > > > far), then we can evolve from that point in the future.
> > > > 
> > > > Also, I thought there was something you mentioned that this approach
> > > > is not correct for Power systems, but I can't really remember the
> > > > details...  Anyways, I think this is not the only approach to solve
> > > > the problem, and I believe any new better idea would be greatly
> > > > welcomed as well. :)
> > > 
> > > So, some of my initial comments were based on a misunderstanding of
> > > what was proposed here - since discussing this with Yi at LinuxCon
> > > Beijing, I have a better idea of what's going on.
> > > 
> > > On POWER - or rather the "pseries" platform, which is paravirtualized.
> > > We can have multiple vIOMMU windows (usually 2) for a single virtual
> > 
> > On POWER, the DMA isolation is done by allocating different DMA window
> > to different isolation domains? And a single isolation domain may include
> > multiple dma windows? So with or without IOMMU, there is only a single
> > DMA address shared by all the devices in the system? The isolation 
> > mechanism is as what described above?
> 
> No, the multiple windows are completely unrelated to how things are
> isolated.

I'm afraid I chose the wrong word by using "DMA window"...
Actually, when mentioning "DMA window", I mean address ranges in an iova
address space. Anyhow, let me re-shape my understanding of the POWER IOMMU
and make sure we are on the same page.

> 
> Just like on x86, each IOMMU domain has independent IOMMU mappings.
> The only difference is that IBM calls the domains "partitionable
> endpoints" (PEs) and they tend to be statically created at boot time,
> rather than runtime generated.

Does the POWER IOMMU also have an iova concept? A device can use an iova
to access memory, and the IOMMU translates the iova to an address within
the system physical address space?

> 
> The windows are about what addresses in PCI space are translated by
> the IOMMU.  If the device generates a PCI cycle, only certain
> addresses will be mapped by the IOMMU to DMA - other addresses will
> correspond to other devices MMIOs, MSI vectors, maybe other things.

I guess the windows you mentioned here are the address ranges within the
system physical address space, as you also mentioned MMIOs etc.

> The set of addresses translated by the IOMMU need not be contiguous.

I suppose you mean the output addresses of the IOMMU need not be
contiguous?

> Or, there could be two IOMMUs on the bus, each accepting different
> address ranges.  These two situations are not distinguishable from the
> guest's point of view.
> 
> So for a typical PAPR setup, the device can access system RAM either
> via DMA in the range 0..1GiB (the "32-bit window") or in the range
> 2^59..2^59+<something> (the "64-bit window").  Typically the 32-bit
> window has mappings dynamically created by drivers, and the 64-bit
> window has all of system RAM mapped 1:1, but that's entirely up to the
> OS, it can map each window however it wants.
> 
> 32-bit devices (or "64 bit" devices which don't actually implement
> enough the address bits) will only be able to use the 32-bit window,
> of course.
> 
> MMIOs of other devices, the "magic" MSI-X addresses belonging to the
> host bridge and other things exist outside those ranges.  Those are
> just the ranges which are used to DMA to RAM.
> 
> Each PE (domain) can see a different version of what's in each
> window.

If I'm correct so far, a PE actually defines a mapping between an address
range of an address space (aka. the iova address space) and an address
range of the system physical address space.

Then my question is: does each PE define a separate iova address space
which is flat from 0 to 2^AW - 1, where AW is the address width? As a
reference, a VT-d domain defines a flat address space for each domain.

> 
> In fact, if I understand the "IO hole" correctly, the situation on x86
> isn't very different.  It has a window below the IO hole and a second
> window above the IO hole.  The addresses within the IO hole go to
> (32-bit) devices on the PCI bus, rather than being translated by the

If you mean the "IO hole" within the system physcial address space, I think
it's yes.

> IOMMU to RAM addresses.  Because the gap is smaller between the two
> windows, I think we get away without really modelling this detail in
> qemu though.
> 
> > > PCI host bridge.  Because of the paravirtualization, the mapping to
> > > hardware is fuzzy, but for passthrough devices they will both be
> > > implemented by the IOMMU built into the physical host bridge.  That
> > > isn't important to the guest, though - all operations happen at the
> > > window level.
> > 
> > On VT-d, with IOMMU presented, each isolation domain has its own address
> > space. That's why we talked more on address space level. And iommu makes
> > the difference. That's the behavioural characteristics a single iommu
> > translation unit has. And thus an IOMMUObject going to have.
> 
> Right, that's the same on POWER.  But the IOMMU only translates *some*
> addresses within the address space, not all of them.  The rest will go
> to other PCI devices or be unmapped, but won't go to RAM.
> 
> That's why the IOMMU should really be associated with an MR (or
> several MRs), not an AddressSpace, it only translates some addresses.

If I'm correct so far, I do believe the major difference between VT-d and
the POWER IOMMU is that a VT-d isolation domain is a flat address space,
while a PE on POWER is something different (I need your input here as I'm
not sure about it). Maybe it's like there is one flat address space, and
each PE takes some address ranges and maps those ranges to different
system physical address ranges.

> 
> > > The other thing that bothers me here is the way it's attached to an
> > > AddressSpace.
> > 
> > My consideration is iommu handles AddressSpaces. dma address space is also
> > an address space managed by iommu.
> 
> No, it's not.  It's a region (or several) within the overall PCI
> address space.  Other things in the addressspace, such as other
> device's BARs exist independent of the IOMMU.
> 
> It's not something that could really work with PCI-E, I think, but
> with a more traditional PCI bus there's no reason you couldn't have
> multiple IOMMUs listening on different regions of the PCI address
> space.

I think the point here is that on POWER, the input addresses of the IOMMUs
are actually in the same address space? What the IOMMU does is map the
different ranges to different system physical address ranges. So it's as
you mentioned, multiple IOMMUs listen on different regions of a PCI
address space.

While for VT-d, that's not the case. The input addresses of the IOMMUs may
not be in the same address space. As I mentioned, each IOMMU domain on
VT-d is a separate address space. So for VT-d, IOMMUs are actually
listening to different address spaces. That's why we want to have an
address-space-level abstraction of the IOMMU.

> 
> > That's why we believe it is fine to
> > associate dma address space with an IOMMUObject.
> 
> > >  IIUC how SVM works, the whole point is that the device
> > > no longer writes into a specific PCI address space.  Instead, it
> > > writes directly into a process address space.  So it seems to me more
> > > that SVM should operate at the PCI level, and disassociate the device
> > > from the normal PCI address space entirely, rather than hooking up
> > > something via that address space.
> > 
> > As Peter replied, we still need the PCI address space, it would be used
> > to build up the 2nd level page table which would be used in nested
> > translation.
> > 
> > Thanks,
> > Yi L
> > 
> > > 
> > 
> 
> -- 
> David Gibson			| I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
> 				| _way_ _around_!
> http://www.ozlabs.org/~dgibson

Regards,
Yi L
David Gibson Dec. 18, 2017, 11:22 a.m. UTC | #18
On Mon, Dec 18, 2017 at 05:17:35PM +0800, Liu, Yi L wrote:
> On Mon, Dec 18, 2017 at 05:14:42PM +1100, David Gibson wrote:
> > On Thu, Nov 16, 2017 at 04:57:09PM +0800, Liu, Yi L wrote:
> > > Hi David,
> > > 
> > > On Tue, Nov 14, 2017 at 11:59:34AM +1100, David Gibson wrote:
> > > > On Mon, Nov 13, 2017 at 04:28:45PM +0800, Peter Xu wrote:
> > > > > On Mon, Nov 13, 2017 at 04:56:01PM +1100, David Gibson wrote:
> > > > > > On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote:
> > > > > > > From: Peter Xu <peterx@redhat.com>
> > > > > > > 
> > > > > > > AddressSpaceOps is similar to MemoryRegionOps, it's just for address
> > > > > > > spaces to store arch-specific hooks.
> > > > > > > 
> > > > > > > The first hook I would like to introduce is iommu_get(). Return an
> > > > > > > IOMMUObject behind the AddressSpace.
> > > > > > > 
> > > > > > > For systems that have IOMMUs, we will create a special address
> > > > > > > space per device which is different from system default address
> > > > > > > space for it (please refer to pci_device_iommu_address_space()).
> > > > > > > Normally when that happens, there will be one specific IOMMU (or
> > > > > > > say, translation unit) stands right behind that new address space.
> > > > > > > 
> > > > > > > This iommu_get() fetches that guy behind the address space. Here,
> > > > > > > the guy is defined as IOMMUObject, which includes a notifier_list
> > > > > > > so far, may extend in future. Along with IOMMUObject, a new iommu
> > > > > > > notifier mechanism is introduced. It would be used for virt-svm.
> > > > > > > Also IOMMUObject can further have a IOMMUObjectOps which is similar
> > > > > > > to MemoryRegionOps. The difference is IOMMUObjectOps is not relied
> > > > > > > on MemoryRegion.
> > > > > > > 
> > > > > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > > > > > Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>
> > > > > > 
> > > > > > Hi, sorry I didn't reply to the earlier postings of this after our
> > > > > > discussion in China.  I've been sick several times and very busy.
> > > > > > 
> > > > > > I still don't feel like there's an adequate explanation of exactly
> > > > > > what an IOMMUObject represents.   Obviously it can represent more than
> > > > > > a single translation window - since that's represented by the
> > > > > > IOMMUMR.  But what exactly do all the MRs - or whatever else - that
> > > > > > are represented by the IOMMUObject have in common, from a functional
> > > > > > point of view.
> > > > > > 
> > > > > > Even understanding the SVM stuff better than I did, I don't really see
> > > > > > why an AddressSpace is an obvious unit to have an IOMMUObject
> > > > > > associated with it.
> > > > > 
> > > > > Here's what I thought about it: IOMMUObject was planned to be the
> > > > > abstraction of the hardware translation unit, which is a higher level
> > > > > of the translated address spaces.  Say, for each PCI device, it can
> > > > > have its own translated address space.  However for multiple PCI
> > > > > devices, they can be sharing the same translation unit that handles
> > > > > the translation requests from different devices.  That's the case for
> > > > > Intel platforms.  We introduced this IOMMUObject because sometimes we
> > > > > want to do something with that translation unit rather than a specific
> > > > > device, in which we need a general IOMMU device handle.
> > > > 
> > > > Ok, but what does "hardware translation unit" mean in practice.  The
> > > > guest neither knows nor cares, which bits of IOMMU translation happen
> > > > to be included in the same bundle of silicon.  It only cares what the
> > > > behaviour is.  What behavioural characteristics does a single
> > > > IOMMUObject have?
> > > > 
> > > > > IIRC one issue left over during last time's discussion was that there
> > > > > could be more complicated IOMMU models. E.g., one device's DMA request
> > > > > can be translated nestedly by two or multiple IOMMUs, and current
> > > > > proposal cannot really handle that complicated hierarchy.  I'm just
> > > > > thinking whether we can start from a simple model (say, we don't allow
> > > > > nested IOMMUs, and actually we don't even allow multiple IOMMUs so
> > > > > far), then we can evolve from that point in the future.
> > > > > 
> > > > > Also, I thought there was something you mentioned that this approach
> > > > > is not correct for Power systems, but I can't really remember the
> > > > > details...  Anyways, I think this is not the only approach to solve
> > > > > the problem, and I believe any new better idea would be greatly
> > > > > welcomed as well. :)
> > > > 
> > > > So, some of my initial comments were based on a misunderstanding of
> > > > what was proposed here - since discussing this with Yi at LinuxCon
> > > > Beijing, I have a better idea of what's going on.
> > > > 
> > > > On POWER - or rather the "pseries" platform, which is paravirtualized.
> > > > We can have multiple vIOMMU windows (usually 2) for a single virtual
> > > 
> > > On POWER, the DMA isolation is done by allocating different DMA window
> > > to different isolation domains? And a single isolation domain may include
> > > multiple dma windows? So with or without IOMMU, there is only a single
> > > DMA address shared by all the devices in the system? The isolation 
> > > mechanism is as what described above?
> > 
> > No, the multiple windows are completely unrelated to how things are
> > isolated.
> 
> I'm afraid I chose a wrong word by using "DMA window"..
> Actually, when mentioning "DMA window", I mean address ranges in an iova
> address space.

Yes, so did I.  By one window I mean one contiguous range of IOVA addresses.

> Anyhow, let me re-shape my understanding of POWER IOMMU and
> make sure we are in the same page.
> 
> > 
> > Just like on x86, each IOMMU domain has independent IOMMU mappings.
> > The only difference is that IBM calls the domains "partitionable
> > endpoints" (PEs) and they tend to be statically created at boot time,
> > rather than runtime generated.
> 
> Does POWER IOMMU also have iova concept? Device can use an iova to
> access memory, and IOMMU translates the iova to an address within the
> system physical address?

Yes.  When I say the "PCI address space" I mean the IOVA space.

> > The windows are about what addresses in PCI space are translated by
> > the IOMMU.  If the device generates a PCI cycle, only certain
> > addresses will be mapped by the IOMMU to DMA - other addresses will
> > correspond to other devices MMIOs, MSI vectors, maybe other things.
> 
> I guess the windows you mentioned here is the address ranges within the
> system physical address space as you also mentioned MMIOs and etc.

No.  I mean ranges within the PCI space == IOVA space. It's simplest
to understand with traditional PCI.  A cycle on the bus doesn't know
whether the destination is a device or memory, it just has an address
- a PCI memory address.  Part of that address range is mapped to
system RAM, optionally with an IOMMU translating it.  Other parts of
that address space are used for devices.

With PCI-E things get more complicated, but the conceptual model is
the same.

> > The set of addresses translated by the IOMMU need not be contiguous.
> 
> I suppose you mean the output addresses of the IOMMU need not be
> contiguous?

No.  I mean the input addresses of the IOMMU.

> > Or, there could be two IOMMUs on the bus, each accepting different
> > address ranges.  These two situations are not distinguishable from the
> > guest's point of view.
> > 
> > So for a typical PAPR setup, the device can access system RAM either
> > via DMA in the range 0..1GiB (the "32-bit window") or in the range
> > 2^59..2^59+<something> (the "64-bit window").  Typically the 32-bit
> > window has mappings dynamically created by drivers, and the 64-bit
> > window has all of system RAM mapped 1:1, but that's entirely up to the
> > OS, it can map each window however it wants.
> > 
> > 32-bit devices (or "64 bit" devices which don't actually implement
> > enough the address bits) will only be able to use the 32-bit window,
> > of course.
> > 
> > MMIOs of other devices, the "magic" MSI-X addresses belonging to the
> > host bridge and other things exist outside those ranges.  Those are
> > just the ranges which are used to DMA to RAM.
> > 
> > Each PE (domain) can see a different version of what's in each
> > window.
> 
> If I'm correct so far. PE actually defines a mapping between an address
> range of an address space(aka. iova address space) and an address range
> of the system physical address space.

No.  A PE means several things, but basically it is an isolation
domain, like an Intel IOMMU domain.  Each PE has an independent set of
IOMMU mappings which translate part of the PCI address space to system
memory space.

> Then my question is: does each PE define a separate iova address space
> which is flat from 0 - 2^AW -1, AW is address width? As a reference,
> VT-d domain defines a flat address space for each domain.

Partly.  Each PE has an address space which all devices in the PE see.
Only some of that address space is mapped to system memory though,
other parts are occupied by devices, others are unmapped.

Only the parts mapped by the IOMMU vary between PEs - the other parts
of the address space will be identical for all PEs on the host
bridge.  However for POWER guests (not for hosts) there is exactly one
PE for each virtual host bridge.

> > In fact, if I understand the "IO hole" correctly, the situation on x86
> > isn't very different.  It has a window below the IO hole and a second
> > window above the IO hole.  The addresses within the IO hole go to
> > (32-bit) devices on the PCI bus, rather than being translated by the
> 
> If you mean the "IO hole" within the system physcial address space, I think
> it's yes.

Well, really I mean the IO hole in PCI address space.  Because system
address space and PCI memory space were traditionally identity mapped
on x86 this is easy to confuse though.

> > IOMMU to RAM addresses.  Because the gap is smaller between the two
> > windows, I think we get away without really modelling this detail in
> > qemu though.
> > 
> > > > PCI host bridge.  Because of the paravirtualization, the mapping to
> > > > hardware is fuzzy, but for passthrough devices they will both be
> > > > implemented by the IOMMU built into the physical host bridge.  That
> > > > isn't important to the guest, though - all operations happen at the
> > > > window level.
> > > 
> > > On VT-d, with IOMMU presented, each isolation domain has its own address
> > > space. That's why we talked more on address space level. And iommu makes
> > > the difference. That's the behavioural characteristics a single iommu
> > > translation unit has. And thus an IOMMUObject going to have.
> > 
> > Right, that's the same on POWER.  But the IOMMU only translates *some*
> > addresses within the address space, not all of them.  The rest will go
> > to other PCI devices or be unmapped, but won't go to RAM.
> > 
> > That's why the IOMMU should really be associated with an MR (or
> > several MRs), not an AddressSpace, it only translates some addresses.
> 
> If I'm correct so far, I do believe the major difference between VT-d and
> POWER IOMMU is that VT-d isolation domain is a flat address space while
> PE of POWER is something different(need your input here as I'm not sure about
> it). Maybe it's like there is a flat address space, each PE takes some address
> ranges and maps the address ranges to different system physical address ranges.

No, it's really not that different.  In both cases (without virt-SVM)
there's a system memory address space, and a PCI address space for
each domain / PE.  There are one or more "outbound" windows in system
memory space that map system memory cycles to PCI cycles (used by the
CPU to access MMIO) and one or more "inbound" (DMA) windows in PCI
memory space which map PCI cycles onto system memory cycles (used by
devices to access system memory).

On old-style PCs, both inbound and outbound windows were (mostly)
identity maps.  On POWER they are not.

> > > > The other thing that bothers me here is the way it's attached to an
> > > > AddressSpace.
> > > 
> > > My consideration is iommu handles AddressSpaces. dma address space is also
> > > an address space managed by iommu.
> > 
> > No, it's not.  It's a region (or several) within the overall PCI
> > address space.  Other things in the address space, such as other
> > device's BARs exist independent of the IOMMU.
> > 
> > It's not something that could really work with PCI-E, I think, but
> > with a more traditional PCI bus there's no reason you couldn't have
> > multiple IOMMUs listening on different regions of the PCI address
> > space.
> 
> I think the point here is on POWER, the input addresses of IOMMUs are actually
> in the same address space?

I'm not sure what you mean, but I don't think so.  Each PE has its own
IOMMU input address space.

> What IOMMU does is mapping the different ranges to
> different system physical address ranges. So it's as you mentioned, multiple
> IOMMUs listen on different regions of a PCI address space.

No.  That could be the case in theory, but it's not the usual case.

Or rather it depends what you mean by "an IOMMU".  For PAPR guests,
both IOVA 0..1GiB and 2^59..(somewhere) are mapped to system memory,
but with separate page tables.  You could consider that two IOMMUs (we
mostly treat it that way in qemu).  However, all the mapping is
handled by the same host bridge with 2 sets of page tables per PE, so
you could also call it one IOMMU.

This is what I'm getting at when I say that "one IOMMU" is not a
clearly defined unit.

> While for VT-d, it's not the case. The input addresses of IOMMUs may not
> in the same address space. As I mentioned, each IOMMU domain on VT-d is a
> separate address space. So for VT-d, IOMMUs are actually listening to different
> address spaces. That's the point why we want to have address space level
> abstraction of IOMMU.
> 
> > 
> > > That's why we believe it is fine to
> > > associate dma address space with an IOMMUObject.
> > 
> > > >  IIUC how SVM works, the whole point is that the device
> > > > no longer writes into a specific PCI address space.  Instead, it
> > > > writes directly into a process address space.  So it seems to me more
> > > > that SVM should operate at the PCI level, and disassociate the device
> > > > from the normal PCI address space entirely, rather than hooking up
> > > > something via that address space.
> > > 
> > > As Peter replied, we still need the PCI address space, it would be used
> > > to build up the 2nd level page table which would be used in nested
> > > translation.
> > > 
> > > Thanks,
> > > Yi L
> > > 
> > > > 
> > > 
> > 
> 
> Regards,
> Yi L
>
David Gibson Dec. 18, 2017, 11:35 a.m. UTC | #19
On Wed, Nov 15, 2017 at 03:16:32PM +0800, Peter Xu wrote:
> On Tue, Nov 14, 2017 at 10:52:54PM +0100, Auger Eric wrote:
> 
> [...]
> 
> > I meant, in the current intel_iommu code, vtd_find_add_as() creates 1
> > IOMMU MR and 1 AS per PCIe device, right?
> 
> I think this is the most tricky point - in QEMU IOMMU MR is not really
> a 1:1 relationship to devices.  For Intel, it's true; for Power, it's
> not.  On Power guests, one device's DMA address space can be splited
> into different translation windows, while each window corresponds to
> one IOMMU MR.

Right.

> So IMHO the real 1:1 mapping is between the device and its DMA address
> space, rather than MRs.

That's not true either.  With both POWER and Intel, several devices
can share a DMA address space: on POWER if they are in the same PE, on
Intel if they are placed in the same IOMMU domain.

On x86 and on POWER bare metal we generally try to make the minimum
granularity for each PE/domain be a single function.  However, that
may not be possible in the case of PCIe to PCI bridges, or
multifunction devices where the functions aren't properly isolated
from each other (e.g. function 0 debug registers which can affect
other functions are quite common).

For POWER guests we only have one PE/domain per virtual host bridge.
That's just a matter of implementation simplicity - if you want fine
grained isolation you can just create more virtual host bridges.

> It's been a long time since when I drafted the patches.  I think at
> least that should be a more general notifier mechanism comparing to
> current IOMMUNotifier thing, which was bound to IOTLB notifies only.
> AFAICT if we want to trap first-level translation changes, current
> notifier is not even close to that interface - just see the definition
> of IOMMUTLBEntry, it is tailored only for MAP/UNMAP of translation
> addresses, not anything else.  And IMHO that's why it's tightly bound
> to MemoryRegions, and that's the root problem.  The dynamic IOMMU MR
> switching problem is related to this issue as well.

So, having read and thought a bunch more, I think I know where you
need to start hooking this in.  The thing is the current qemu PCI DMA
structure assumes that each device belongs to just a single PCI
address space - that's what pci_device_iommu_address_space() returns.

For virt-SVM that's just not true.  IIUC, a virt-SVM capable device
could simultaneously write to multiple process address spaces, since
the process IDs actually go over the bus.

So trying to hook notifiers at the AddressSpace OR MemoryRegion level
just doesn't make sense - if we've picked a single address space for
the device, we've already made a wrong step.

Instead what you need I think is something like:
pci_device_virtsvm_context().  virt-SVM capable devices would need to
call that *before* calling pci_device_iommu_address_space().  Well
rather the virt-SVM capable DMA helpers would need to call that.

That would return a new VirtSVMContext (or something) object, which
would roughly correspond to a single PASID table.  That's where the
methods and notifiers for managing that would need to go.
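Spelled out as a sketch (all of these names are hypothetical, just following the naming suggested above -- none of this exists in QEMU):

#include "qemu/osdep.h"
#include "qemu/queue.h"
#include "hw/pci/pci.h"

typedef struct IOMMUSVMNotifier IOMMUSVMNotifier;   /* hypothetical */
typedef struct VirtSVMContext VirtSVMContext;

struct VirtSVMContext {
    hwaddr pasid_table_base;                    /* guest PASID table, once bound */
    QLIST_HEAD(, IOMMUSVMNotifier) notifiers;   /* bind / per-PASID invalidations */
};

/* Called by virt-SVM capable DMA helpers *before*
 * pci_device_iommu_address_space(). */
VirtSVMContext *pci_device_virtsvm_context(PCIDevice *dev);

/* The vIOMMU would then register its hooks on the context rather than on
 * any single AddressSpace or MemoryRegion. */
void virtsvm_context_register_notifier(VirtSVMContext *ctx,
                                       IOMMUSVMNotifier *n);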

> I am not sure current "get IOMMU object from address space" solution
> would be best, maybe it's "too bigger a scope", I think it depends on
> whether in the future we'll have some requirement in such a bigger
> scope (say, something we want to trap from vIOMMU and deliver it to
> host IOMMU which may not even be device-related?  I don't know).  Now
> another alternative I am thinking is, whether we can provide a
> per-device notifier, then it can be bound to PCIDevice rather than
> MemoryRegions, then it will be in device scope.

I think that sounds like a version of what I've suggested above.
David Gibson Dec. 18, 2017, 11:38 a.m. UTC | #20
On Tue, Nov 14, 2017 at 11:21:59AM +0100, Auger Eric wrote:
> Hi Yi L,
> 
> On 03/11/2017 13:01, Liu, Yi L wrote:
> > From: Peter Xu <peterx@redhat.com>
> > 
> > AddressSpaceOps is similar to MemoryRegionOps, it's just for address
> > spaces to store arch-specific hooks.
> > 
> > The first hook I would like to introduce is iommu_get(). Return an
> > IOMMUObject behind the AddressSpace.
> 
> David had an objection in the past about this method, saying that
> several IOMMUs could translate a single AS?
> 
> https://lists.gnu.org/archive/html/qemu-devel/2017-05/msg01610.html
> 
> On ARM I think it works in general:
> In
> https://github.com/torvalds/linux/blob/master/Documentation/devicetree/bindings/pci/pci-iommu.txt,
> it is said
> "a given PCI device can only master through one IOMMU"

That's using a platform specific meaning of what "one IOMMU" means.
In general what's several IOMMUs and what's one IOMMU which responds
to several address regions is not distinguishable from the device's
point of view.
Liu, Yi L Dec. 20, 2017, 6:32 a.m. UTC | #21
On Mon, Dec 18, 2017 at 10:22:18PM +1100, David Gibson wrote:
> On Mon, Dec 18, 2017 at 05:17:35PM +0800, Liu, Yi L wrote:
> > On Mon, Dec 18, 2017 at 05:14:42PM +1100, David Gibson wrote:
> > > On Thu, Nov 16, 2017 at 04:57:09PM +0800, Liu, Yi L wrote:
> > > > Hi David,
> > > > 
> > > > On Tue, Nov 14, 2017 at 11:59:34AM +1100, David Gibson wrote:
> > > > > On Mon, Nov 13, 2017 at 04:28:45PM +0800, Peter Xu wrote:
> > > > > > On Mon, Nov 13, 2017 at 04:56:01PM +1100, David Gibson wrote:
> > > > > > > On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote:
> > > > > > > > From: Peter Xu <peterx@redhat.com>
> > > > > > > > 
> > > > > > > > AddressSpaceOps is similar to MemoryRegionOps, it's just for address
> > > > > > > > spaces to store arch-specific hooks.
> > > > > > > > 
> > > > > > > > The first hook I would like to introduce is iommu_get(). Return an
> > > > > > > > IOMMUObject behind the AddressSpace.
> > > > > > > > 
> > > > > > > > For systems that have IOMMUs, we will create a special address
> > > > > > > > space per device which is different from system default address
> > > > > > > > space for it (please refer to pci_device_iommu_address_space()).
> > > > > > > > Normally when that happens, there will be one specific IOMMU (or
> > > > > > > > say, translation unit) stands right behind that new address space.
> > > > > > > > 
> > > > > > > > This iommu_get() fetches that guy behind the address space. Here,
> > > > > > > > the guy is defined as IOMMUObject, which includes a notifier_list
> > > > > > > > so far, may extend in future. Along with IOMMUObject, a new iommu
> > > > > > > > notifier mechanism is introduced. It would be used for virt-svm.
> > > > > > > > Also IOMMUObject can further have a IOMMUObjectOps which is similar
> > > > > > > > to MemoryRegionOps. The difference is IOMMUObjectOps is not relied
> > > > > > > > on MemoryRegion.
> > > > > > > > 
> > > > > > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > > > > > > Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>
> > > > > > > 
> > > > > > > Hi, sorry I didn't reply to the earlier postings of this after our
> > > > > > > discussion in China.  I've been sick several times and very busy.
> > > > > > > 
> > > > > > > I still don't feel like there's an adequate explanation of exactly
> > > > > > > what an IOMMUObject represents.   Obviously it can represent more than
> > > > > > > a single translation window - since that's represented by the
> > > > > > > IOMMUMR.  But what exactly do all the MRs - or whatever else - that
> > > > > > > are represented by the IOMMUObject have in common, from a functional
> > > > > > > point of view.
> > > > > > > 
> > > > > > > Even understanding the SVM stuff better than I did, I don't really see
> > > > > > > why an AddressSpace is an obvious unit to have an IOMMUObject
> > > > > > > associated with it.
> > > > > > 
> > > > > > Here's what I thought about it: IOMMUObject was planned to be the
> > > > > > abstraction of the hardware translation unit, which is a higher level
> > > > > > of the translated address spaces.  Say, for each PCI device, it can
> > > > > > have its own translated address space.  However for multiple PCI
> > > > > > devices, they can be sharing the same translation unit that handles
> > > > > > the translation requests from different devices.  That's the case for
> > > > > > Intel platforms.  We introduced this IOMMUObject because sometimes we
> > > > > > want to do something with that translation unit rather than a specific
> > > > > > device, in which we need a general IOMMU device handle.
> > > > > 
> > > > > Ok, but what does "hardware translation unit" mean in practice.  The
> > > > > guest neither knows nor cares, which bits of IOMMU translation happen
> > > > > to be included in the same bundle of silicon.  It only cares what the
> > > > > behaviour is.  What behavioural characteristics does a single
> > > > > IOMMUObject have?
> > > > > 
> > > > > > IIRC one issue left over during last time's discussion was that there
> > > > > > could be more complicated IOMMU models. E.g., one device's DMA request
> > > > > > can be translated nestedly by two or multiple IOMMUs, and current
> > > > > > proposal cannot really handle that complicated hierarchy.  I'm just
> > > > > > thinking whether we can start from a simple model (say, we don't allow
> > > > > > nested IOMMUs, and actually we don't even allow multiple IOMMUs so
> > > > > > far), then we can evolve from that point in the future.
> > > > > > 
> > > > > > Also, I thought there was something you mentioned that this approach
> > > > > > is not correct for Power systems, but I can't really remember the
> > > > > > details...  Anyways, I think this is not the only approach to solve
> > > > > > the problem, and I believe any new better idea would be greatly
> > > > > > welcomed as well. :)
> > > > > 
> > > > > So, some of my initial comments were based on a misunderstanding of
> > > > > what was proposed here - since discussing this with Yi at LinuxCon
> > > > > Beijing, I have a better idea of what's going on.
> > > > > 
> > > > > On POWER - or rather the "pseries" platform, which is paravirtualized.
> > > > > We can have multiple vIOMMU windows (usually 2) for a single virtual
> > > > 
> > > > On POWER, the DMA isolation is done by allocating different DMA window
> > > > to different isolation domains? And a single isolation domain may include
> > > > multiple dma windows? So with or without IOMMU, there is only a single
> > > > DMA address shared by all the devices in the system? The isolation 
> > > > mechanism is as what described above?
> > > 
> > > No, the multiple windows are completely unrelated to how things are
> > > isolated.
> > 
> > I'm afraid I chose a wrong word by using "DMA window"..
> > Actually, when mentioning "DMA window", I mean address ranges in an iova
> > address space.
> 
> Yes, so did I.  My one window I mean one contiguous range of IOVA addresses.
> 
> > Anyhow, let me re-shape my understanding of POWER IOMMU and
> > make sure we are in the same page.
> > 
> > > 
> > > Just like on x86, each IOMMU domain has independent IOMMU mappings.
> > > The only difference is that IBM calls the domains "partitionable
> > > endpoints" (PEs) and they tend to be statically created at boot time,
> > > rather than runtime generated.
> > 
> > Does POWER IOMMU also have iova concept? Device can use an iova to
> > access memory, and IOMMU translates the iova to an address within the
> > system physical address?
> 
> Yes.  When I say the "PCI address space" I mean the IOVA space.
> 
> > > The windows are about what addresses in PCI space are translated by
> > > the IOMMU.  If the device generates a PCI cycle, only certain
> > > addresses will be mapped by the IOMMU to DMA - other addresses will
> > > correspond to other devices MMIOs, MSI vectors, maybe other things.
> > 
> > I guess the windows you mentioned here is the address ranges within the
> > system physical address space as you also mentioned MMIOs and etc.
> 
> No.  I mean ranges within the PCI space == IOVA space. It's simplest
> to understand with traditional PCI.  A cycle on the bus doesn't know
> whether the destination is a device or memory, it just has an address
> - a PCI memory address.  Part of that address range is mapped to
> system RAM, optionally with an IOMMU translating it.  Other parts of
> that address space are used for devices.
> 
> With PCI-E things get more complicated, but the conceptual model is
> the same.
> 
> > > The set of addresses translated by the IOMMU need not be contiguous.
> > 
> > I suppose you mean the output addresses of the IOMMU need not be
> > contiguous?
> 
> No.  I mean the input addresses of the IOMMU.
> 
> > > Or, there could be two IOMMUs on the bus, each accepting different
> > > address ranges.  These two situations are not distinguishable from the
> > > guest's point of view.
> > > 
> > > So for a typical PAPR setup, the device can access system RAM either
> > > via DMA in the range 0..1GiB (the "32-bit window") or in the range
> > > 2^59..2^59+<something> (the "64-bit window").  Typically the 32-bit
> > > window has mappings dynamically created by drivers, and the 64-bit
> > > window has all of system RAM mapped 1:1, but that's entirely up to the
> > > OS, it can map each window however it wants.
> > > 
> > > 32-bit devices (or "64 bit" devices which don't actually implement
> > > enough the address bits) will only be able to use the 32-bit window,
> > > of course.
> > > 
> > > MMIOs of other devices, the "magic" MSI-X addresses belonging to the
> > > host bridge and other things exist outside those ranges.  Those are
> > > just the ranges which are used to DMA to RAM.
> > > 
> > > Each PE (domain) can see a different version of what's in each
> > > window.
> > 
> > If I'm correct so far. PE actually defines a mapping between an address
> > range of an address space(aka. iova address space) and an address range
> > of the system physical address space.
> 
> No.  A PE means several things, but basically it is an isolation
> domain, like an Intel IOMMU domain.  Each PE has an independent set of
> IOMMU mappings which translate part of the PCI address space to system
> memory space.
> 
> > Then my question is: does each PE define a separate iova address sapce
> > which is flat from 0 - 2^AW -1, AW is address width? As a reference,
> > VT-d domain defines a flat address space for each domain.
> 
> Partly.  Each PE has an address space which all devices in the PE see.
> Only some of that address space is mapped to system memory though,
> other parts are occupied by devices, others are unmapped.
> 
> Only the parts mapped by the IOMMU vary between PEs - the other parts
> of the address space will be identical for all PEs on the host

Thanks, this comment clears things up for me. This is different from what
we have on VT-d.

> bridge.  However for POWER guests (not for hosts) there is exactly one
> PE for each virtual host bridge.
> 
> > > In fact, if I understand the "IO hole" correctly, the situation on x86
> > > isn't very different.  It has a window below the IO hole and a second
> > > window above the IO hole.  The addresses within the IO hole go to
> > > (32-bit) devices on the PCI bus, rather than being translated by the
> > 
> > If you mean the "IO hole" within the system physcial address space, I think
> > it's yes.
> 
> Well, really I mean the IO hole in PCI address space.  Because system
> address space and PCI memory space were traditionally identity mapped
> on x86 this is easy to confuse though.
> 
> > > IOMMU to RAM addresses.  Because the gap is smaller between the two
> > > windows, I think we get away without really modelling this detail in
> > > qemu though.
> > > 
> > > > > PCI host bridge.  Because of the paravirtualization, the mapping to
> > > > > hardware is fuzzy, but for passthrough devices they will both be
> > > > > implemented by the IOMMU built into the physical host bridge.  That
> > > > > isn't importat to the guest, though - all operations happen at the
> > > > > window level.
> > > > 
> > > > On VT-d, with IOMMU presented, each isolation domain has its own address
> > > > space. That's why we talked more on address space level. And iommu makes
> > > > the difference. That's the behavioural characteristics a single iommu
> > > > translation unit has. And thus an IOMMUObject going to have.
> > > 
> > > Right, that's the same on POWER.  But the IOMMU only translates *some*
> > > addresses within the address space, not all of them.  The rest will go
> > > to other PCI devices or be unmapped, but won't go to RAM.
> > > 
> > > That's why the IOMMU should really be associated with an MR (or
> > > several MRs), not an AddressSpace, it only translates some addresses.
> > 
> > If I'm correct so far, I do believe the major difference between VT-d and
> > POWER IOMMU is that VT-d isolation domain is a flat address space while
> > PE of POWER is something different(need your input here as I'm not sure about
> > it). Maybe it's like there is a flat address space, each PE takes some address
> > ranges and maps the address ranges to different system physcial address ranges.
> 
> No, it's really not that different.  In both cases (without virt-SVM)
> there's a system memory address space, and a PCI address space for
> each domain / PE.  There are one or more "outbound" windows in system
> memory space that map system memory cycles to PCI cycles (used by the
> CPU to access MMIO) and one or more "inbound" (DMA) windows in PCI
> memory space which map PCI cycles onto system memory cycles (used by
> devices to access system memory).
> 
> On old-style PCs, both inbound and outbound windows were (mostly)
> identity maps.  On POWER they are not.
> 
> > > > > The other thing that bothers me here is the way it's attached to an
> > > > > AddressSpace.
> > > > 
> > > > My consideration is iommu handles AddressSpaces. dma address space is also
> > > > an address space managed by iommu.
> > > 
> > > No, it's not.  It's a region (or several) within the overall PCI
> > > address space.  Other things in the addressspace, such as other
> > > device's BARs exist independent of the IOMMU.
> > > 
> > > It's not something that could really work with PCI-E, I think, but
> > > with a more traditional PCI bus there's no reason you couldn't have
> > > multiple IOMMUs listening on different regions of the PCI address
> > > space.
> > 
> > I think the point here is on POWER, the input addresses of IOMMUs are actaully
> > in the same address space?
> 
> I'm not sure what you mean, but I don't think so.  Each PE has its own
> IOMMU input address space.
> 
> > What IOMMU does is mapping the different ranges to
> > different system physcial address ranges. So it's as you mentioned, multiple
> > IOMMUs listen on different regions of a PCI address space.
> 
> No.  That could be the case in theory, but it's not the usual case.
> 
> Or rather it depends what you mean by "an IOMMU".  For PAPR guests,
> both IOVA 0..1GiB and 2^59..(somewhere) are mapped to system memory,
> but with separate page tables.  You could consider that two IOMMUs (we
> mostly treat it that way in qemu).  However, all the mapping is
> handled by the same host bridge with 2 sets of page tables per PE, so
> you could also call it one IOMMU.
> 
> This is what I'm getting at when I say that "one IOMMU" is not a
> clearly defined unit.
> 
> > While for VT-d, it's not the case. The input addresses of IOMMUs may not
> > in the same address sapce. As I mentioned, each IOMMU domain on VT-d is a
> > separate address space. So for VT-d, IOMMUs are actually listening to different
> > address spaces. That's the point why we want to have address space level
> > abstraction of IOMMU.
> > 
> > > 
> > > > That's why we believe it is fine to
> > > > associate dma address space with an IOMMUObject.
> > > 
> > > > >  IIUC how SVM works, the whole point is that the device
> > > > > no longer writes into a specific PCI address space.  Instead, it
> > > > > writes directly into a process address space.  So it seems to me more
> > > > > that SVM should operate at the PCI level, and disassociate the device
> > > > > from the normal PCI address space entirely, rather than hooking up
> > > > > something via that address space.

After thinking about it more, I agree that it is not suitable to hook up
something for the 1st level via the PCI address space. Once both 1st and
2nd level translation are exposed to the guest, a device would write to
multiple address spaces; the PCI address space is only one of them. I think
your reply in another email is a good starting point, so let me reply with
my thoughts under that email.

Regards,
Yi L

> > > > 
> > > > As Peter replied, we still need the PCI address space, it would be used
> > > > to build up the 2nd level page table which would be used in nested
> > > > translation.
> > > > 
> > > > Thanks,
> > > > Yi L
> > > > 
> > > > > 
> > > > 
> > > 
> > 
> > Regards,
> > Yi L
> > 
> 
> -- 
> David Gibson			| I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
> 				| _way_ _around_!
> http://www.ozlabs.org/~dgibson
Liu, Yi L Dec. 20, 2017, 6:47 a.m. UTC | #22
On Mon, Dec 18, 2017 at 10:35:31PM +1100, David Gibson wrote:
> On Wed, Nov 15, 2017 at 03:16:32PM +0800, Peter Xu wrote:
> > On Tue, Nov 14, 2017 at 10:52:54PM +0100, Auger Eric wrote:
> > 
> > [...]
> > 
> > > I meant, in the current intel_iommu code, vtd_find_add_as() creates 1
> > > IOMMU MR and 1 AS per PCIe device, right?
> > 
> > I think this is the most tricky point - in QEMU IOMMU MR is not really
> > a 1:1 relationship to devices.  For Intel, it's true; for Power, it's
> > not.  On Power guests, one device's DMA address space can be splited
> > into different translation windows, while each window corresponds to
> > one IOMMU MR.
> 
> Right.
> 
> > So IMHO the real 1:1 mapping is between the device and its DMA address
> > space, rather than MRs.
> 
> That's not true either.  With both POWER and Intel, several devices
> can share a DMA address space: on POWER if they are in the same PE, on
> Intel if they are place in the same IOMMU domain.
> 
> On x86 and on POWER bare metal we generally try to make the minimum
> granularity for each PE/domain be a single function.  However, that
> may not be possible in the case of PCIe to PCI bridges, or
> multifunction devices where the functions aren't properly isolated
> from each other (e.g. function 0 debug registers which can affect
> other functions are quite common).
> 
> For POWER guests we only have one PE/domain per virtual host bridge.
> That's just a matter of implementation simplicity - if you want fine
> grained isolation you can just create more virtual host bridges.
> 
> > It's been a long time since when I drafted the patches.  I think at
> > least that should be a more general notifier mechanism comparing to
> > current IOMMUNotifier thing, which was bound to IOTLB notifies only.
> > AFAICT if we want to trap first-level translation changes, current
> > notifier is not even close to that interface - just see the definition
> > of IOMMUTLBEntry, it is tailored only for MAP/UNMAP of translation
> > addresses, not anything else.  And IMHO that's why it's tightly bound
> > to MemoryRegions, and that's the root problem.  The dynamic IOMMU MR
> > switching problem is related to this issue as well.
> 
> So, having read and thought a bunch more, I think I know where you
> need to start hooking this in.  The thing is the current qemu PCI DMA
> structure assumes that each device belongs to just a single PCI
> address space - that's what pci_device_iommu_address_space() returns.
> 
> For virt-SVM that's just not true.  IIUC, a virt-SVM capable device
> could simultaneously write to multiple process address spaces, since
> the process IDs actually go over the bus.

Correct.

> 
> So trying to hook notifiers at the AddressSpace OR MemoryRegion level
> just doesn't make sense - if we've picked a single addresss space for
> the device, we've already made a wrong step.

That's also why we want to have notifiers based on IOMMUObject (maybe not
a suitable name, but let me keep using the name from the patch for now).

> 
> Instead what you need I think is something like:
> pci_device_virtsvm_context().  virt-SVM capable devices would need to
> call that *before* calling pci_device_iommu_address_space ().  Well
> rather the virt-SVM capable DMA helpers would need to call that.
> 
> That would return a new VirtSVMContext (or something) object, which
> would roughly correspond to a single PASID table.  That's where the
> methods and notifiers for managing that would need to go.

Correct, pci_device_iommu_address_space() returns an AS, and it is a PCI
address space. And if pci_device_virtsvm_context() is also called in
vfio_realize(), it may not return an AS, since there may be no 1st level
translation page table bound yet.

So, as you said, return a new VirtSVMContext; this VirtSVMContext can hook
some new notifiers. I think the IOMMUObject introduced in this patch can
meet the requirement, but it may need to be renamed.

This also addresses the concern you raised before about hooking the
IOMMUObject up via a PCI address space. As for VirtSVMContext, it could be
a replacement for IOMMUObject. Since it is related to PASID, I'm
considering naming it IOMMUPasidContext or IOMMUPasidObject, so that it
would be an abstraction of all the IOMMU PASID related operations.
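
To make this a bit more concrete, here is a rough sketch of what such a
renamed object might look like (all names and fields below are placeholders
for discussion, not what this patch currently defines):

/* Placeholder sketch only -- not the structure from this patch. */
#include "qemu/queue.h"

typedef struct IOMMUPasidContext IOMMUPasidContext;
typedef struct IOMMUPasidContextOps IOMMUPasidContextOps;

typedef struct IOMMUPasidNotifier IOMMUPasidNotifier;
struct IOMMUPasidNotifier {
    /* called when something PASID-related changes in the vIOMMU */
    void (*notify)(IOMMUPasidNotifier *notifier, void *data);
    QLIST_ENTRY(IOMMUPasidNotifier) node;
};

struct IOMMUPasidContext {
    const IOMMUPasidContextOps *ops;            /* PASID-related callbacks */
    QLIST_HEAD(, IOMMUPasidNotifier) notifiers; /* like IOMMUObject's list  */
};

Such a context would then be handed out by something like the
pci_device_virtsvm_context() you suggested, rather than being fetched from
an AddressSpace.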

Regards,
Yi L

> 
> > I am not sure current "get IOMMU object from address space" solution
> > would be best, maybe it's "too bigger a scope", I think it depends on
> > whether in the future we'll have some requirement in such a bigger
> > scope (say, something we want to trap from vIOMMU and deliver it to
> > host IOMMU which may not even be device-related?  I don't know).  Now
> > another alternative I am thinking is, whether we can provide a
> > per-device notifier, then it can be bound to PCIDevice rather than
> > MemoryRegions, then it will be in device scope.
> 
> I think that sounds like a version of what I've suggested above.
> 
> -- 
> David Gibson			| I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
> 				| _way_ _around_!
> http://www.ozlabs.org/~dgibson
David Gibson Dec. 20, 2017, 11:01 a.m. UTC | #23
On Wed, Dec 20, 2017 at 02:32:42PM +0800, Liu, Yi L wrote:
> On Mon, Dec 18, 2017 at 10:22:18PM +1100, David Gibson wrote:
> > On Mon, Dec 18, 2017 at 05:17:35PM +0800, Liu, Yi L wrote:
> > > On Mon, Dec 18, 2017 at 05:14:42PM +1100, David Gibson wrote:
> > > > On Thu, Nov 16, 2017 at 04:57:09PM +0800, Liu, Yi L wrote:
[snip]
> > > > So for a typical PAPR setup, the device can access system RAM either
> > > > via DMA in the range 0..1GiB (the "32-bit window") or in the range
> > > > 2^59..2^59+<something> (the "64-bit window").  Typically the 32-bit
> > > > window has mappings dynamically created by drivers, and the 64-bit
> > > > window has all of system RAM mapped 1:1, but that's entirely up to the
> > > > OS, it can map each window however it wants.
> > > > 
> > > > 32-bit devices (or "64 bit" devices which don't actually implement
> > > > enough the address bits) will only be able to use the 32-bit window,
> > > > of course.
> > > > 
> > > > MMIOs of other devices, the "magic" MSI-X addresses belonging to the
> > > > host bridge and other things exist outside those ranges.  Those are
> > > > just the ranges which are used to DMA to RAM.
> > > > 
> > > > Each PE (domain) can see a different version of what's in each
> > > > window.
> > > 
> > > If I'm correct so far. PE actually defines a mapping between an address
> > > range of an address space(aka. iova address space) and an address range
> > > of the system physical address space.
> > 
> > No.  A PE means several things, but basically it is an isolation
> > domain, like an Intel IOMMU domain.  Each PE has an independent set of
> > IOMMU mappings which translate part of the PCI address space to system
> > memory space.
> > 
> > > Then my question is: does each PE define a separate iova address sapce
> > > which is flat from 0 - 2^AW -1, AW is address width? As a reference,
> > > VT-d domain defines a flat address space for each domain.
> > 
> > Partly.  Each PE has an address space which all devices in the PE see.
> > Only some of that address space is mapped to system memory though,
> > other parts are occupied by devices, others are unmapped.
> > 
> > Only the parts mapped by the IOMMU vary between PEs - the other parts
> > of the address space will be identical for all PEs on the host
> 
> Thx, this comment addressed me well. This is different from what we have
> on VT-d.

Really?  That's hard to believe.  I'm pretty sure the VT-d IOMMU must
have a range < 2^64, and anything on the bus outside that range I
expect would be common between all domains.  In particular I'd expect
the BARs for other devices not to be remapped by the IOMMU (though
they may be inaccessible on PCI-E due peer to peer transactions being
blocked).  As well as things above the IOMMU's range, I'd expect the
region for 32-bit BARs to be common between all domains.

[snip]
> > > > > >  IIUC how SVM works, the whole point is that the device
> > > > > > no longer writes into a specific PCI address space.  Instead, it
> > > > > > writes directly into a process address space.  So it seems to me more
> > > > > > that SVM should operate at the PCI level, and disassociate the device
> > > > > > from the normal PCI address space entirely, rather than hooking up
> > > > > > something via that address space.
> 
> After thinking more, I agree that it is not suitable to hook up something for
> 1st level via the PCI address space. In the time 1st and 2nd level translation
> is exposed to guest, a device would write to multiple address spaces. PCI address
> space is only one of them. I think your reply in another email is a good start,
> let me reply my thoughts under that email.
> 
> Regards,
> Yi L
> 
> > > > > 
> > > > > As Peter replied, we still need the PCI address space, it would be used
> > > > > to build up the 2nd level page table which would be used in nested
> > > > > translation.
> > > > > 
> > > > > Thanks,
> > > > > Yi L
> > > > > 
> > > > > > 
> > > > > 
> > > > 
> > > 
> > > Regards,
> > > Yi L
> > > 
> > 
> 
>
David Gibson Dec. 20, 2017, 11:18 a.m. UTC | #24
On Wed, Dec 20, 2017 at 02:47:30PM +0800, Liu, Yi L wrote:
> On Mon, Dec 18, 2017 at 10:35:31PM +1100, David Gibson wrote:
> > On Wed, Nov 15, 2017 at 03:16:32PM +0800, Peter Xu wrote:
> > > On Tue, Nov 14, 2017 at 10:52:54PM +0100, Auger Eric wrote:
> > > 
> > > [...]
> > > 
> > > > I meant, in the current intel_iommu code, vtd_find_add_as() creates 1
> > > > IOMMU MR and 1 AS per PCIe device, right?
> > > 
> > > I think this is the most tricky point - in QEMU IOMMU MR is not really
> > > a 1:1 relationship to devices.  For Intel, it's true; for Power, it's
> > > not.  On Power guests, one device's DMA address space can be splited
> > > into different translation windows, while each window corresponds to
> > > one IOMMU MR.
> > 
> > Right.
> > 
> > > So IMHO the real 1:1 mapping is between the device and its DMA address
> > > space, rather than MRs.
> > 
> > That's not true either.  With both POWER and Intel, several devices
> > can share a DMA address space: on POWER if they are in the same PE, on
> > Intel if they are place in the same IOMMU domain.
> > 
> > On x86 and on POWER bare metal we generally try to make the minimum
> > granularity for each PE/domain be a single function.  However, that
> > may not be possible in the case of PCIe to PCI bridges, or
> > multifunction devices where the functions aren't properly isolated
> > from each other (e.g. function 0 debug registers which can affect
> > other functions are quite common).
> > 
> > For POWER guests we only have one PE/domain per virtual host bridge.
> > That's just a matter of implementation simplicity - if you want fine
> > grained isolation you can just create more virtual host bridges.
> > 
> > > It's been a long time since when I drafted the patches.  I think at
> > > least that should be a more general notifier mechanism comparing to
> > > current IOMMUNotifier thing, which was bound to IOTLB notifies only.
> > > AFAICT if we want to trap first-level translation changes, current
> > > notifier is not even close to that interface - just see the definition
> > > of IOMMUTLBEntry, it is tailored only for MAP/UNMAP of translation
> > > addresses, not anything else.  And IMHO that's why it's tightly bound
> > > to MemoryRegions, and that's the root problem.  The dynamic IOMMU MR
> > > switching problem is related to this issue as well.
> > 
> > So, having read and thought a bunch more, I think I know where you
> > need to start hooking this in.  The thing is the current qemu PCI DMA
> > structure assumes that each device belongs to just a single PCI
> > address space - that's what pci_device_iommu_address_space() returns.
> > 
> > For virt-SVM that's just not true.  IIUC, a virt-SVM capable device
> > could simultaneously write to multiple process address spaces, since
> > the process IDs actually go over the bus.
> 
> Correct.
> 
> > So trying to hook notifiers at the AddressSpace OR MemoryRegion level
> > just doesn't make sense - if we've picked a single addresss space for
> > the device, we've already made a wrong step.
> 
> That's also why we want to have notifiers based on IOMMUObject(may be
> not a suitable name, let me use it as the patch named).

Right, I think "IOMMUObject" is a misleading name.

> > Instead what you need I think is something like:
> > pci_device_virtsvm_context().  virt-SVM capable devices would need to
> > call that *before* calling pci_device_iommu_address_space ().  Well
> > rather the virt-SVM capable DMA helpers would need to call that.
> > 
> > That would return a new VirtSVMContext (or something) object, which
> > would roughly correspond to a single PASID table.  That's where the
> > methods and notifiers for managing that would need to go.
> 
> Correct, pci_device_iommu_address_space() returns an AS and it is
> a PCI address space. And if pci_device_virtsvm_context() is also
> called in vfio_realize(), it may not return an AS since there may
> be no 1st level translation page table bound.
> 
> So as you said, return a new VirtSVMContext, this VirtSVMContext can
> hook some new notifiers. I think the IOMMUObject introduced in this patch
> can meet the requirement. But it may be re-named.

Ok.

> So here it addressed the concern you raised before which is hook IOMMUObject
> via a PCI address space. Regards to VirtSVMContext, it may be a replacement
> of IOMMUObject. As it is related to PASID, I'm considering to name it as
> IOMMUPasidContext or IOMMUPasidObject. So it would be an abstraction of all
> the IOMMU PASID related operations.

I'm ok with calling it a "PASID context".

Thinking about this some more, here are some extra observations:

 * I think each device needs both a PASID context and an ordinary
   address space.  The PASID context would be used for bus
   transactions which include a process id, the address space for
   those that don't.

 * Theoretically, the PASID context could be modelled as an array/map
   of AddressSpace objects for each process ID.  However, creating all
   those AddressSpace objects in advance might be too expensive.  I
   can see a couple of options to avoid this:

1) Have the PASID context class include a 'translate' method similar
to the one in IOMMUMemoryRegionClass, but taking a process ID as well
as an address.  This would avoid creating extra AddressSpace objects,
but might require duplicating a bunch of the translation code that
already exists for AddressSpace.

2) "Lazily" create AddressSpace objects.  The generic part of the
PASID aware DMA helper functions would use a cache of AddressSpace's
for particular process IDs, using the AddressSpace (and MemoryRegion
within) to translate accesses for a particular process ID.  However,
these AddressSpace and MemoryRegion objects would only be created when
the device first accesses that address space.  In the common case,
where a single device is just being used by a single process or a
small number, this should keep the number of AddressSpace objects
relatively small.  Obviously the cache would need to be invalidated,
cleaning up the AddressSpace objects, when the PASID table is altered.

 * I realize that the expected case here is with KVM, where the guest
   controls the first level translation, but the host controls the
   second level translation.  However, we should also be able to model
   the case where the guest controls both levels for the sake of full
   system emulation.  I think understanding this case will lead to a
   better design even for the simpler case.

Do you have a plan for what the virt-SVM aware DMA functions will look
like?
Liu, Yi L Dec. 21, 2017, 8:40 a.m. UTC | #25
On Wed, Dec 20, 2017 at 10:18:16PM +1100, David Gibson wrote:
> On Wed, Dec 20, 2017 at 02:47:30PM +0800, Liu, Yi L wrote:
> > On Mon, Dec 18, 2017 at 10:35:31PM +1100, David Gibson wrote:
> > > On Wed, Nov 15, 2017 at 03:16:32PM +0800, Peter Xu wrote:
> > > > On Tue, Nov 14, 2017 at 10:52:54PM +0100, Auger Eric wrote:
> > > > 
> > > > 
[...]
> > > So, having read and thought a bunch more, I think I know where you
> > > need to start hooking this in.  The thing is the current qemu PCI DMA
> > > structure assumes that each device belongs to just a single PCI
> > > address space - that's what pci_device_iommu_address_space() returns.
> > > 
> > > For virt-SVM that's just not true.  IIUC, a virt-SVM capable device
> > > could simultaneously write to multiple process address spaces, since
> > > the process IDs actually go over the bus.
> > 
> > Correct.
> > 
> > > So trying to hook notifiers at the AddressSpace OR MemoryRegion level
> > > just doesn't make sense - if we've picked a single addresss space for
> > > the device, we've already made a wrong step.
> > 
> > That's also why we want to have notifiers based on IOMMUObject(may be
> > not a suitable name, let me use it as the patch named).
> 
> Right, I think "IOMMUObject" is a misleading name.
> 
> > > Instead what you need I think is something like:
> > > pci_device_virtsvm_context().  virt-SVM capable devices would need to
> > > call that *before* calling pci_device_iommu_address_space ().  Well
> > > rather the virt-SVM capable DMA helpers would need to call that.
> > > 
> > > That would return a new VirtSVMContext (or something) object, which
> > > would roughly correspond to a single PASID table.  That's where the
> > > methods and notifiers for managing that would need to go.
> > 
> > Correct, pci_device_iommu_address_space() returns an AS and it is
> > a PCI address space. And if pci_device_virtsvm_context() is also
> > called in vfio_realize(), it may not return an AS since there may
> > be no 1st level translation page table bound.
> > 
> > So as you said, return a new VirtSVMContext, this VirtSVMContext can
> > hook some new notifiers. I think the IOMMUObject introduced in this patch
> > can meet the requirement. But it may be re-named.
> 
> Ok.
> 
> > So here it addressed the concern you raised before which is hook IOMMUObject
> > via a PCI address space. Regards to VirtSVMContext, it may be a replacement
> > of IOMMUObject. As it is related to PASID, I'm considering to name it as
> > IOMMUPasidContext or IOMMUPasidObject. So it would be an abstraction of all
> > the IOMMU PASID related operations.
> 
> I'm ok with calling it a "PASID context".
> 
> Thinking about this some more, here are some extra observations:
> 
>  * I think each device needs both a PASID context and an ordinary
>    address space.  The PASID context would be used for bus
>    transactions which include a process id, the address space for
>    those that don't.

Correct. Also, virt-SVM still needs the PCI address space, and the PCI
address space == guest physical address space. For virt-SVM, pt mode is
required to ensure the nested translation works.

> 
>  * Theoretically, the PASID context could be modelled as an array/map
>    of AddressSpace objects for each process ID.  However, creating all

I also thought about creating AddressSpace objects for each process ID,
but I don't think we need to do it. My reasoning is as follows:

In theory, it is correct to have an AddressSpace object for each process
virtual address space in QEMU, and this is what we are doing for the PCI
address space so far. However, that is only necessary when we want to
mirror the mappings to the host: each time a mapping changes within the
PCI address space, we need to mirror it to the host.

For virt-SVM, we don't need to mirror the changes within the guest process
address space. The nested translation capability in HW gives us that
convenience: in nested translation, HW can access a guest PASID table by
GPA (this is what Intel and AMD do; not sure about ARM, maybe or maybe
not). So for VT-d and AMD-IOMMU, even when the guest PASID table changes,
it is not necessary to mirror it to the host. Based on the statements
above, there is a candidate function to be included in PASIDContext: it
could be bind_guest_pasid_table(), used to link the guest PASID table into
the host translation structure when the guest finds that a device has SVM
capability.

>    those AddressSpace objects in advance might be too expensive.  I
>    can see a couple of options to avoid this:
> 
> 1) Have the PASID context class include a 'translate' method similar
> to the one in IOMMUMemoryRegionClass, but taking a process ID as well
> as an address.  This would avoid creating extra AddressSpace objects,
> but might require duplicating a bunch of the translation code that
> already exists for AddressSpace.

Translation code would actually walk the guest CR3 table to get a GVA->GPA
mapping. But that is not really required so far, thanks to the HW nested
translation capability.

> 
> 2) "Lazily" create AddressSpace objects.  The generic part of the
> PASID aware DMA helper functions would use a cache of AddressSpace's
> for particular process IDs, using the AddressSpace (and MemoryRegion
> within) to translate accesses for a particular process ID.  However,
> these AddressSpace and MemoryRegion objects would only be created when
> the device first accesses that address space.  In the common case,
> where a single device is just being used by a single process or a
> small number, this should keep the number of AddressSpace objects
> relatively small.  Obviously the cache would need to be invalidated,
> cleaning up the AddressSpace objects, when the PASID table is altered.

This "Lazily" mechanism is not required. But I agree with the last statement.
When PASID Table altered, the cache should be invalidated. Additionally, the
cache for the mappings(GVA) within the guest process address space should also
be invalidated when there is change. This also calls for a candidate function
in PASIDContext. It could be svm_invalidate_tlb(). And be used to invalidate
GVA cache and also PASID entry cache.

> 
>  * I realize that the expected case here is with KVM, where the guest
>    controls the first level translation, but the host controls the
>    second level translation.  However, we should also be able to model

Yeah, this is the case. To be accurate, both the 1st level and 2nd level
translation here happen in the HW IOMMU. The 1st level page table is
effectively "linked" from the guest, and the 2nd level page table is
created/controlled by QEMU (with MAP/UNMAP).

>    the case where the guest controls both levels for the sake of full
>    system emulation.  I think understanding this case will lead to a
>    better design even for the simpler case.

Not sure if I got your point accurately; let me try to reply based on my
understanding. If the guest enables virt-IOVA (DMA isolation in the guest
scope) and virt-SVM, I think that is a case where the guest controls both
the 1st and 2nd level translation.

For virt-IOVA, we need to mirror all the guest IOVA mappings to the host
(aka shadowing). The current MemoryRegion based notifiers (MAP/UNMAP)
should be enough; Peter has already upstreamed that.

> 
> Do you have a plan for what the virt-SVM aware DMA functions will look
> like?

I think there are two candidates so far. These should be enough for VT-d
and AMD-IOMMU; ARM-SMMU may need an extra function.
* bind_guest_pasid_table()
* svm_invalidate_tlb()
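
Just to illustrate, those two could be grouped roughly as below; the
argument lists are only guesses for discussion, not a settled interface
(IOMMUPasidContext here stands for the renamed IOMMUObject):

/* Guessed signatures, for discussion only. */
struct IOMMUPasidContextOps {
    /* link the guest PASID table (referenced by GPA) for nested translation */
    int (*bind_guest_pasid_table)(IOMMUPasidContext *ctx, hwaddr pasidt_base);
    /* forward guest 1st level / PASID-cache invalidations to the host IOMMU */
    int (*svm_invalidate_tlb)(IOMMUPasidContext *ctx, uint32_t pasid,
                              hwaddr addr, uint64_t size);
};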

> 
> -- 
> David Gibson			| I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
> 				| _way_ _around_!
> http://www.ozlabs.org/~dgibson

Thanks,
Yi L
Liu, Yi L Dec. 22, 2017, 6:47 a.m. UTC | #26
On Wed, Dec 20, 2017 at 10:01:10PM +1100, David Gibson wrote:
> On Wed, Dec 20, 2017 at 02:32:42PM +0800, Liu, Yi L wrote:
> > On Mon, Dec 18, 2017 at 10:22:18PM +1100, David Gibson wrote:
> > > On Mon, Dec 18, 2017 at 05:17:35PM +0800, Liu, Yi L wrote:
> > > > On Mon, Dec 18, 2017 at 05:14:42PM +1100, David Gibson wrote:
> > > > > On Thu, Nov 16, 2017 at 04:57:09PM +0800, Liu, Yi L wrote:
[snip]
> > > Partly.  Each PE has an address space which all devices in the PE see.
> > > Only some of that address space is mapped to system memory though,
> > > other parts are occupied by devices, others are unmapped.
> > > 
> > > Only the parts mapped by the IOMMU vary between PEs - the other parts
> > > of the address space will be identical for all PEs on the host
> > 
> > Thx, this comment addressed me well. This is different from what we have
> > on VT-d.
> 
> Really?  That's hard to believe.  I'm pretty sure the VT-d IOMMU must
> have a range < 2^64, and anything on the bus outside that range I
> expect would be common between all domains.  In particular I'd expect
> the BARs for other devices not to be remapped by the IOMMU (though
> they may be inaccessible on PCI-E due peer to peer transactions being
> blocked).  As well as things above the IOMMU's range, I'd expect the
> region for 32-bit BARs to be common between all domains.

Sorry, I misunderstood you. In each IOVA space there is a reserved range,
which is the BARs' MMIO range. That reservation is there to avoid
unexpected peer-to-peer transactions. So with regard to the IOVA space,
all vendors should be similar. So you are right~

Thanks,
Yi L
David Gibson Jan. 3, 2018, 12:28 a.m. UTC | #27
On Thu, Dec 21, 2017 at 04:40:19PM +0800, Liu, Yi L wrote:
> On Wed, Dec 20, 2017 at 10:18:16PM +1100, David Gibson wrote:
> > On Wed, Dec 20, 2017 at 02:47:30PM +0800, Liu, Yi L wrote:
> > > On Mon, Dec 18, 2017 at 10:35:31PM +1100, David Gibson wrote:
> > > > On Wed, Nov 15, 2017 at 03:16:32PM +0800, Peter Xu wrote:
> > > > > On Tue, Nov 14, 2017 at 10:52:54PM +0100, Auger Eric wrote:
> > > > > 
> > > > > 
> [...]
> > > > So, having read and thought a bunch more, I think I know where you
> > > > need to start hooking this in.  The thing is the current qemu PCI DMA
> > > > structure assumes that each device belongs to just a single PCI
> > > > address space - that's what pci_device_iommu_address_space() returns.
> > > > 
> > > > For virt-SVM that's just not true.  IIUC, a virt-SVM capable device
> > > > could simultaneously write to multiple process address spaces, since
> > > > the process IDs actually go over the bus.
> > > 
> > > Correct.
> > > 
> > > > So trying to hook notifiers at the AddressSpace OR MemoryRegion level
> > > > just doesn't make sense - if we've picked a single addresss space for
> > > > the device, we've already made a wrong step.
> > > 
> > > That's also why we want to have notifiers based on IOMMUObject(may be
> > > not a suitable name, let me use it as the patch named).
> > 
> > Right, I think "IOMMUObject" is a misleading name.
> > 
> > > > Instead what you need I think is something like:
> > > > pci_device_virtsvm_context().  virt-SVM capable devices would need to
> > > > call that *before* calling pci_device_iommu_address_space ().  Well
> > > > rather the virt-SVM capable DMA helpers would need to call that.
> > > > 
> > > > That would return a new VirtSVMContext (or something) object, which
> > > > would roughly correspond to a single PASID table.  That's where the
> > > > methods and notifiers for managing that would need to go.
> > > 
> > > Correct, pci_device_iommu_address_space() returns an AS and it is
> > > a PCI address space. And if pci_device_virtsvm_context() is also
> > > called in vfio_realize(), it may not return an AS since there may
> > > be no 1st level translation page table bound.
> > > 
> > > So as you said, return a new VirtSVMContext, this VirtSVMContext can
> > > hook some new notifiers. I think the IOMMUObject introduced in this patch
> > > can meet the requirement. But it may be re-named.
> > 
> > Ok.
> > 
> > > So here it addressed the concern you raised before which is hook IOMMUObject
> > > via a PCI address space. Regards to VirtSVMContext, it may be a replacement
> > > of IOMMUObject. As it is related to PASID, I'm considering to name it as
> > > IOMMUPasidContext or IOMMUPasidObject. So it would be an abstraction of all
> > > the IOMMU PASID related operations.
> > 
> > I'm ok with calling it a "PASID context".
> > 
> > Thinking about this some more, here are some extra observations:
> > 
> >  * I think each device needs both a PASID context and an ordinary
> >    address space.  The PASID context would be used for bus
> >    transactions which include a process id, the address space for
> >    those that don't.
> 
> Correct. Also virt-SVM still needs the PCI Address space. And the PCI
> Address space == Guest physical Address space.

Not necessarily.  That's true if you're only making the L1 translation
guest visible.  But I think we need to at least think about the case
where both L1 and L2 translations are guest visible, in which case the
PCI address space is not the same as the guest physical address space.

> For virt-SVM, requires
> pt mode to ensure the nested translation.

What is pt mode?

> >  * Theoretically, the PASID context could be modelled as an array/map
> >    of AddressSpace objects for each process ID.  However, creating all
> 
> I also thought about creating AddressSpace objects for each process ID.
> But I don't think we need to do it. My reason as below:
> 
> In theory, it is correct to have AddressSpace object for each process
> virtual address space in Qemu, and this is what we are doing for PCI
> address space so far. However, this is necessary when we want to mirror
> the mapping to host. Each time there is mapping changed within the PCI
> address space, we need to mirror it to host.
> 
> While for virt-SVM, we don't need to mirror the changes within the guest
> process address space. The nested translation capability in HW brings us
> a convenience. In nested translation, HW can access a guest PASID table
> with a GPA(this is what Intel and AMD does, not sure about ARM, maybe or
> maybe not). For VT-d and AMD-IOMMU, even any change in guest PASID table,
> it is not necessary to mirror it to host. Based on the statements above,
> there is a candidate function to be included in PASIDContext. It could be
> bind_guest_pasid_table(). And be used to set the guest PASID Table to host
> translation structure when guest finds a device is has SVM
> capability.

That's true for passthrough devices, but not for qemu emulated
devices.  None of those support SVM yet, but there's no reason they
couldn't in future.

Even though that will never be the main production case, I think we'll
get a better model if we think about these edge cases carefully.

> >    those AddressSpace objects in advance might be too expensive.  I
> >    can see a couple of options to avoid this:
> > 
> > 1) Have the PASID context class include a 'translate' method similar
> > to the one in IOMMUMemoryRegionClass, but taking a process ID as well
> > as an address.  This would avoid creating extra AddressSpace objects,
> > but might require duplicating a bunch of the translation code that
> > already exists for AddressSpace.
> 
> Translation code actually walks the guest CR3 table to get a
> GVA->GPA map.

Right, but that's clearly x86 specific.

> But it is not really required so far due to HW capability.
> 
> > 
> > 2) "Lazily" create AddressSpace objects.  The generic part of the
> > PASID aware DMA helper functions would use a cache of AddressSpace's
> > for particular process IDs, using the AddressSpace (and MemoryRegion
> > within) to translate accesses for a particular process ID.  However,
> > these AddressSpace and MemoryRegion objects would only be created when
> > the device first accesses that address space.  In the common case,
> > where a single device is just being used by a single process or a
> > small number, this should keep the number of AddressSpace objects
> > relatively small.  Obviously the cache would need to be invalidated,
> > cleaning up the AddressSpace objects, when the PASID table is altered.
> 
> This "Lazily" mechanism is not required. But I agree with the last statement.
> When PASID Table altered, the cache should be invalidated. Additionally, the
> cache for the mappings(GVA) within the guest process address space should also
> be invalidated when there is change. This also calls for a candidate function
> in PASIDContext. It could be svm_invalidate_tlb(). And be used to invalidate
> GVA cache and also PASID entry cache.
> 
> > 
> >  * I realize that the expected case here is with KVM, where the guest
> >    controls the first level translation, but the host controls the
> >    second level translation.  However, we should also be able to model
> 
> Yeah, this is the case. And to be accurate, both the 1st level and 2nd level
> translation here happens on HW IOMMU. The 1st level page table is actually
> like "linked" from guest. And the 2nd level page table is created/conrolled
> by Qemu(with MAP/UNMAP).
> 
> >    the case where the guest controls both levels for the sake of full
> >    system emulation.  I think understanding this case will lead to a
> >    better design even for the simpler case.
> 
> Not sure if I got your point accurately. Let me try to reply based on
> what I got. If guest enables virt-IOVA(DMA isolaton in guest scope) and
> virt-SVM. I think this is a case which guest controls both 1st and 2nd
> level translation.

That sounds right, from my limited knowledge of the x86 IOMMU.

> For virt-IOVA, we need to mirror all the guest IOVA mapping to host(aka.
> shadow). Current MemoryRegion based notifiers(MAP/UNMAP) should be enough.
> Peter has already upstreamed it.

Right, but we also need to model the translations for emulated
devices.

> > Do you have a plan for what the virt-SVM aware DMA functions will look
> > like?
> 
> I think there are two candidates so far. This should be enough for VT-d and
> AMD-IOMMU. For ARM-SMMU, it may need extra function.
> * bind_guest_pasid_table()
> * svm_invalidate_tlb()

That's not really what I meant.  What I was getting at is: if you were
writing an emulated device which supported SVM, what would the functions
to perform SVM-aware DMA look like?
Liu, Yi L Jan. 4, 2018, 9:40 a.m. UTC | #28
On Wed, Jan 03, 2018 at 11:28:17AM +1100, David Gibson wrote:
> On Thu, Dec 21, 2017 at 04:40:19PM +0800, Liu, Yi L wrote:
> > On Wed, Dec 20, 2017 at 10:18:16PM +1100, David Gibson wrote:
> > > On Wed, Dec 20, 2017 at 02:47:30PM +0800, Liu, Yi L wrote:
> > > > On Mon, Dec 18, 2017 at 10:35:31PM +1100, David Gibson wrote:
> > > > > On Wed, Nov 15, 2017 at 03:16:32PM +0800, Peter Xu wrote:
> > > > > > On Tue, Nov 14, 2017 at 10:52:54PM +0100, Auger Eric wrote:
> > > > > > 
> > > > > > 
[...]
> > > 
> > > I'm ok with calling it a "PASID context".
> > > 
> > > Thinking about this some more, here are some extra observations:
> > > 
> > >  * I think each device needs both a PASID context and an ordinary
> > >    address space.  The PASID context would be used for bus
> > >    transactions which include a process id, the address space for
> > >    those that don't.
> > 
> > Correct. Also virt-SVM still needs the PCI Address space. And the PCI
> > Address space == Guest physical Address space.
> 
> Not necessarily.  That's true if you're only making the L1 translation
> guest visible.  But I think we need to at least think about the case
> where both L1 and L2 translations are guest visible, in which case the
> PCI address space is not the same as the guest physical address space.
> 
> > For virt-SVM, requires
> > pt mode to ensure the nested translation.
> 
> What is pt mode?

"pt mode" here means the kernel parameter "iommu=pt", which makes the
iommu do 1:1 mapping for the iova. For virt-SVM on VT-d, with the guest
setting iommu=pt, the 2nd level page table in the host would be a GPA->HPA
mapping. Otherwise, the host 2nd level page table would be a GIOVA->HPA
mapping, which is not what nested translation expects.

> > >  * Theoretically, the PASID context could be modelled as an array/map
> > >    of AddressSpace objects for each process ID.  However, creating all
> > 
> > I also thought about creating AddressSpace objects for each process ID.
> > But I don't think we need to do it. My reason as below:
> > 
> > In theory, it is correct to have AddressSpace object for each process
> > virtual address space in Qemu, and this is what we are doing for PCI
> > address space so far. However, this is necessary when we want to mirror
> > the mapping to host. Each time there is mapping changed within the PCI
> > address space, we need to mirror it to host.
> > 
> > While for virt-SVM, we don't need to mirror the changes within the guest
> > process address space. The nested translation capability in HW brings us
> > a convenience. In nested translation, HW can access a guest PASID table
> > with a GPA(this is what Intel and AMD does, not sure about ARM, maybe or
> > maybe not). For VT-d and AMD-IOMMU, even any change in guest PASID table,
> > it is not necessary to mirror it to host. Based on the statements above,
> > there is a candidate function to be included in PASIDContext. It could be
> > bind_guest_pasid_table(). And be used to set the guest PASID Table to host
> > translation structure when guest finds a device is has SVM
> > capability.
> 
> That's true for passthrough devices, but not for qemu emulated
> devices.  None of those support SVM yet, but there's no reason they
> couldn't in future.
> 
> Even though that will never be the main production case, I think we'll
> get a better model if we think about these edge cases carefully.

Yeah, it's quite a good edge case. If an emulated device wants to do DMA
to a guest process virtual address space, QEMU needs to know about it and
do the necessary address translation for it, just as with iova. If we want
to support an emulated SVM capable device, I'm not sure the current
AddressSpace in QEMU fits well. Honestly, SVM capable devices are mostly
complicated accelerators, so I think we need to balance the effort. But I
agree it is helpful to understand more about the edge cases.

> > >    those AddressSpace objects in advance might be too expensive.  I
> > >    can see a couple of options to avoid this:
> > > 
> > > 1) Have the PASID context class include a 'translate' method similar
> > > to the one in IOMMUMemoryRegionClass, but taking a process ID as well
> > > as an address.  This would avoid creating extra AddressSpace objects,
> > > but might require duplicating a bunch of the translation code that
> > > already exists for AddressSpace.
> > 
> > Translation code actually walks the guest CR3 table to get a
> > GVA->GPA map.
> 
> Right, but that's clearly x86 specific.

Yes, the CR3 table is x86 specific. The page table walking itself, though,
can be vendor-agnostic, I think.

> 
> > But it is not really required so far due to HW capability.
> > 
> > > 
> > > 2) "Lazily" create AddressSpace objects.  The generic part of the
> > > PASID aware DMA helper functions would use a cache of AddressSpace's
> > > for particular process IDs, using the AddressSpace (and MemoryRegion
> > > within) to translate accesses for a particular process ID.  However,
> > > these AddressSpace and MemoryRegion objects would only be created when
> > > the device first accesses that address space.  In the common case,
> > > where a single device is just being used by a single process or a
> > > small number, this should keep the number of AddressSpace objects
> > > relatively small.  Obviously the cache would need to be invalidated,
> > > cleaning up the AddressSpace objects, when the PASID table is altered.
> > 
> > This "Lazily" mechanism is not required. But I agree with the last statement.
> > When PASID Table altered, the cache should be invalidated. Additionally, the
> > cache for the mappings(GVA) within the guest process address space should also
> > be invalidated when there is change. This also calls for a candidate function
> > in PASIDContext. It could be svm_invalidate_tlb(). And be used to invalidate
> > GVA cache and also PASID entry cache.
> > 
> > > 
> > >  * I realize that the expected case here is with KVM, where the guest
> > >    controls the first level translation, but the host controls the
> > >    second level translation.  However, we should also be able to model
> > 
> > Yeah, this is the case. And to be accurate, both the 1st level and 2nd level
> > translation here happens on HW IOMMU. The 1st level page table is actually
> > like "linked" from guest. And the 2nd level page table is created/conrolled
> > by Qemu(with MAP/UNMAP).
> > 
> > >    the case where the guest controls both levels for the sake of full
> > >    system emulation.  I think understanding this case will lead to a
> > >    better design even for the simpler case.
> > 
> > Not sure if I got your point accurately. Let me try to reply based on
> > what I got. If guest enables virt-IOVA(DMA isolaton in guest scope) and
> > virt-SVM. I think this is a case which guest controls both 1st and 2nd
> > level translation.
> 
> That sounds right, from my limited knowledge of the x86 IOMMU.
> 
> > For virt-IOVA, we need to mirror all the guest IOVA mapping to host(aka.
> > shadow). Current MemoryRegion based notifiers(MAP/UNMAP) should be enough.
> > Peter has already upstreamed it.
> 
> Right, but we also need to model the translations for emulated
> devices.

For emulated devices, I think virt-IOVA already works with the latest QEMU.

> 
> > > Do you have a plan for what the virt-SVM aware DMA functions will look
> > > like?
> > 
> > I think there are two candidates so far. This should be enough for VT-d and
> > AMD-IOMMU. For ARM-SMMU, it may need extra function.
> > * bind_guest_pasid_table()
> > * svm_invalidate_tlb()
> 
> That's not really what I meant.  What I was getting it if you were
> writing an emulated device which supported SVM, what would the
> functions to perform SVM-aware DMA look like?

Sorry for the misunderstanding; I'm not writing an emulated device with
SVM capability. For an SVM capable device, in brief, it needs to be able
to send Memory Requests with a PASID. If an emulated device tries to
access a process virtual address space, QEMU needs to know about it and do
the translation for it. That's what SVM-aware DMA should look like.
However, I don't have a model for it yet; it probably needs a pasid field
added to AddressSpace.

> -- 
> David Gibson			| I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
> 				| _way_ _around_!
> http://www.ozlabs.org/~dgibson

Regards,
Yi L
Liu, Yi L Jan. 12, 2018, 10:25 a.m. UTC | #29
On Wed, Dec 20, 2017 at 10:18:16PM +1100, David Gibson wrote:
> On Wed, Dec 20, 2017 at 02:47:30PM +0800, Liu, Yi L wrote:
> > On Mon, Dec 18, 2017 at 10:35:31PM +1100, David Gibson wrote:
> > > On Wed, Nov 15, 2017 at 03:16:32PM +0800, Peter Xu wrote:
> > > > On Tue, Nov 14, 2017 at 10:52:54PM +0100, Auger Eric wrote:
> > > > 

[...]

Sorry for the delayed reply; I spent some time reconsidering your comments.

> 
> I'm ok with calling it a "PASID context".
> 
> Thinking about this some more, here are some extra observations:
> 
>  * I think each device needs both a PASID context and an ordinary
>    address space.  The PASID context would be used for bus
>    transactions which include a process id, the address space for
>    those that don't.
> 
>  * Theoretically, the PASID context could be modelled as an array/map
>    of AddressSpace objects for each process ID.  However, creating all
>    those AddressSpace objects in advance might be too expensive.  I
>    can see a couple of options to avoid this:
> 
> 1) Have the PASID context class include a 'translate' method similar
> to the one in IOMMUMemoryRegionClass, but taking a process ID as well
> as an address.  This would avoid creating extra AddressSpace objects,
> but might require duplicating a bunch of the translation code that
> already exists for AddressSpace.
> 
> 2) "Lazily" create AddressSpace objects.  The generic part of the
> PASID aware DMA helper functions would use a cache of AddressSpace's
> for particular process IDs, using the AddressSpace (and MemoryRegion
> within) to translate accesses for a particular process ID.  However,
> these AddressSpace and MemoryRegion objects would only be created when
> the device first accesses that address space.  In the common case,
> where a single device is just being used by a single process or a
> small number, this should keep the number of AddressSpace objects
> relatively small.  Obviously the cache would need to be invalidated,
> cleaning up the AddressSpace objects, when the PASID table is altered.

Sorry, just to double check: does "AddressSpace objects" mean the existing
AddressSpace definition in QEMU?

> 
>  * I realize that the expected case here is with KVM, where the guest
>    controls the first level translation, but the host controls the
>    second level translation.  However, we should also be able to model
>    the case where the guest controls both levels for the sake of full
>    system emulation.  I think understanding this case will lead to a
>    better design even for the simpler case.
> 
> Do you have a plan for what the virt-SVM aware DMA functions will look
> like?

The behaviour is device specific.
A SVM capable physical device would store the pasid value in a register
located in the device; e.g. a GPU context can be set to use SVM, and after
the pasid is set, any DMA from that context targets a process virtual
address space.

So for a virt-SVM aware DMA device, the device model needs to figure out
the target address space. With the correct address space, it can then
consume the translate() callback provided by the iommu emulator, and then
emulate the DMA operation for the emulated device.
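
To make that flow a bit more concrete, here is a very rough sketch of how
an emulated SVM capable device model might issue a PASID-tagged write;
pasid_context_lookup_as() is a made-up helper standing in for whatever
lookup the PASID context ends up providing:

/* Illustrative sketch only -- pasid_context_lookup_as() is hypothetical. */
static bool emu_svm_dma_write(IOMMUPasidContext *ctx, uint32_t pasid,
                              hwaddr addr, const uint8_t *buf, hwaddr len)
{
    /* figure out the target (process) address space for this PASID */
    AddressSpace *as = pasid_context_lookup_as(ctx, pasid);

    if (!as) {
        return false;  /* no 1st level translation bound for this PASID */
    }
    /* then reuse the normal accessors to emulate the DMA itself */
    return address_space_rw(as, addr, MEMTXATTRS_UNSPECIFIED,
                            (uint8_t *)buf, len, true) == MEMTX_OK;
}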

I'll try to get a new version with your suggestions.

Thanks,
Yi L
David Gibson Jan. 16, 2018, 6:04 a.m. UTC | #30
On Fri, Jan 12, 2018 at 06:25:34PM +0800, Liu, Yi L wrote:
> On Wed, Dec 20, 2017 at 10:18:16PM +1100, David Gibson wrote:
> > On Wed, Dec 20, 2017 at 02:47:30PM +0800, Liu, Yi L wrote:
> > > On Mon, Dec 18, 2017 at 10:35:31PM +1100, David Gibson wrote:
> > > > On Wed, Nov 15, 2017 at 03:16:32PM +0800, Peter Xu wrote:
> > > > > On Tue, Nov 14, 2017 at 10:52:54PM +0100, Auger Eric wrote:
> > > > > 
> 
> [...]
> 
> Sorry for the delayed reply, spent some time on reconsidering your comments.
> 
> > 
> > I'm ok with calling it a "PASID context".
> > 
> > Thinking about this some more, here are some extra observations:
> > 
> >  * I think each device needs both a PASID context and an ordinary
> >    address space.  The PASID context would be used for bus
> >    transactions which include a process id, the address space for
> >    those that don't.
> > 
> >  * Theoretically, the PASID context could be modelled as an array/map
> >    of AddressSpace objects for each process ID.  However, creating all
> >    those AddressSpace objects in advance might be too expensive.  I
> >    can see a couple of options to avoid this:
> > 
> > 1) Have the PASID context class include a 'translate' method similar
> > to the one in IOMMUMemoryRegionClass, but taking a process ID as well
> > as an address.  This would avoid creating extra AddressSpace objects,
> > but might require duplicating a bunch of the translation code that
> > already exists for AddressSpace.
> > 
> > 2) "Lazily" create AddressSpace objects.  The generic part of the
> > PASID aware DMA helper functions would use a cache of AddressSpace's
> > for particular process IDs, using the AddressSpace (and MemoryRegion
> > within) to translate accesses for a particular process ID.  However,
> > these AddressSpace and MemoryRegion objects would only be created when
> > the device first accesses that address space.  In the common case,
> > where a single device is just being used by a single process or a
> > small number, this should keep the number of AddressSpace objects
> > relatively small.  Obviously the cache would need to be invalidated,
> > cleaning up the AddressSpace objects, when the PASID table is altered.
> 
> Sorry, just to double-check here: does "AddressSpace objects" mean the
> existing AddressSpace definition in QEMU?

Yes.
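
Just to illustrate option 2 above, a lazily-populated cache could look
roughly like the sketch below (PASIDContext and the helpers are invented
names, nothing like this exists in the tree yet):

#include "qemu/osdep.h"
#include "exec/memory.h"

/* Hypothetical per-device PASID context from the discussion above. */
typedef struct PASIDContext {
    GHashTable *as_cache;   /* pasid -> AddressSpace*, created with
                             * g_hash_table_new_full() elsewhere */
    /* builds the AddressSpace (and the IOMMU MR behind it) on demand */
    AddressSpace *(*create_as)(struct PASIDContext *pc, uint32_t pasid);
} PASIDContext;

static AddressSpace *pasid_context_get_as(PASIDContext *pc, uint32_t pasid)
{
    AddressSpace *as = g_hash_table_lookup(pc->as_cache,
                                           GUINT_TO_POINTER(pasid));

    if (!as) {
        /* first access with this PASID: create and cache lazily */
        as = pc->create_as(pc, pasid);
        g_hash_table_insert(pc->as_cache, GUINT_TO_POINTER(pasid), as);
    }
    return as;
}

/* Guest rewrote the PASID table: throw the cached AddressSpaces away
 * (the hash table's destroy function would tear them down). */
static void pasid_context_invalidate(PASIDContext *pc)
{
    g_hash_table_remove_all(pc->as_cache);
}

pasid_context_invalidate() would be hooked up to whatever notifier fires
when the guest changes the PASID table.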

> >  * I realize that the expected case here is with KVM, where the guest
> >    controls the first level translation, but the host controls the
> >    second level translation.  However, we should also be able to model
> >    the case where the guest controls both levels for the sake of full
> >    system emulation.  I think understanding this case will lead to a
> >    better design even for the simpler case.
> > 
> > Do you have a plan for what the virt-SVM aware DMA functions will look
> > like?
> 
> The behaviour is device specific.
> For an SVM-capable physical device, the PASID value is stored in a
> register located in the device. For example, a GPU context can be set to
> use SVM; once the PASID is set, any DMA from that context targets a
> process virtual address space.

That doesn't sound any more device specific than any DMA operation,
and we have helpers for that.

> So for a virt-SVM aware DMA device, the device model needs to figure out
> the target address space. With the correct address space, it can then
> consume the translate() callback provided by the IOMMU emulator and
> emulate the DMA operation for the emulated device.

Nearly all of that sounds like something that belongs in a helper
function.  Basically a variant of dma_memory_rw() (and related
functions) that takes a PASID as well as an address.
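
i.e. something along these lines (signature invented on the spot, and it
reuses the hypothetical PASIDContext / pasid_context_get_as() from the
lazy-cache sketch earlier in the thread):

#include "qemu/osdep.h"
#include "sysemu/dma.h"

/* Sketch of a PASID-aware cousin of dma_memory_rw(); PASIDContext and
 * pasid_context_get_as() are the made-up helpers from the earlier sketch. */
static inline int pasid_dma_memory_rw(PASIDContext *pc, uint32_t pasid,
                                      dma_addr_t addr, void *buf,
                                      dma_addr_t len, DMADirection dir)
{
    AddressSpace *as = pasid_context_get_as(pc, pasid);

    if (!as) {
        return -1;          /* PASID not bound to any address space */
    }
    return dma_memory_rw(as, addr, buf, len, dir);
}

The device model would then only need the PASID it read back from its own
register, and all the address space juggling stays in one place.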

> I'll try to get a new version with your suggestions.
diff mbox

Patch

diff --git a/hw/core/Makefile.objs b/hw/core/Makefile.objs
index f8d7a4a..d688412 100644
--- a/hw/core/Makefile.objs
+++ b/hw/core/Makefile.objs
@@ -5,6 +5,7 @@  common-obj-y += fw-path-provider.o
 # irq.o needed for qdev GPIO handling:
 common-obj-y += irq.o
 common-obj-y += hotplug.o
+common-obj-y += iommu.o
 common-obj-y += nmi.o
 
 common-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o
diff --git a/hw/core/iommu.c b/hw/core/iommu.c
new file mode 100644
index 0000000..7c4fcfe
--- /dev/null
+++ b/hw/core/iommu.c
@@ -0,0 +1,58 @@ 
+/*
+ * QEMU emulation of IOMMU logic
+ *
+ * Copyright (C) 2017 Red Hat Inc.
+ *
+ * Authors: Peter Xu <peterx@redhat.com>,
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/core/iommu.h"
+
+void iommu_notifier_register(IOMMUObject *iommu,
+                             IOMMUNotifier *n,
+                             IOMMUNotifyFn fn,
+                             IOMMUEvent event)
+{
+    n->event = event;
+    n->iommu_notify = fn;
+    QLIST_INSERT_HEAD(&iommu->iommu_notifiers, n, node);
+    return;
+}
+
+void iommu_notifier_unregister(IOMMUObject *iommu,
+                               IOMMUNotifier *notifier)
+{
+    IOMMUNotifier *cur, *next;
+
+    QLIST_FOREACH_SAFE(cur, &iommu->iommu_notifiers, node, next) {
+        if (cur == notifier) {
+            QLIST_REMOVE(cur, node);
+            break;
+        }
+    }
+}
+
+void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data)
+{
+    IOMMUNotifier *cur;
+
+    QLIST_FOREACH(cur, &iommu->iommu_notifiers, node) {
+        if ((cur->event == event_data->event) && cur->iommu_notify) {
+            cur->iommu_notify(cur, event_data);
+        }
+    }
+}
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 03595e3..8350973 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -26,6 +26,7 @@ 
 #include "qom/object.h"
 #include "qemu/rcu.h"
 #include "hw/qdev-core.h"
+#include "hw/core/iommu.h"
 
 #define RAM_ADDR_INVALID (~(ram_addr_t)0)
 
@@ -301,6 +302,19 @@  struct MemoryListener {
 };
 
 /**
+ * AddressSpaceOps: callbacks structure for address space specific operations
+ *
+ * @iommu_get: returns an IOMMU object that backs the address space.
+ *             Normally this should be NULL for generic address
+ *             spaces, and it's only used when there is one
+ *             translation unit behind this address space.
+ */
+struct AddressSpaceOps {
+    IOMMUObject *(*iommu_get)(AddressSpace *as);
+};
+typedef struct AddressSpaceOps AddressSpaceOps;
+
+/**
  * AddressSpace: describes a mapping of addresses to #MemoryRegion objects
  */
 struct AddressSpace {
@@ -316,6 +330,7 @@  struct AddressSpace {
     struct MemoryRegionIoeventfd *ioeventfds;
     QTAILQ_HEAD(memory_listeners_as, MemoryListener) listeners;
     QTAILQ_ENTRY(AddressSpace) address_spaces_link;
+    AddressSpaceOps as_ops;
 };
 
 FlatView *address_space_to_flatview(AddressSpace *as);
@@ -1988,6 +2003,13 @@  address_space_write_cached(MemoryRegionCache *cache, hwaddr addr,
     address_space_write(cache->as, cache->xlat + addr, MEMTXATTRS_UNSPECIFIED, buf, len);
 }
 
+/**
+ * address_space_iommu_get: Get the backend IOMMU for the address space
+ *
+ * @as: the address space to fetch IOMMU from
+ */
+IOMMUObject *address_space_iommu_get(AddressSpace *as);
+
 #endif
 
 #endif
diff --git a/include/hw/core/iommu.h b/include/hw/core/iommu.h
new file mode 100644
index 0000000..34387c0
--- /dev/null
+++ b/include/hw/core/iommu.h
@@ -0,0 +1,73 @@ 
+/*
+ * QEMU emulation of IOMMU logic
+ *
+ * Copyright (C) 2017 Red Hat Inc.
+ *
+ * Authors: Peter Xu <peterx@redhat.com>,
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HW_CORE_IOMMU_H
+#define HW_CORE_IOMMU_H
+
+#include "qemu/queue.h"
+
+enum IOMMUEvent {
+    IOMMU_EVENT_BIND_PASIDT,
+};
+typedef enum IOMMUEvent IOMMUEvent;
+
+struct IOMMUEventData {
+    IOMMUEvent event;
+    uint64_t length;
+    void *data;
+};
+typedef struct IOMMUEventData IOMMUEventData;
+
+typedef struct IOMMUNotifier IOMMUNotifier;
+
+typedef void (*IOMMUNotifyFn)(IOMMUNotifier *notifier,
+                              IOMMUEventData *event_data);
+
+struct IOMMUNotifier {
+    IOMMUNotifyFn iommu_notify;
+    /*
+     * What events we are listening to. Let's allow multiple event
+     * registrations from beginning.
+     */
+    IOMMUEvent event;
+    QLIST_ENTRY(IOMMUNotifier) node;
+};
+
+typedef struct IOMMUObject IOMMUObject;
+
+/*
+ * This stands for an IOMMU unit. Any translation device should have
+ * this struct inside its own structure to make sure it can leverage
+ * common IOMMU functionalities.
+ */
+struct IOMMUObject {
+    QLIST_HEAD(, IOMMUNotifier) iommu_notifiers;
+};
+
+void iommu_notifier_register(IOMMUObject *iommu,
+                             IOMMUNotifier *n,
+                             IOMMUNotifyFn fn,
+                             IOMMUEvent event);
+void iommu_notifier_unregister(IOMMUObject *iommu,
+                               IOMMUNotifier *notifier);
+void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data);
+
+#endif
diff --git a/memory.c b/memory.c
index 77fb3ef..307f665 100644
--- a/memory.c
+++ b/memory.c
@@ -235,8 +235,6 @@  struct FlatView {
     MemoryRegion *root;
 };
 
-typedef struct AddressSpaceOps AddressSpaceOps;
-
 #define FOR_EACH_FLAT_RANGE(var, view)          \
     for (var = (view)->ranges; var < (view)->ranges + (view)->nr; ++var)
 
@@ -2793,6 +2791,14 @@  static void do_address_space_destroy(AddressSpace *as)
     memory_region_unref(as->root);
 }
 
+IOMMUObject *address_space_iommu_get(AddressSpace *as)
+{
+    if (!as->as_ops.iommu_get) {
+        return NULL;
+    }
+    return as->as_ops.iommu_get(as);
+}
+
 void address_space_destroy(AddressSpace *as)
 {
     MemoryRegion *root = as->root;