Message ID: 1509710516-21084-3-git-send-email-yi.l.liu@linux.intel.com (mailing list archive)
State: New, archived
On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote: > From: Peter Xu <peterx@redhat.com> > > AddressSpaceOps is similar to MemoryRegionOps, it's just for address > spaces to store arch-specific hooks. > > The first hook I would like to introduce is iommu_get(). Return an > IOMMUObject behind the AddressSpace. > > For systems that have IOMMUs, we will create a special address > space per device which is different from system default address > space for it (please refer to pci_device_iommu_address_space()). > Normally when that happens, there will be one specific IOMMU (or > say, translation unit) stands right behind that new address space. > > This iommu_get() fetches that guy behind the address space. Here, > the guy is defined as IOMMUObject, which includes a notifier_list > so far, may extend in future. Along with IOMMUObject, a new iommu > notifier mechanism is introduced. It would be used for virt-svm. > Also IOMMUObject can further have a IOMMUObjectOps which is similar > to MemoryRegionOps. The difference is IOMMUObjectOps is not relied > on MemoryRegion. > > Signed-off-by: Peter Xu <peterx@redhat.com> > Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com> Hi, sorry I didn't reply to the earlier postings of this after our discussion in China. I've been sick several times and very busy. I still don't feel like there's an adequate explanation of exactly what an IOMMUObject represents. Obviously it can represent more than a single translation window - since that's represented by the IOMMUMR. But what exactly do all the MRs - or whatever else - that are represented by the IOMMUObject have in common, from a functional point of view. Even understanding the SVM stuff better than I did, I don't really see why an AddressSpace is an obvious unit to have an IOMMUObject associated with it. > --- > hw/core/Makefile.objs | 1 + > hw/core/iommu.c | 58 +++++++++++++++++++++++++++++++++++++++ > include/exec/memory.h | 22 +++++++++++++++ > include/hw/core/iommu.h | 73 +++++++++++++++++++++++++++++++++++++++++++++++++ > memory.c | 10 +++++-- > 5 files changed, 162 insertions(+), 2 deletions(-) > create mode 100644 hw/core/iommu.c > create mode 100644 include/hw/core/iommu.h > > diff --git a/hw/core/Makefile.objs b/hw/core/Makefile.objs > index f8d7a4a..d688412 100644 > --- a/hw/core/Makefile.objs > +++ b/hw/core/Makefile.objs > @@ -5,6 +5,7 @@ common-obj-y += fw-path-provider.o > # irq.o needed for qdev GPIO handling: > common-obj-y += irq.o > common-obj-y += hotplug.o > +common-obj-y += iommu.o > common-obj-y += nmi.o > > common-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o > diff --git a/hw/core/iommu.c b/hw/core/iommu.c > new file mode 100644 > index 0000000..7c4fcfe > --- /dev/null > +++ b/hw/core/iommu.c > @@ -0,0 +1,58 @@ > +/* > + * QEMU emulation of IOMMU logic > + * > + * Copyright (C) 2017 Red Hat Inc. > + * > + * Authors: Peter Xu <peterx@redhat.com>, > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; either version 2 of the License, or > + * (at your option) any later version. > + > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. 
> + > + * You should have received a copy of the GNU General Public License along > + * with this program; if not, see <http://www.gnu.org/licenses/>. > + */ > + > +#include "qemu/osdep.h" > +#include "hw/core/iommu.h" > + > +void iommu_notifier_register(IOMMUObject *iommu, > + IOMMUNotifier *n, > + IOMMUNotifyFn fn, > + IOMMUEvent event) > +{ > + n->event = event; > + n->iommu_notify = fn; > + QLIST_INSERT_HEAD(&iommu->iommu_notifiers, n, node); > + return; > +} > + > +void iommu_notifier_unregister(IOMMUObject *iommu, > + IOMMUNotifier *notifier) > +{ > + IOMMUNotifier *cur, *next; > + > + QLIST_FOREACH_SAFE(cur, &iommu->iommu_notifiers, node, next) { > + if (cur == notifier) { > + QLIST_REMOVE(cur, node); > + break; > + } > + } > +} > + > +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data) > +{ > + IOMMUNotifier *cur; > + > + QLIST_FOREACH(cur, &iommu->iommu_notifiers, node) { > + if ((cur->event == event_data->event) && cur->iommu_notify) { > + cur->iommu_notify(cur, event_data); > + } > + } > +} > diff --git a/include/exec/memory.h b/include/exec/memory.h > index 03595e3..8350973 100644 > --- a/include/exec/memory.h > +++ b/include/exec/memory.h > @@ -26,6 +26,7 @@ > #include "qom/object.h" > #include "qemu/rcu.h" > #include "hw/qdev-core.h" > +#include "hw/core/iommu.h" > > #define RAM_ADDR_INVALID (~(ram_addr_t)0) > > @@ -301,6 +302,19 @@ struct MemoryListener { > }; > > /** > + * AddressSpaceOps: callbacks structure for address space specific operations > + * > + * @iommu_get: returns an IOMMU object that backs the address space. > + * Normally this should be NULL for generic address > + * spaces, and it's only used when there is one > + * translation unit behind this address space. > + */ > +struct AddressSpaceOps { > + IOMMUObject *(*iommu_get)(AddressSpace *as); > +}; > +typedef struct AddressSpaceOps AddressSpaceOps; > + > +/** > * AddressSpace: describes a mapping of addresses to #MemoryRegion objects > */ > struct AddressSpace { > @@ -316,6 +330,7 @@ struct AddressSpace { > struct MemoryRegionIoeventfd *ioeventfds; > QTAILQ_HEAD(memory_listeners_as, MemoryListener) listeners; > QTAILQ_ENTRY(AddressSpace) address_spaces_link; > + AddressSpaceOps as_ops; > }; > > FlatView *address_space_to_flatview(AddressSpace *as); > @@ -1988,6 +2003,13 @@ address_space_write_cached(MemoryRegionCache *cache, hwaddr addr, > address_space_write(cache->as, cache->xlat + addr, MEMTXATTRS_UNSPECIFIED, buf, len); > } > > +/** > + * address_space_iommu_get: Get the backend IOMMU for the address space > + * > + * @as: the address space to fetch IOMMU from > + */ > +IOMMUObject *address_space_iommu_get(AddressSpace *as); > + > #endif > > #endif > diff --git a/include/hw/core/iommu.h b/include/hw/core/iommu.h > new file mode 100644 > index 0000000..34387c0 > --- /dev/null > +++ b/include/hw/core/iommu.h > @@ -0,0 +1,73 @@ > +/* > + * QEMU emulation of IOMMU logic > + * > + * Copyright (C) 2017 Red Hat Inc. > + * > + * Authors: Peter Xu <peterx@redhat.com>, > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; either version 2 of the License, or > + * (at your option) any later version. > + > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. 
> + > + * You should have received a copy of the GNU General Public License along > + * with this program; if not, see <http://www.gnu.org/licenses/>. > + */ > + > +#ifndef HW_CORE_IOMMU_H > +#define HW_CORE_IOMMU_H > + > +#include "qemu/queue.h" > + > +enum IOMMUEvent { > + IOMMU_EVENT_BIND_PASIDT, > +}; > +typedef enum IOMMUEvent IOMMUEvent; > + > +struct IOMMUEventData { > + IOMMUEvent event; > + uint64_t length; > + void *data; > +}; > +typedef struct IOMMUEventData IOMMUEventData; > + > +typedef struct IOMMUNotifier IOMMUNotifier; > + > +typedef void (*IOMMUNotifyFn)(IOMMUNotifier *notifier, > + IOMMUEventData *event_data); > + > +struct IOMMUNotifier { > + IOMMUNotifyFn iommu_notify; > + /* > + * What events we are listening to. Let's allow multiple event > + * registrations from beginning. > + */ > + IOMMUEvent event; > + QLIST_ENTRY(IOMMUNotifier) node; > +}; > + > +typedef struct IOMMUObject IOMMUObject; > + > +/* > + * This stands for an IOMMU unit. Any translation device should have > + * this struct inside its own structure to make sure it can leverage > + * common IOMMU functionalities. > + */ > +struct IOMMUObject { > + QLIST_HEAD(, IOMMUNotifier) iommu_notifiers; > +}; > + > +void iommu_notifier_register(IOMMUObject *iommu, > + IOMMUNotifier *n, > + IOMMUNotifyFn fn, > + IOMMUEvent event); > +void iommu_notifier_unregister(IOMMUObject *iommu, > + IOMMUNotifier *notifier); > +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data); > + > +#endif > diff --git a/memory.c b/memory.c > index 77fb3ef..307f665 100644 > --- a/memory.c > +++ b/memory.c > @@ -235,8 +235,6 @@ struct FlatView { > MemoryRegion *root; > }; > > -typedef struct AddressSpaceOps AddressSpaceOps; > - > #define FOR_EACH_FLAT_RANGE(var, view) \ > for (var = (view)->ranges; var < (view)->ranges + (view)->nr; ++var) > > @@ -2793,6 +2791,14 @@ static void do_address_space_destroy(AddressSpace *as) > memory_region_unref(as->root); > } > > +IOMMUObject *address_space_iommu_get(AddressSpace *as) > +{ > + if (!as->as_ops.iommu_get) { > + return NULL; > + } > + return as->as_ops.iommu_get(as); > +} > + > void address_space_destroy(AddressSpace *as) > { > MemoryRegion *root = as->root;
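[For readers following the API under discussion, a minimal consumer-side sketch under the patch's definitions may help. Only address_space_iommu_get(), iommu_notifier_register(), IOMMUNotifier, IOMMUEventData and IOMMU_EVENT_BIND_PASIDT come from the patch; the my_* names are hypothetical placeholders for a vIOMMU-aware component such as a vfio backend.]

    static void my_bind_pasidt_notify(IOMMUNotifier *n,
                                      IOMMUEventData *event_data)
    {
        /* event_data->data and event_data->length carry the event payload,
         * e.g. the guest PASID table pointer for IOMMU_EVENT_BIND_PASIDT;
         * the payload format is left to the vIOMMU emulator. */
    }

    static IOMMUNotifier my_notifier;

    static void my_setup(AddressSpace *as)
    {
        IOMMUObject *iommu = address_space_iommu_get(as);

        if (!iommu) {
            return;  /* generic address space, no translation unit behind it */
        }
        iommu_notifier_register(iommu, &my_notifier, my_bind_pasidt_notify,
                                IOMMU_EVENT_BIND_PASIDT);
    }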
On Mon, Nov 13, 2017 at 04:56:01PM +1100, David Gibson wrote: > On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote: > > From: Peter Xu <peterx@redhat.com> > > > > AddressSpaceOps is similar to MemoryRegionOps, it's just for address > > spaces to store arch-specific hooks. > > > > The first hook I would like to introduce is iommu_get(). Return an > > IOMMUObject behind the AddressSpace. > > > > For systems that have IOMMUs, we will create a special address > > space per device which is different from system default address > > space for it (please refer to pci_device_iommu_address_space()). > > Normally when that happens, there will be one specific IOMMU (or > > say, translation unit) stands right behind that new address space. > > > > This iommu_get() fetches that guy behind the address space. Here, > > the guy is defined as IOMMUObject, which includes a notifier_list > > so far, may extend in future. Along with IOMMUObject, a new iommu > > notifier mechanism is introduced. It would be used for virt-svm. > > Also IOMMUObject can further have a IOMMUObjectOps which is similar > > to MemoryRegionOps. The difference is IOMMUObjectOps is not relied > > on MemoryRegion. > > > > Signed-off-by: Peter Xu <peterx@redhat.com> > > Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com> > > Hi, sorry I didn't reply to the earlier postings of this after our > discussion in China. I've been sick several times and very busy. > > I still don't feel like there's an adequate explanation of exactly > what an IOMMUObject represents. Obviously it can represent more than > a single translation window - since that's represented by the > IOMMUMR. But what exactly do all the MRs - or whatever else - that > are represented by the IOMMUObject have in common, from a functional > point of view. > > Even understanding the SVM stuff better than I did, I don't really see > why an AddressSpace is an obvious unit to have an IOMMUObject > associated with it. Here's what I thought about it: IOMMUObject was planned to be the abstraction of the hardware translation unit, which is a higher level of the translated address spaces. Say, for each PCI device, it can have its own translated address space. However for multiple PCI devices, they can be sharing the same translation unit that handles the translation requests from different devices. That's the case for Intel platforms. We introduced this IOMMUObject because sometimes we want to do something with that translation unit rather than a specific device, in which we need a general IOMMU device handle. IIRC one issue left over during last time's discussion was that there could be more complicated IOMMU models. E.g., one device's DMA request can be translated nestedly by two or multiple IOMMUs, and current proposal cannot really handle that complicated hierachy. I'm just thinking whether we can start from a simple model (say, we don't allow nested IOMMUs, and actually we don't even allow multiple IOMMUs so far), then we can evolve from that point in the future. Also, I thought there were something you mentioned that this approach is not correct for Power systems, but I can't really remember the details... Anyways, I think this is not the only approach to solve the problem, and I believe any new better idea would be greatly welcomed as well. 
:) Thanks, > > > --- > > hw/core/Makefile.objs | 1 + > > hw/core/iommu.c | 58 +++++++++++++++++++++++++++++++++++++++ > > include/exec/memory.h | 22 +++++++++++++++ > > include/hw/core/iommu.h | 73 +++++++++++++++++++++++++++++++++++++++++++++++++ > > memory.c | 10 +++++-- > > 5 files changed, 162 insertions(+), 2 deletions(-) > > create mode 100644 hw/core/iommu.c > > create mode 100644 include/hw/core/iommu.h > > > > diff --git a/hw/core/Makefile.objs b/hw/core/Makefile.objs > > index f8d7a4a..d688412 100644 > > --- a/hw/core/Makefile.objs > > +++ b/hw/core/Makefile.objs > > @@ -5,6 +5,7 @@ common-obj-y += fw-path-provider.o > > # irq.o needed for qdev GPIO handling: > > common-obj-y += irq.o > > common-obj-y += hotplug.o > > +common-obj-y += iommu.o > > common-obj-y += nmi.o > > > > common-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o > > diff --git a/hw/core/iommu.c b/hw/core/iommu.c > > new file mode 100644 > > index 0000000..7c4fcfe > > --- /dev/null > > +++ b/hw/core/iommu.c > > @@ -0,0 +1,58 @@ > > +/* > > + * QEMU emulation of IOMMU logic > > + * > > + * Copyright (C) 2017 Red Hat Inc. > > + * > > + * Authors: Peter Xu <peterx@redhat.com>, > > + * > > + * This program is free software; you can redistribute it and/or modify > > + * it under the terms of the GNU General Public License as published by > > + * the Free Software Foundation; either version 2 of the License, or > > + * (at your option) any later version. > > + > > + * This program is distributed in the hope that it will be useful, > > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > > + * GNU General Public License for more details. > > + > > + * You should have received a copy of the GNU General Public License along > > + * with this program; if not, see <http://www.gnu.org/licenses/>. > > + */ > > + > > +#include "qemu/osdep.h" > > +#include "hw/core/iommu.h" > > + > > +void iommu_notifier_register(IOMMUObject *iommu, > > + IOMMUNotifier *n, > > + IOMMUNotifyFn fn, > > + IOMMUEvent event) > > +{ > > + n->event = event; > > + n->iommu_notify = fn; > > + QLIST_INSERT_HEAD(&iommu->iommu_notifiers, n, node); > > + return; > > +} > > + > > +void iommu_notifier_unregister(IOMMUObject *iommu, > > + IOMMUNotifier *notifier) > > +{ > > + IOMMUNotifier *cur, *next; > > + > > + QLIST_FOREACH_SAFE(cur, &iommu->iommu_notifiers, node, next) { > > + if (cur == notifier) { > > + QLIST_REMOVE(cur, node); > > + break; > > + } > > + } > > +} > > + > > +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data) > > +{ > > + IOMMUNotifier *cur; > > + > > + QLIST_FOREACH(cur, &iommu->iommu_notifiers, node) { > > + if ((cur->event == event_data->event) && cur->iommu_notify) { > > + cur->iommu_notify(cur, event_data); > > + } > > + } > > +} > > diff --git a/include/exec/memory.h b/include/exec/memory.h > > index 03595e3..8350973 100644 > > --- a/include/exec/memory.h > > +++ b/include/exec/memory.h > > @@ -26,6 +26,7 @@ > > #include "qom/object.h" > > #include "qemu/rcu.h" > > #include "hw/qdev-core.h" > > +#include "hw/core/iommu.h" > > > > #define RAM_ADDR_INVALID (~(ram_addr_t)0) > > > > @@ -301,6 +302,19 @@ struct MemoryListener { > > }; > > > > /** > > + * AddressSpaceOps: callbacks structure for address space specific operations > > + * > > + * @iommu_get: returns an IOMMU object that backs the address space. 
> > + * Normally this should be NULL for generic address > > + * spaces, and it's only used when there is one > > + * translation unit behind this address space. > > + */ > > +struct AddressSpaceOps { > > + IOMMUObject *(*iommu_get)(AddressSpace *as); > > +}; > > +typedef struct AddressSpaceOps AddressSpaceOps; > > + > > +/** > > * AddressSpace: describes a mapping of addresses to #MemoryRegion objects > > */ > > struct AddressSpace { > > @@ -316,6 +330,7 @@ struct AddressSpace { > > struct MemoryRegionIoeventfd *ioeventfds; > > QTAILQ_HEAD(memory_listeners_as, MemoryListener) listeners; > > QTAILQ_ENTRY(AddressSpace) address_spaces_link; > > + AddressSpaceOps as_ops; > > }; > > > > FlatView *address_space_to_flatview(AddressSpace *as); > > @@ -1988,6 +2003,13 @@ address_space_write_cached(MemoryRegionCache *cache, hwaddr addr, > > address_space_write(cache->as, cache->xlat + addr, MEMTXATTRS_UNSPECIFIED, buf, len); > > } > > > > +/** > > + * address_space_iommu_get: Get the backend IOMMU for the address space > > + * > > + * @as: the address space to fetch IOMMU from > > + */ > > +IOMMUObject *address_space_iommu_get(AddressSpace *as); > > + > > #endif > > > > #endif > > diff --git a/include/hw/core/iommu.h b/include/hw/core/iommu.h > > new file mode 100644 > > index 0000000..34387c0 > > --- /dev/null > > +++ b/include/hw/core/iommu.h > > @@ -0,0 +1,73 @@ > > +/* > > + * QEMU emulation of IOMMU logic > > + * > > + * Copyright (C) 2017 Red Hat Inc. > > + * > > + * Authors: Peter Xu <peterx@redhat.com>, > > + * > > + * This program is free software; you can redistribute it and/or modify > > + * it under the terms of the GNU General Public License as published by > > + * the Free Software Foundation; either version 2 of the License, or > > + * (at your option) any later version. > > + > > + * This program is distributed in the hope that it will be useful, > > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > > + * GNU General Public License for more details. > > + > > + * You should have received a copy of the GNU General Public License along > > + * with this program; if not, see <http://www.gnu.org/licenses/>. > > + */ > > + > > +#ifndef HW_CORE_IOMMU_H > > +#define HW_CORE_IOMMU_H > > + > > +#include "qemu/queue.h" > > + > > +enum IOMMUEvent { > > + IOMMU_EVENT_BIND_PASIDT, > > +}; > > +typedef enum IOMMUEvent IOMMUEvent; > > + > > +struct IOMMUEventData { > > + IOMMUEvent event; > > + uint64_t length; > > + void *data; > > +}; > > +typedef struct IOMMUEventData IOMMUEventData; > > + > > +typedef struct IOMMUNotifier IOMMUNotifier; > > + > > +typedef void (*IOMMUNotifyFn)(IOMMUNotifier *notifier, > > + IOMMUEventData *event_data); > > + > > +struct IOMMUNotifier { > > + IOMMUNotifyFn iommu_notify; > > + /* > > + * What events we are listening to. Let's allow multiple event > > + * registrations from beginning. > > + */ > > + IOMMUEvent event; > > + QLIST_ENTRY(IOMMUNotifier) node; > > +}; > > + > > +typedef struct IOMMUObject IOMMUObject; > > + > > +/* > > + * This stands for an IOMMU unit. Any translation device should have > > + * this struct inside its own structure to make sure it can leverage > > + * common IOMMU functionalities. 
> > + */ > > +struct IOMMUObject { > > + QLIST_HEAD(, IOMMUNotifier) iommu_notifiers; > > +}; > > + > > +void iommu_notifier_register(IOMMUObject *iommu, > > + IOMMUNotifier *n, > > + IOMMUNotifyFn fn, > > + IOMMUEvent event); > > +void iommu_notifier_unregister(IOMMUObject *iommu, > > + IOMMUNotifier *notifier); > > +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data); > > + > > +#endif > > diff --git a/memory.c b/memory.c > > index 77fb3ef..307f665 100644 > > --- a/memory.c > > +++ b/memory.c > > @@ -235,8 +235,6 @@ struct FlatView { > > MemoryRegion *root; > > }; > > > > -typedef struct AddressSpaceOps AddressSpaceOps; > > - > > #define FOR_EACH_FLAT_RANGE(var, view) \ > > for (var = (view)->ranges; var < (view)->ranges + (view)->nr; ++var) > > > > @@ -2793,6 +2791,14 @@ static void do_address_space_destroy(AddressSpace *as) > > memory_region_unref(as->root); > > } > > > > +IOMMUObject *address_space_iommu_get(AddressSpace *as) > > +{ > > + if (!as->as_ops.iommu_get) { > > + return NULL; > > + } > > + return as->as_ops.iommu_get(as); > > +} > > + > > void address_space_destroy(AddressSpace *as) > > { > > MemoryRegion *root = as->root; > > -- > David Gibson | I'll have my music baroque, and my code > david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ > | _way_ _around_! > http://www.ozlabs.org/~dgibson
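[To make the "translation unit" abstraction described above concrete, the producer side could look roughly like the following sketch. IOMMUObject, AddressSpaceOps, IOMMUEventData and iommu_notify() are from the patch; everything prefixed my_ is a hypothetical vIOMMU emulator, not an existing QEMU model.]

    typedef struct MyIOMMUState {
        IOMMUObject iommu_obj;   /* one object for the whole translation unit */
        /* ... other per-unit state ... */
    } MyIOMMUState;

    static MyIOMMUState my_iommu_state;

    static IOMMUObject *my_iommu_get(AddressSpace *as)
    {
        /* every per-device address space behind this unit returns the same
         * IOMMUObject, so its notifier list is shared by all devices it
         * translates */
        return &my_iommu_state.iommu_obj;
    }

    /* at address space creation time (sketch):
     *     QLIST_INIT(&my_iommu_state.iommu_obj.iommu_notifiers);
     *     as->as_ops.iommu_get = my_iommu_get;
     */

    static void my_iommu_guest_binds_pasidt(MyIOMMUState *s,
                                            void *pasidt, uint64_t len)
    {
        IOMMUEventData event_data = {
            .event  = IOMMU_EVENT_BIND_PASIDT,
            .data   = pasidt,
            .length = len,
        };

        iommu_notify(&s->iommu_obj, &event_data);
    }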
On Mon, Nov 13, 2017 at 04:56:01PM +1100, David Gibson wrote: > On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote: > > From: Peter Xu <peterx@redhat.com> > > > > AddressSpaceOps is similar to MemoryRegionOps, it's just for address > > spaces to store arch-specific hooks. > > > > The first hook I would like to introduce is iommu_get(). Return an > > IOMMUObject behind the AddressSpace. > > > > For systems that have IOMMUs, we will create a special address > > space per device which is different from system default address > > space for it (please refer to pci_device_iommu_address_space()). > > Normally when that happens, there will be one specific IOMMU (or > > say, translation unit) stands right behind that new address space. > > > > This iommu_get() fetches that guy behind the address space. Here, > > the guy is defined as IOMMUObject, which includes a notifier_list > > so far, may extend in future. Along with IOMMUObject, a new iommu > > notifier mechanism is introduced. It would be used for virt-svm. > > Also IOMMUObject can further have a IOMMUObjectOps which is similar > > to MemoryRegionOps. The difference is IOMMUObjectOps is not relied > > on MemoryRegion. > > > > Signed-off-by: Peter Xu <peterx@redhat.com> > > Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com> > > Hi, sorry I didn't reply to the earlier postings of this after our > discussion in China. I've been sick several times and very busy. Hi David, Fully understood. I'll try my best to address your question. Also, feel free to input further questions, anyhow, the more we discuss the better work we done. > I still don't feel like there's an adequate explanation of exactly > what an IOMMUObject represents. Obviously it can represent more than IOMMUObject is aimed to represent the iommu itself. e.g. the iommu specific operations. One of the key purpose of IOMMUObject is to introduce a notifier framework to let iommu emulator to be able to do iommu operations other than MAP/UNMAP. As IOMMU grows more and more feature, MAP/UNMAP is not the only operation iommu emulator needs to deal. e.g. shared virtual memory. So far, as I know AMD/ARM also has it. may correct me on it. As my cover letter mentioned, MR based notifier framework doesn’t work for the newly added IOMMU operations. Like bind guest pasid table pointer to host and propagate guest's iotlb flush to host. > a single translation window - since that's represented by the > IOMMUMR. But what exactly do all the MRs - or whatever else - that > are represented by the IOMMUObject have in common, from a functional > point of view. Let me take virt-SVM as an example. As far as I know, for virt-SVM, the implementation of different vendors are similar. The key design is to have a nested translation(aka. two stage translation). It is to have guest maintain gVA->gPA mapping and hypervisor builds gPA->hPA mapping. Similar to EPT based virt-MMU solution. In Qemu, gPA->hPA mapping is done through MAP/UNMAP notifier, it can keep going. But for gVA->gPA mapping, only guest knows it, so hypervisor needs to trap specific guest iommu operation and pass the gVA->gPA mapping knowledge to host through a notifier(newly added one). In VT-d, it is called bind guest pasid table to host. Also, for the gVA iotlb flushing, only guest knows it. So hypervisor needs to propagate it to host. Here, MAP/UNMAP is not suitable since this gVA iotlb flush here doesn’t require to modify host iommu translation table. At the time gVA iotlb flush is issued, the gVA->gPA mapping has already modified. 
Host iommu only needs to reference it when performing address translation. But before host iommu perform translation, it needs to flush the old gVA cache. In VT-d, it is called 1st level cache flushing. Both of the two notifiers(operations) has no direct relationship with MR, instead they highly depends on virt-iommu itself. If virt-iommu exists, then the two notfiers are needed, if not, then it's not. > Even understanding the SVM stuff better than I did, I don't really see > why an AddressSpace is an obvious unit to have an IOMMUObject > associated with it. This will benefit the notifier registration. As my comments above, the IOMMUObject is to represent iommu. Associate an AddressSpace with an IOMMUObject makes it easy to check if it is necessary to register the notifiers. If no IOMMUObject, means no virt-iommu exposed to guest, then no need to register notifiers. For this, I also considered to use the MemoryRegion.iommu_ops. e.g. for VT-d, it can be a loop to check all the subregions and register notfiers if it is an iommu MemoryRegion. Peter mentioned it may not work for SPAR. So he proposed associating an AddressSpace with an IOMMUObject. I think it wroks and easier, so I didn’t object it. https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg04931.html + /* Check if vIOMMU exists */ + QTAILQ_FOREACH(subregion, &as->root->subregions, subregions_link) { + if (memory_region_is_iommu(subregion)) { + IOMMUNotifier n1; + + /* + FIXME: current iommu notifier is actually designed for + IOMMUTLB MAP/UNMAP. However, vIOMMU emulator may need + notifiers other than MAP/UNMAP, so it'll be better to + split the non-IOMMUTLB notifier from the current IOMMUTLB + notifier framewrok. + */ + iommu_notifier_init(&n1, vfio_iommu_bind_pasid_tbl_notify, + IOMMU_NOTIFIER_SVM_PASIDT_BIND, + 0, + 0); + vfio_register_notifier(group->container, + subregion, + 0, + &n1); + } + } + Thanks, Yi L > > --- > > hw/core/Makefile.objs | 1 + > > hw/core/iommu.c | 58 +++++++++++++++++++++++++++++++++++++++ > > include/exec/memory.h | 22 +++++++++++++++ > > include/hw/core/iommu.h | 73 +++++++++++++++++++++++++++++++++++++++++++++++++ > > memory.c | 10 +++++-- > > 5 files changed, 162 insertions(+), 2 deletions(-) > > create mode 100644 hw/core/iommu.c > > create mode 100644 include/hw/core/iommu.h > > > > diff --git a/hw/core/Makefile.objs b/hw/core/Makefile.objs > > index f8d7a4a..d688412 100644 > > --- a/hw/core/Makefile.objs > > +++ b/hw/core/Makefile.objs > > @@ -5,6 +5,7 @@ common-obj-y += fw-path-provider.o > > # irq.o needed for qdev GPIO handling: > > common-obj-y += irq.o > > common-obj-y += hotplug.o > > +common-obj-y += iommu.o > > common-obj-y += nmi.o > > > > common-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o > > diff --git a/hw/core/iommu.c b/hw/core/iommu.c > > new file mode 100644 > > index 0000000..7c4fcfe > > --- /dev/null > > +++ b/hw/core/iommu.c > > @@ -0,0 +1,58 @@ > > +/* > > + * QEMU emulation of IOMMU logic > > + * > > + * Copyright (C) 2017 Red Hat Inc. > > + * > > + * Authors: Peter Xu <peterx@redhat.com>, > > + * > > + * This program is free software; you can redistribute it and/or modify > > + * it under the terms of the GNU General Public License as published by > > + * the Free Software Foundation; either version 2 of the License, or > > + * (at your option) any later version. > > + > > + * This program is distributed in the hope that it will be useful, > > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the > > + * GNU General Public License for more details. > > + > > + * You should have received a copy of the GNU General Public License along > > + * with this program; if not, see <http://www.gnu.org/licenses/>. > > + */ > > + > > +#include "qemu/osdep.h" > > +#include "hw/core/iommu.h" > > + > > +void iommu_notifier_register(IOMMUObject *iommu, > > + IOMMUNotifier *n, > > + IOMMUNotifyFn fn, > > + IOMMUEvent event) > > +{ > > + n->event = event; > > + n->iommu_notify = fn; > > + QLIST_INSERT_HEAD(&iommu->iommu_notifiers, n, node); > > + return; > > +} > > + > > +void iommu_notifier_unregister(IOMMUObject *iommu, > > + IOMMUNotifier *notifier) > > +{ > > + IOMMUNotifier *cur, *next; > > + > > + QLIST_FOREACH_SAFE(cur, &iommu->iommu_notifiers, node, next) { > > + if (cur == notifier) { > > + QLIST_REMOVE(cur, node); > > + break; > > + } > > + } > > +} > > + > > +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data) > > +{ > > + IOMMUNotifier *cur; > > + > > + QLIST_FOREACH(cur, &iommu->iommu_notifiers, node) { > > + if ((cur->event == event_data->event) && cur->iommu_notify) { > > + cur->iommu_notify(cur, event_data); > > + } > > + } > > +} > > diff --git a/include/exec/memory.h b/include/exec/memory.h > > index 03595e3..8350973 100644 > > --- a/include/exec/memory.h > > +++ b/include/exec/memory.h > > @@ -26,6 +26,7 @@ > > #include "qom/object.h" > > #include "qemu/rcu.h" > > #include "hw/qdev-core.h" > > +#include "hw/core/iommu.h" > > > > #define RAM_ADDR_INVALID (~(ram_addr_t)0) > > > > @@ -301,6 +302,19 @@ struct MemoryListener { > > }; > > > > /** > > + * AddressSpaceOps: callbacks structure for address space specific operations > > + * > > + * @iommu_get: returns an IOMMU object that backs the address space. > > + * Normally this should be NULL for generic address > > + * spaces, and it's only used when there is one > > + * translation unit behind this address space. > > + */ > > +struct AddressSpaceOps { > > + IOMMUObject *(*iommu_get)(AddressSpace *as); > > +}; > > +typedef struct AddressSpaceOps AddressSpaceOps; > > + > > +/** > > * AddressSpace: describes a mapping of addresses to #MemoryRegion objects > > */ > > struct AddressSpace { > > @@ -316,6 +330,7 @@ struct AddressSpace { > > struct MemoryRegionIoeventfd *ioeventfds; > > QTAILQ_HEAD(memory_listeners_as, MemoryListener) listeners; > > QTAILQ_ENTRY(AddressSpace) address_spaces_link; > > + AddressSpaceOps as_ops; > > }; > > > > FlatView *address_space_to_flatview(AddressSpace *as); > > @@ -1988,6 +2003,13 @@ address_space_write_cached(MemoryRegionCache *cache, hwaddr addr, > > address_space_write(cache->as, cache->xlat + addr, MEMTXATTRS_UNSPECIFIED, buf, len); > > } > > > > +/** > > + * address_space_iommu_get: Get the backend IOMMU for the address space > > + * > > + * @as: the address space to fetch IOMMU from > > + */ > > +IOMMUObject *address_space_iommu_get(AddressSpace *as); > > + > > #endif > > > > #endif > > diff --git a/include/hw/core/iommu.h b/include/hw/core/iommu.h > > new file mode 100644 > > index 0000000..34387c0 > > --- /dev/null > > +++ b/include/hw/core/iommu.h > > @@ -0,0 +1,73 @@ > > +/* > > + * QEMU emulation of IOMMU logic > > + * > > + * Copyright (C) 2017 Red Hat Inc. 
> > + * > > + * Authors: Peter Xu <peterx@redhat.com>, > > + * > > + * This program is free software; you can redistribute it and/or modify > > + * it under the terms of the GNU General Public License as published by > > + * the Free Software Foundation; either version 2 of the License, or > > + * (at your option) any later version. > > + > > + * This program is distributed in the hope that it will be useful, > > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > > + * GNU General Public License for more details. > > + > > + * You should have received a copy of the GNU General Public License along > > + * with this program; if not, see <http://www.gnu.org/licenses/>. > > + */ > > + > > +#ifndef HW_CORE_IOMMU_H > > +#define HW_CORE_IOMMU_H > > + > > +#include "qemu/queue.h" > > + > > +enum IOMMUEvent { > > + IOMMU_EVENT_BIND_PASIDT, > > +}; > > +typedef enum IOMMUEvent IOMMUEvent; > > + > > +struct IOMMUEventData { > > + IOMMUEvent event; > > + uint64_t length; > > + void *data; > > +}; > > +typedef struct IOMMUEventData IOMMUEventData; > > + > > +typedef struct IOMMUNotifier IOMMUNotifier; > > + > > +typedef void (*IOMMUNotifyFn)(IOMMUNotifier *notifier, > > + IOMMUEventData *event_data); > > + > > +struct IOMMUNotifier { > > + IOMMUNotifyFn iommu_notify; > > + /* > > + * What events we are listening to. Let's allow multiple event > > + * registrations from beginning. > > + */ > > + IOMMUEvent event; > > + QLIST_ENTRY(IOMMUNotifier) node; > > +}; > > + > > +typedef struct IOMMUObject IOMMUObject; > > + > > +/* > > + * This stands for an IOMMU unit. Any translation device should have > > + * this struct inside its own structure to make sure it can leverage > > + * common IOMMU functionalities. > > + */ > > +struct IOMMUObject { > > + QLIST_HEAD(, IOMMUNotifier) iommu_notifiers; > > +}; > > + > > +void iommu_notifier_register(IOMMUObject *iommu, > > + IOMMUNotifier *n, > > + IOMMUNotifyFn fn, > > + IOMMUEvent event); > > +void iommu_notifier_unregister(IOMMUObject *iommu, > > + IOMMUNotifier *notifier); > > +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data); > > + > > +#endif > > diff --git a/memory.c b/memory.c > > index 77fb3ef..307f665 100644 > > --- a/memory.c > > +++ b/memory.c > > @@ -235,8 +235,6 @@ struct FlatView { > > MemoryRegion *root; > > }; > > > > -typedef struct AddressSpaceOps AddressSpaceOps; > > - > > #define FOR_EACH_FLAT_RANGE(var, view) \ > > for (var = (view)->ranges; var < (view)->ranges + (view)->nr; ++var) > > > > @@ -2793,6 +2791,14 @@ static void do_address_space_destroy(AddressSpace *as) > > memory_region_unref(as->root); > > } > > > > +IOMMUObject *address_space_iommu_get(AddressSpace *as) > > +{ > > + if (!as->as_ops.iommu_get) { > > + return NULL; > > + } > > + return as->as_ops.iommu_get(as); > > +} > > + > > void address_space_destroy(AddressSpace *as) > > { > > MemoryRegion *root = as->root; > > -- > David Gibson | I'll have my music baroque, and my code > david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ > | _way_ _around_! > http://www.ozlabs.org/~dgibson
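[With the AddressSpace-to-IOMMUObject association in place, the FIXME'd subregion walk quoted in the message above could arguably collapse to something like the sketch below. vfio_iommu_bind_pasid_tbl_notify is the handler named in the quoted snippet; the notifier object n1 is assumed to be allocated somewhere that outlives the registration rather than on the stack.]

    IOMMUObject *iommu = address_space_iommu_get(as);

    if (iommu) {
        iommu_notifier_register(iommu, n1, vfio_iommu_bind_pasid_tbl_notify,
                                IOMMU_EVENT_BIND_PASIDT);
    }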
On Mon, Nov 13, 2017 at 04:28:45PM +0800, Peter Xu wrote: > On Mon, Nov 13, 2017 at 04:56:01PM +1100, David Gibson wrote: > > On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote: > > > From: Peter Xu <peterx@redhat.com> > > > > > > AddressSpaceOps is similar to MemoryRegionOps, it's just for address > > > spaces to store arch-specific hooks. > > > > > > The first hook I would like to introduce is iommu_get(). Return an > > > IOMMUObject behind the AddressSpace. > > > > > > For systems that have IOMMUs, we will create a special address > > > space per device which is different from system default address > > > space for it (please refer to pci_device_iommu_address_space()). > > > Normally when that happens, there will be one specific IOMMU (or > > > say, translation unit) stands right behind that new address space. > > > > > > This iommu_get() fetches that guy behind the address space. Here, > > > the guy is defined as IOMMUObject, which includes a notifier_list > > > so far, may extend in future. Along with IOMMUObject, a new iommu > > > notifier mechanism is introduced. It would be used for virt-svm. > > > Also IOMMUObject can further have a IOMMUObjectOps which is similar > > > to MemoryRegionOps. The difference is IOMMUObjectOps is not relied > > > on MemoryRegion. > > > > > > Signed-off-by: Peter Xu <peterx@redhat.com> > > > Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com> > > > > Hi, sorry I didn't reply to the earlier postings of this after our > > discussion in China. I've been sick several times and very busy. > > > > I still don't feel like there's an adequate explanation of exactly > > what an IOMMUObject represents. Obviously it can represent more than > > a single translation window - since that's represented by the > > IOMMUMR. But what exactly do all the MRs - or whatever else - that > > are represented by the IOMMUObject have in common, from a functional > > point of view. > > > > Even understanding the SVM stuff better than I did, I don't really see > > why an AddressSpace is an obvious unit to have an IOMMUObject > > associated with it. > > Here's what I thought about it: IOMMUObject was planned to be the > abstraction of the hardware translation unit, which is a higher level > of the translated address spaces. Say, for each PCI device, it can > have its own translated address space. However for multiple PCI > devices, they can be sharing the same translation unit that handles > the translation requests from different devices. That's the case for > Intel platforms. We introduced this IOMMUObject because sometimes we > want to do something with that translation unit rather than a specific > device, in which we need a general IOMMU device handle. Ok, but what does "hardware translation unit" mean in practice. The guest neither knows nor cares, which bits of IOMMU translation happen to be included in the same bundle of silicon. It only cares what the behaviour is. What behavioural characteristics does a single IOMMUObject have? > IIRC one issue left over during last time's discussion was that there > could be more complicated IOMMU models. E.g., one device's DMA request > can be translated nestedly by two or multiple IOMMUs, and current > proposal cannot really handle that complicated hierachy. I'm just > thinking whether we can start from a simple model (say, we don't allow > nested IOMMUs, and actually we don't even allow multiple IOMMUs so > far), then we can evolve from that point in the future. 
> > Also, I thought there were something you mentioned that this approach > is not correct for Power systems, but I can't really remember the > details... Anyways, I think this is not the only approach to solve > the problem, and I believe any new better idea would be greatly > welcomed as well. :) So, some of my initial comments were based on a misunderstanding of what was proposed here - since discussing this with Yi at LinuxCon Beijing, I have a better idea of what's going on. On POWER - or rather the "pseries" platform, which is paravirtualized. We can have multiple vIOMMU windows (usually 2) for a single virtual PCI host bridge. Because of the paravirtualization, the mapping to hardware is fuzzy, but for passthrough devices they will both be implemented by the IOMMU built into the physical host bridge. That isn't importat to the guest, though - all operations happen at the window level. The other thing that bothers me here is the way it's attached to an AddressSpace. IIUC how SVM works, the whole point is that the device no longer writes into a specific PCI address space. Instead, it writes directly into a process address space. So it seems to me more that SVM should operate at the PCI level, and disassociate the device from the normal PCI address space entirely, rather than hooking up something via that address space.
On Tue, Nov 14, 2017 at 11:59:34AM +1100, David Gibson wrote: > On Mon, Nov 13, 2017 at 04:28:45PM +0800, Peter Xu wrote: > > On Mon, Nov 13, 2017 at 04:56:01PM +1100, David Gibson wrote: > > > On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote: > > > > From: Peter Xu <peterx@redhat.com> > > > > > > > > AddressSpaceOps is similar to MemoryRegionOps, it's just for address > > > > spaces to store arch-specific hooks. > > > > > > > > The first hook I would like to introduce is iommu_get(). Return an > > > > IOMMUObject behind the AddressSpace. > > > > > > > > For systems that have IOMMUs, we will create a special address > > > > space per device which is different from system default address > > > > space for it (please refer to pci_device_iommu_address_space()). > > > > Normally when that happens, there will be one specific IOMMU (or > > > > say, translation unit) stands right behind that new address space. > > > > > > > > This iommu_get() fetches that guy behind the address space. Here, > > > > the guy is defined as IOMMUObject, which includes a notifier_list > > > > so far, may extend in future. Along with IOMMUObject, a new iommu > > > > notifier mechanism is introduced. It would be used for virt-svm. > > > > Also IOMMUObject can further have a IOMMUObjectOps which is similar > > > > to MemoryRegionOps. The difference is IOMMUObjectOps is not relied > > > > on MemoryRegion. > > > > > > > > Signed-off-by: Peter Xu <peterx@redhat.com> > > > > Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com> > > > > > > Hi, sorry I didn't reply to the earlier postings of this after our > > > discussion in China. I've been sick several times and very busy. > > > > > > I still don't feel like there's an adequate explanation of exactly > > > what an IOMMUObject represents. Obviously it can represent more than > > > a single translation window - since that's represented by the > > > IOMMUMR. But what exactly do all the MRs - or whatever else - that > > > are represented by the IOMMUObject have in common, from a functional > > > point of view. > > > > > > Even understanding the SVM stuff better than I did, I don't really see > > > why an AddressSpace is an obvious unit to have an IOMMUObject > > > associated with it. > > > > Here's what I thought about it: IOMMUObject was planned to be the > > abstraction of the hardware translation unit, which is a higher level > > of the translated address spaces. Say, for each PCI device, it can > > have its own translated address space. However for multiple PCI > > devices, they can be sharing the same translation unit that handles > > the translation requests from different devices. That's the case for > > Intel platforms. We introduced this IOMMUObject because sometimes we > > want to do something with that translation unit rather than a specific > > device, in which we need a general IOMMU device handle. > > Ok, but what does "hardware translation unit" mean in practice. The > guest neither knows nor cares, which bits of IOMMU translation happen > to be included in the same bundle of silicon. It only cares what the > behaviour is. What behavioural characteristics does a single > IOMMUObject have? In VT-d (I believe the same to ARM SMMUs), IMHO the special thing is that the translation windows (and device address spaces in QEMU) are only talking about second level translations, but not first level, while virt-svm needs to play with first level translations. Until now, AFAIU we don't really have a good interface for first level translations at all (aka. 
the process address space). > > > IIRC one issue left over during last time's discussion was that there > > could be more complicated IOMMU models. E.g., one device's DMA request > > can be translated nestedly by two or multiple IOMMUs, and current > > proposal cannot really handle that complicated hierachy. I'm just > > thinking whether we can start from a simple model (say, we don't allow > > nested IOMMUs, and actually we don't even allow multiple IOMMUs so > > far), then we can evolve from that point in the future. > > > > Also, I thought there were something you mentioned that this approach > > is not correct for Power systems, but I can't really remember the > > details... Anyways, I think this is not the only approach to solve > > the problem, and I believe any new better idea would be greatly > > welcomed as well. :) > > So, some of my initial comments were based on a misunderstanding of > what was proposed here - since discussing this with Yi at LinuxCon > Beijing, I have a better idea of what's going on. > > On POWER - or rather the "pseries" platform, which is paravirtualized. > We can have multiple vIOMMU windows (usually 2) for a single virtual > PCI host bridge. Because of the paravirtualization, the mapping to > hardware is fuzzy, but for passthrough devices they will both be > implemented by the IOMMU built into the physical host bridge. That > isn't importat to the guest, though - all operations happen at the > window level. Now I know that for Power it may not have anything like a "translation unit" but everything is defined as "translation windows" in the guests. However the problem still exist for some other platforms. Say, for Intel we have emulated VT-d; for ARM, we have vSMMU. AFAIU these platforms do have their translation units, and even for ARM it should need such an interface (or any better interfaces, which are always welcomed) for virt-svm to work. Otherwise I don't know a way to configure the first level translation tables. Meanwhile, IMO this abstraction should not really affect pseries - it should be only useful for those platforms who would like to use it. For pseries, we can just ignore that new interface if we don't really even have such a translation unit. > > The other thing that bothers me here is the way it's attached to an > AddressSpace. IIUC how SVM works, the whole point is that the device > no longer writes into a specific PCI address space. Instead, it > writes directly into a process address space. So it seems to me more > that SVM should operate at the PCI level, and disassociate the device > from the normal PCI address space entirely, rather than hooking up > something via that address space. IMO the PCI address space is still used. For virt-svm, host IOMMU will be working in nested translation mode, so we should be having two mappings working in parallel: 1. DPDK process (in guest) address space mapping (GVA -> GPA) 2. guest direct memory mappings (GPA -> HPA) And here AFAIU the 2nd mapping is working exactly like general PCI devices, the only difference is that the 2nd level mapping is always static, just like when IOMMU passthrough is enabled for that device. So, IMHO virt-SVM is not really in parallel with PCI subsystem. For the SVM in guest, it may be different, since it should only be using first level translations. However to implement virt-SVM, IMHO we not only need existing PCI address space translation logic, we also need an extra way to configure the first level mappings, as discussed. 
Thanks, > > -- > David Gibson | I'll have my music baroque, and my code > david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ > | _way_ _around_! > http://www.ozlabs.org/~dgibson
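[A rough picture of the nested (two-stage) translation described above, for a DMA issued by an SVM-aware guest process; purely illustrative, terminology as used in the thread.]

    device DMA request (gVA, PASID)
        |   1st level: per-process page tables owned by the guest,
        |   made known to the host via the new bind-PASID-table event;
        |   guest gVA iotlb flushes are propagated along this level only
        v
       gPA
        |   2nd level: gPA -> hPA mapping owned by the hypervisor,
        |   static for the device, equivalent to today's MAP/UNMAP
        |   (passthrough-like) mapping
        v
       hPA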
Hi Yi L, On 13/11/2017 10:58, Liu, Yi L wrote: > On Mon, Nov 13, 2017 at 04:56:01PM +1100, David Gibson wrote: >> On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote: >>> From: Peter Xu <peterx@redhat.com> >>> >>> AddressSpaceOps is similar to MemoryRegionOps, it's just for address >>> spaces to store arch-specific hooks. >>> >>> The first hook I would like to introduce is iommu_get(). Return an >>> IOMMUObject behind the AddressSpace. >>> >>> For systems that have IOMMUs, we will create a special address >>> space per device which is different from system default address >>> space for it (please refer to pci_device_iommu_address_space()). >>> Normally when that happens, there will be one specific IOMMU (or >>> say, translation unit) stands right behind that new address space. >>> >>> This iommu_get() fetches that guy behind the address space. Here, >>> the guy is defined as IOMMUObject, which includes a notifier_list >>> so far, may extend in future. Along with IOMMUObject, a new iommu >>> notifier mechanism is introduced. It would be used for virt-svm. >>> Also IOMMUObject can further have a IOMMUObjectOps which is similar >>> to MemoryRegionOps. The difference is IOMMUObjectOps is not relied >>> on MemoryRegion. >>> >>> Signed-off-by: Peter Xu <peterx@redhat.com> >>> Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com> >> >> Hi, sorry I didn't reply to the earlier postings of this after our >> discussion in China. I've been sick several times and very busy. > > Hi David, > > Fully understood. I'll try my best to address your question. Also, > feel free to input further questions, anyhow, the more we discuss the > better work we done. > >> I still don't feel like there's an adequate explanation of exactly >> what an IOMMUObject represents. Obviously it can represent more than > > IOMMUObject is aimed to represent the iommu itself. e.g. the iommu > specific operations. One of the key purpose of IOMMUObject is to > introduce a notifier framework to let iommu emulator to be able to > do iommu operations other than MAP/UNMAP. As IOMMU grows more and > more feature, MAP/UNMAP is not the only operation iommu emulator needs > to deal. e.g. shared virtual memory. So far, as I know AMD/ARM also > has it. may correct me on it. As my cover letter mentioned, MR based > notifier framework doesn’t work for the newly added IOMMU operations. > Like bind guest pasid table pointer to host and propagate guest's > iotlb flush to host. > >> a single translation window - since that's represented by the >> IOMMUMR. But what exactly do all the MRs - or whatever else - that >> are represented by the IOMMUObject have in common, from a functional >> point of view. > > Let me take virt-SVM as an example. As far as I know, for virt-SVM, > the implementation of different vendors are similar. The key design > is to have a nested translation(aka. two stage translation). It is to > have guest maintain gVA->gPA mapping and hypervisor builds gPA->hPA > mapping. Similar to EPT based virt-MMU solution. > > In Qemu, gPA->hPA mapping is done through MAP/UNMAP notifier, it can > keep going. But for gVA->gPA mapping, only guest knows it, so hypervisor > needs to trap specific guest iommu operation and pass the gVA->gPA > mapping knowledge to host through a notifier(newly added one). In VT-d, > it is called bind guest pasid table to host. What I don't get is the PASID table is per extended context entry. I understand this latter is indexed by PCI device function. And today MR are created per PCIe device if I am not wrong. 
So why can't we have 1 new MR notifier dedicated to PASID table passing? My understanding is the MR, having a 1-1 correspondence with a PCIe device and thus a context could be of right granularity. Then I understand the only flags we currently have are NONE, MAP and UNMAP but couldn't we add a new one for PASID TABLE passing? So this is not crystal clear to me why MR notifiers are not adapted to PASID table passing. > > Also, for the gVA iotlb flushing, only guest knows it. So hypervisor > needs to propagate it to host. Here, MAP/UNMAP is not suitable since > this gVA iotlb flush here doesn’t require to modify host iommu > translation table. I don't really get this argument. IOMMUNotifier just is a notifier that is attached to an IOMMU MR and calls a an IOMMUNotify function, right? Then the role of the function currently is attached to the currently existing flags, MAP, UNMAP. This is not linked to an action on the physical IOMMU, right? At the time gVA iotlb flush is issued, the gVA->gPA > mapping has already modified. Host iommu only needs to reference it when > performing address translation. But before host iommu perform translation, > it needs to flush the old gVA cache. In VT-d, it is called 1st level cache > flushing. The fact MR notifiers may not be relevant could be linked to the nature of the tagging of the structures you want to flush. My understanding is if you want to flush by source-id, MR granularity could be fine. Please could you clarify why do you need an iommu wide operation in that case. > > Both of the two notifiers(operations) has no direct relationship with MR, > instead they highly depends on virt-iommu itself. As described above this is not obvious to me. Please could you clarify why source-id granularity (which I understand has a 1-1 granularity with MR/AS is not the correct granularity). Of course, please correct me if my understanding of MR mapping is not correct. Thanks Eric If virt-iommu exists, > then the two notfiers are needed, if not, then it's not. > >> Even understanding the SVM stuff better than I did, I don't really see >> why an AddressSpace is an obvious unit to have an IOMMUObject >> associated with it. > > This will benefit the notifier registration. As my comments above, the > IOMMUObject is to represent iommu. Associate an AddressSpace with an > IOMMUObject makes it easy to check if it is necessary to register the > notifiers. If no IOMMUObject, means no virt-iommu exposed to guest, then > no need to register notifiers. For this, I also considered to use the > MemoryRegion.iommu_ops. e.g. for VT-d, it can be a loop to check all the > subregions and register notfiers if it is an iommu MemoryRegion. Peter > mentioned it may not work for SPAR. So he proposed associating an > AddressSpace with an IOMMUObject. I think it wroks and easier, so I > didn’t object it. > > https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg04931.html > > + /* Check if vIOMMU exists */ > + QTAILQ_FOREACH(subregion, &as->root->subregions, subregions_link) { > + if (memory_region_is_iommu(subregion)) { > + IOMMUNotifier n1; > + > + /* > + FIXME: current iommu notifier is actually designed for > + IOMMUTLB MAP/UNMAP. However, vIOMMU emulator may need > + notifiers other than MAP/UNMAP, so it'll be better to > + split the non-IOMMUTLB notifier from the current IOMMUTLB > + notifier framewrok. 
> + */ > + iommu_notifier_init(&n1, vfio_iommu_bind_pasid_tbl_notify, > + IOMMU_NOTIFIER_SVM_PASIDT_BIND, > + 0, > + 0); > + vfio_register_notifier(group->container, > + subregion, > + 0, > + &n1); > + } > + } > + > > Thanks, > Yi L > >>> --- >>> hw/core/Makefile.objs | 1 + >>> hw/core/iommu.c | 58 +++++++++++++++++++++++++++++++++++++++ >>> include/exec/memory.h | 22 +++++++++++++++ >>> include/hw/core/iommu.h | 73 +++++++++++++++++++++++++++++++++++++++++++++++++ >>> memory.c | 10 +++++-- >>> 5 files changed, 162 insertions(+), 2 deletions(-) >>> create mode 100644 hw/core/iommu.c >>> create mode 100644 include/hw/core/iommu.h >>> >>> diff --git a/hw/core/Makefile.objs b/hw/core/Makefile.objs >>> index f8d7a4a..d688412 100644 >>> --- a/hw/core/Makefile.objs >>> +++ b/hw/core/Makefile.objs >>> @@ -5,6 +5,7 @@ common-obj-y += fw-path-provider.o >>> # irq.o needed for qdev GPIO handling: >>> common-obj-y += irq.o >>> common-obj-y += hotplug.o >>> +common-obj-y += iommu.o >>> common-obj-y += nmi.o >>> >>> common-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o >>> diff --git a/hw/core/iommu.c b/hw/core/iommu.c >>> new file mode 100644 >>> index 0000000..7c4fcfe >>> --- /dev/null >>> +++ b/hw/core/iommu.c >>> @@ -0,0 +1,58 @@ >>> +/* >>> + * QEMU emulation of IOMMU logic >>> + * >>> + * Copyright (C) 2017 Red Hat Inc. >>> + * >>> + * Authors: Peter Xu <peterx@redhat.com>, >>> + * >>> + * This program is free software; you can redistribute it and/or modify >>> + * it under the terms of the GNU General Public License as published by >>> + * the Free Software Foundation; either version 2 of the License, or >>> + * (at your option) any later version. >>> + >>> + * This program is distributed in the hope that it will be useful, >>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >>> + * GNU General Public License for more details. >>> + >>> + * You should have received a copy of the GNU General Public License along >>> + * with this program; if not, see <http://www.gnu.org/licenses/>. 
>>> + */ >>> + >>> +#include "qemu/osdep.h" >>> +#include "hw/core/iommu.h" >>> + >>> +void iommu_notifier_register(IOMMUObject *iommu, >>> + IOMMUNotifier *n, >>> + IOMMUNotifyFn fn, >>> + IOMMUEvent event) >>> +{ >>> + n->event = event; >>> + n->iommu_notify = fn; >>> + QLIST_INSERT_HEAD(&iommu->iommu_notifiers, n, node); >>> + return; >>> +} >>> + >>> +void iommu_notifier_unregister(IOMMUObject *iommu, >>> + IOMMUNotifier *notifier) >>> +{ >>> + IOMMUNotifier *cur, *next; >>> + >>> + QLIST_FOREACH_SAFE(cur, &iommu->iommu_notifiers, node, next) { >>> + if (cur == notifier) { >>> + QLIST_REMOVE(cur, node); >>> + break; >>> + } >>> + } >>> +} >>> + >>> +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data) >>> +{ >>> + IOMMUNotifier *cur; >>> + >>> + QLIST_FOREACH(cur, &iommu->iommu_notifiers, node) { >>> + if ((cur->event == event_data->event) && cur->iommu_notify) { >>> + cur->iommu_notify(cur, event_data); >>> + } >>> + } >>> +} >>> diff --git a/include/exec/memory.h b/include/exec/memory.h >>> index 03595e3..8350973 100644 >>> --- a/include/exec/memory.h >>> +++ b/include/exec/memory.h >>> @@ -26,6 +26,7 @@ >>> #include "qom/object.h" >>> #include "qemu/rcu.h" >>> #include "hw/qdev-core.h" >>> +#include "hw/core/iommu.h" >>> >>> #define RAM_ADDR_INVALID (~(ram_addr_t)0) >>> >>> @@ -301,6 +302,19 @@ struct MemoryListener { >>> }; >>> >>> /** >>> + * AddressSpaceOps: callbacks structure for address space specific operations >>> + * >>> + * @iommu_get: returns an IOMMU object that backs the address space. >>> + * Normally this should be NULL for generic address >>> + * spaces, and it's only used when there is one >>> + * translation unit behind this address space. >>> + */ >>> +struct AddressSpaceOps { >>> + IOMMUObject *(*iommu_get)(AddressSpace *as); >>> +}; >>> +typedef struct AddressSpaceOps AddressSpaceOps; >>> + >>> +/** >>> * AddressSpace: describes a mapping of addresses to #MemoryRegion objects >>> */ >>> struct AddressSpace { >>> @@ -316,6 +330,7 @@ struct AddressSpace { >>> struct MemoryRegionIoeventfd *ioeventfds; >>> QTAILQ_HEAD(memory_listeners_as, MemoryListener) listeners; >>> QTAILQ_ENTRY(AddressSpace) address_spaces_link; >>> + AddressSpaceOps as_ops; >>> }; >>> >>> FlatView *address_space_to_flatview(AddressSpace *as); >>> @@ -1988,6 +2003,13 @@ address_space_write_cached(MemoryRegionCache *cache, hwaddr addr, >>> address_space_write(cache->as, cache->xlat + addr, MEMTXATTRS_UNSPECIFIED, buf, len); >>> } >>> >>> +/** >>> + * address_space_iommu_get: Get the backend IOMMU for the address space >>> + * >>> + * @as: the address space to fetch IOMMU from >>> + */ >>> +IOMMUObject *address_space_iommu_get(AddressSpace *as); >>> + >>> #endif >>> >>> #endif >>> diff --git a/include/hw/core/iommu.h b/include/hw/core/iommu.h >>> new file mode 100644 >>> index 0000000..34387c0 >>> --- /dev/null >>> +++ b/include/hw/core/iommu.h >>> @@ -0,0 +1,73 @@ >>> +/* >>> + * QEMU emulation of IOMMU logic >>> + * >>> + * Copyright (C) 2017 Red Hat Inc. >>> + * >>> + * Authors: Peter Xu <peterx@redhat.com>, >>> + * >>> + * This program is free software; you can redistribute it and/or modify >>> + * it under the terms of the GNU General Public License as published by >>> + * the Free Software Foundation; either version 2 of the License, or >>> + * (at your option) any later version. 
>>> + >>> + * This program is distributed in the hope that it will be useful, >>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >>> + * GNU General Public License for more details. >>> + >>> + * You should have received a copy of the GNU General Public License along >>> + * with this program; if not, see <http://www.gnu.org/licenses/>. >>> + */ >>> + >>> +#ifndef HW_CORE_IOMMU_H >>> +#define HW_CORE_IOMMU_H >>> + >>> +#include "qemu/queue.h" >>> + >>> +enum IOMMUEvent { >>> + IOMMU_EVENT_BIND_PASIDT, >>> +}; >>> +typedef enum IOMMUEvent IOMMUEvent; >>> + >>> +struct IOMMUEventData { >>> + IOMMUEvent event; >>> + uint64_t length; >>> + void *data; >>> +}; >>> +typedef struct IOMMUEventData IOMMUEventData; >>> + >>> +typedef struct IOMMUNotifier IOMMUNotifier; >>> + >>> +typedef void (*IOMMUNotifyFn)(IOMMUNotifier *notifier, >>> + IOMMUEventData *event_data); >>> + >>> +struct IOMMUNotifier { >>> + IOMMUNotifyFn iommu_notify; >>> + /* >>> + * What events we are listening to. Let's allow multiple event >>> + * registrations from beginning. >>> + */ >>> + IOMMUEvent event; >>> + QLIST_ENTRY(IOMMUNotifier) node; >>> +}; >>> + >>> +typedef struct IOMMUObject IOMMUObject; >>> + >>> +/* >>> + * This stands for an IOMMU unit. Any translation device should have >>> + * this struct inside its own structure to make sure it can leverage >>> + * common IOMMU functionalities. >>> + */ >>> +struct IOMMUObject { >>> + QLIST_HEAD(, IOMMUNotifier) iommu_notifiers; >>> +}; >>> + >>> +void iommu_notifier_register(IOMMUObject *iommu, >>> + IOMMUNotifier *n, >>> + IOMMUNotifyFn fn, >>> + IOMMUEvent event); >>> +void iommu_notifier_unregister(IOMMUObject *iommu, >>> + IOMMUNotifier *notifier); >>> +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data); >>> + >>> +#endif >>> diff --git a/memory.c b/memory.c >>> index 77fb3ef..307f665 100644 >>> --- a/memory.c >>> +++ b/memory.c >>> @@ -235,8 +235,6 @@ struct FlatView { >>> MemoryRegion *root; >>> }; >>> >>> -typedef struct AddressSpaceOps AddressSpaceOps; >>> - >>> #define FOR_EACH_FLAT_RANGE(var, view) \ >>> for (var = (view)->ranges; var < (view)->ranges + (view)->nr; ++var) >>> >>> @@ -2793,6 +2791,14 @@ static void do_address_space_destroy(AddressSpace *as) >>> memory_region_unref(as->root); >>> } >>> >>> +IOMMUObject *address_space_iommu_get(AddressSpace *as) >>> +{ >>> + if (!as->as_ops.iommu_get) { >>> + return NULL; >>> + } >>> + return as->as_ops.iommu_get(as); >>> +} >>> + >>> void address_space_destroy(AddressSpace *as) >>> { >>> MemoryRegion *root = as->root; >> >> -- >> David Gibson | I'll have my music baroque, and my code >> david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ >> | _way_ _around_! >> http://www.ozlabs.org/~dgibson > > >
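To make the per-device hook-up discussed above more concrete, the following is a minimal sketch (not from the posted series) of how a vIOMMU emulation could embed an IOMMUObject in its per-device address space and answer the new iommu_get() callback. The VTDAddressSpace layout and vtd_find_add_as() name are only illustrative stand-ins for whatever per-device structure the emulation already keeps:

    /*
     * Illustrative only: how an emulated IOMMU could answer
     * address_space_iommu_get() for the special per-device AS it
     * creates.  The structure and function names are assumptions.
     */
    typedef struct VTDAddressSpace {
        AddressSpace as;        /* the per-device address space          */
        IOMMUObject iommu_obj;  /* notifier list added by this series    */
    } VTDAddressSpace;

    static IOMMUObject *vtd_as_iommu_get(AddressSpace *as)
    {
        VTDAddressSpace *vtd_as = container_of(as, VTDAddressSpace, as);

        return &vtd_as->iommu_obj;
    }

    static void vtd_setup_device_as(VTDAddressSpace *vtd_as)
    {
        /* the existing code would have called address_space_init() */
        QLIST_INIT(&vtd_as->iommu_obj.iommu_notifiers);
        vtd_as->as.as_ops.iommu_get = vtd_as_iommu_get;
    }

With that wiring in place, address_space_iommu_get() on the device's address space returns the translation unit's IOMMUObject, and returns NULL for generic address spaces that never set the hook.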
Hi Yi L, On 03/11/2017 13:01, Liu, Yi L wrote: > From: Peter Xu <peterx@redhat.com> > > AddressSpaceOps is similar to MemoryRegionOps, it's just for address > spaces to store arch-specific hooks. > > The first hook I would like to introduce is iommu_get(). Return an > IOMMUObject behind the AddressSpace. David had an objection in the past about this method, saying that several IOMMUs could translate a single AS? https://lists.gnu.org/archive/html/qemu-devel/2017-05/msg01610.html On ARM I think it works in general: In https://github.com/torvalds/linux/blob/master/Documentation/devicetree/bindings/pci/pci-iommu.txt, it is said "a given PCI device can only master through one IOMMU" > > For systems that have IOMMUs, we will create a special address > space per device which is different from system default address > space for it (please refer to pci_device_iommu_address_space()). > Normally when that happens, there will be one specific IOMMU (or > say, translation unit) stands right behind that new address space. standing > > This iommu_get() fetches that guy behind the address space. Here, > the guy is defined as IOMMUObject, which includes a notifier_list > so far, may extend in future. Along with IOMMUObject, a new iommu > notifier mechanism is introduced. It would be used for virt-svm. > Also IOMMUObject can further have a IOMMUObjectOps which is similar > to MemoryRegionOps. The difference is IOMMUObjectOps is not relied relying > on MemoryRegion. > I think I would split this patch into a 1 first patch introducing the iommu object and a second adding the AS ops and address_space_iommu_get() > Signed-off-by: Peter Xu <peterx@redhat.com> > Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com> > --- > hw/core/Makefile.objs | 1 + > hw/core/iommu.c | 58 +++++++++++++++++++++++++++++++++++++++ > include/exec/memory.h | 22 +++++++++++++++ > include/hw/core/iommu.h | 73 +++++++++++++++++++++++++++++++++++++++++++++++++ > memory.c | 10 +++++-- > 5 files changed, 162 insertions(+), 2 deletions(-) > create mode 100644 hw/core/iommu.c > create mode 100644 include/hw/core/iommu.h > > diff --git a/hw/core/Makefile.objs b/hw/core/Makefile.objs > index f8d7a4a..d688412 100644 > --- a/hw/core/Makefile.objs > +++ b/hw/core/Makefile.objs > @@ -5,6 +5,7 @@ common-obj-y += fw-path-provider.o > # irq.o needed for qdev GPIO handling: > common-obj-y += irq.o > common-obj-y += hotplug.o > +common-obj-y += iommu.o > common-obj-y += nmi.o > > common-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o > diff --git a/hw/core/iommu.c b/hw/core/iommu.c > new file mode 100644 > index 0000000..7c4fcfe > --- /dev/null > +++ b/hw/core/iommu.c > @@ -0,0 +1,58 @@ > +/* > + * QEMU emulation of IOMMU logic May be rephrased as it does not really explain what the iommu object exposes as an API > + * > + * Copyright (C) 2017 Red Hat Inc. > + * > + * Authors: Peter Xu <peterx@redhat.com>, > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; either version 2 of the License, or > + * (at your option) any later version. > + > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. 
> + > + * You should have received a copy of the GNU General Public License along > + * with this program; if not, see <http://www.gnu.org/licenses/>. > + */ > + > +#include "qemu/osdep.h" > +#include "hw/core/iommu.h" > + IOMMUNotifier *n) > + > +void iommu_notifier_register(IOMMUObject *iommu, > + IOMMUNotifier *n, > + IOMMUNotifyFn fn, > + IOMMUEvent event) > +{ > + n->event = event; > + n->iommu_notify = fn; > + QLIST_INSERT_HEAD(&iommu->iommu_notifiers, n, node); > + return; > +} > + > +void iommu_notifier_unregister(IOMMUObject *iommu, > + IOMMUNotifier *notifier) > +{ > + IOMMUNotifier *cur, *next; > + > + QLIST_FOREACH_SAFE(cur, &iommu->iommu_notifiers, node, next) { > + if (cur == notifier) { > + QLIST_REMOVE(cur, node); > + break; > + } > + } > +} > + > +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data) > +{ > + IOMMUNotifier *cur; > + > + QLIST_FOREACH(cur, &iommu->iommu_notifiers, node) { > + if ((cur->event == event_data->event) && cur->iommu_notify) { can cur->iommu_notify be NULL if registered as above? > + cur->iommu_notify(cur, event_data); > + } > + } > +} > diff --git a/include/exec/memory.h b/include/exec/memory.h > index 03595e3..8350973 100644 > --- a/include/exec/memory.h > +++ b/include/exec/memory.h > @@ -26,6 +26,7 @@ > #include "qom/object.h" > #include "qemu/rcu.h" > #include "hw/qdev-core.h" > +#include "hw/core/iommu.h" > > #define RAM_ADDR_INVALID (~(ram_addr_t)0) > > @@ -301,6 +302,19 @@ struct MemoryListener { > }; > > /** > + * AddressSpaceOps: callbacks structure for address space specific operations > + * > + * @iommu_get: returns an IOMMU object that backs the address space. > + * Normally this should be NULL for generic address > + * spaces, and it's only used when there is one > + * translation unit behind this address space. > + */ > +struct AddressSpaceOps { > + IOMMUObject *(*iommu_get)(AddressSpace *as); > +}; > +typedef struct AddressSpaceOps AddressSpaceOps; > + > +/** > * AddressSpace: describes a mapping of addresses to #MemoryRegion objects > */ > struct AddressSpace { > @@ -316,6 +330,7 @@ struct AddressSpace { > struct MemoryRegionIoeventfd *ioeventfds; > QTAILQ_HEAD(memory_listeners_as, MemoryListener) listeners; > QTAILQ_ENTRY(AddressSpace) address_spaces_link; > + AddressSpaceOps as_ops; > }; > > FlatView *address_space_to_flatview(AddressSpace *as); > @@ -1988,6 +2003,13 @@ address_space_write_cached(MemoryRegionCache *cache, hwaddr addr, > address_space_write(cache->as, cache->xlat + addr, MEMTXATTRS_UNSPECIFIED, buf, len); > } > > +/** > + * address_space_iommu_get: Get the backend IOMMU for the address space > + * > + * @as: the address space to fetch IOMMU from > + */ > +IOMMUObject *address_space_iommu_get(AddressSpace *as); > + > #endif > > #endif > diff --git a/include/hw/core/iommu.h b/include/hw/core/iommu.h > new file mode 100644 > index 0000000..34387c0 > --- /dev/null > +++ b/include/hw/core/iommu.h > @@ -0,0 +1,73 @@ > +/* > + * QEMU emulation of IOMMU logic > + * > + * Copyright (C) 2017 Red Hat Inc. > + * > + * Authors: Peter Xu <peterx@redhat.com>, > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; either version 2 of the License, or > + * (at your option) any later version. 
> + > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + > + * You should have received a copy of the GNU General Public License along > + * with this program; if not, see <http://www.gnu.org/licenses/>. > + */ > + > +#ifndef HW_CORE_IOMMU_H > +#define HW_CORE_IOMMU_H > + > +#include "qemu/queue.h" > + > +enum IOMMUEvent { > + IOMMU_EVENT_BIND_PASIDT, > +}; > +typedef enum IOMMUEvent IOMMUEvent; > + > +struct IOMMUEventData { > + IOMMUEvent event; /* length and opaque data passed to notifiers */ ? > + uint64_t length; > + void *data; > +}; > +typedef struct IOMMUEventData IOMMUEventData; > + > +typedef struct IOMMUNotifier IOMMUNotifier; > + > +typedef void (*IOMMUNotifyFn)(IOMMUNotifier *notifier, > + IOMMUEventData *event_data); > + > +struct IOMMUNotifier { > + IOMMUNotifyFn iommu_notify; > + /* > + * What events we are listening to. Let's allow multiple event > + * registrations from beginning. > + */ > + IOMMUEvent event; /* the event the notifier is sensitive to ? */ > + QLIST_ENTRY(IOMMUNotifier) node; > +}; > + > +typedef struct IOMMUObject IOMMUObject; > + > +/* > + * This stands for an IOMMU unit. Any translation device should have > + * this struct inside its own structure to make sure it can leverage > + * common IOMMU functionalities. > + */ > +struct IOMMUObject { > + QLIST_HEAD(, IOMMUNotifier) iommu_notifiers; where is the QLIST_INIT supposed to be done? Thanks Eric > +}; > + > +void iommu_notifier_register(IOMMUObject *iommu, > + IOMMUNotifier *n, > + IOMMUNotifyFn fn, > + IOMMUEvent event); > +void iommu_notifier_unregister(IOMMUObject *iommu, > + IOMMUNotifier *notifier); > +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data); > + > +#endif > diff --git a/memory.c b/memory.c > index 77fb3ef..307f665 100644 > --- a/memory.c > +++ b/memory.c > @@ -235,8 +235,6 @@ struct FlatView { > MemoryRegion *root; > }; > > -typedef struct AddressSpaceOps AddressSpaceOps; > - > #define FOR_EACH_FLAT_RANGE(var, view) \ > for (var = (view)->ranges; var < (view)->ranges + (view)->nr; ++var) > > @@ -2793,6 +2791,14 @@ static void do_address_space_destroy(AddressSpace *as) > memory_region_unref(as->root); > } > > +IOMMUObject *address_space_iommu_get(AddressSpace *as) > +{ > + if (!as->as_ops.iommu_get) { > + return NULL; > + } > + return as->as_ops.iommu_get(as); > +} > + > void address_space_destroy(AddressSpace *as) > { > MemoryRegion *root = as->root; >
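On the QLIST_INIT question raised in the review above, one option is a small init helper that every translation unit calls when it embeds the IOMMUObject; the helper below is an assumption, not part of the posted patch, which currently leaves the list initialization to the embedding implementation. The register step shows the intended consumer flow with the posted API; note that because iommu_notifier_register() always stores the function pointer it is given, the NULL check in iommu_notify() only matters if a caller registers a NULL callback.

    /* Hypothetical helper, not in the posted patch. */
    static inline void iommu_object_init(IOMMUObject *iommu)
    {
        QLIST_INIT(&iommu->iommu_notifiers);
    }

    /* Typical consumer flow with the posted API. */
    static void example_pasidt_bind(IOMMUNotifier *n,
                                    IOMMUEventData *event_data)
    {
        /* event_data->data and event_data->length carry the payload */
    }

    static void example_register(IOMMUObject *iommu, IOMMUNotifier *n)
    {
        iommu_notifier_register(iommu, n, example_pasidt_bind,
                                IOMMU_EVENT_BIND_PASIDT);
    }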
On Tue, Nov 14, 2017 at 09:53:07AM +0100, Auger Eric wrote: Hi Eric, > Hi Yi L, > > On 13/11/2017 10:58, Liu, Yi L wrote: > > On Mon, Nov 13, 2017 at 04:56:01PM +1100, David Gibson wrote: > >> On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote: > >>> From: Peter Xu <peterx@redhat.com> > >>> > >>> AddressSpaceOps is similar to MemoryRegionOps, it's just for address > >>> spaces to store arch-specific hooks. > >>> > >>> The first hook I would like to introduce is iommu_get(). Return an > >>> IOMMUObject behind the AddressSpace. > >>> > >>> For systems that have IOMMUs, we will create a special address > >>> space per device which is different from system default address > >>> space for it (please refer to pci_device_iommu_address_space()). > >>> Normally when that happens, there will be one specific IOMMU (or > >>> say, translation unit) stands right behind that new address space. > >>> > >>> This iommu_get() fetches that guy behind the address space. Here, > >>> the guy is defined as IOMMUObject, which includes a notifier_list > >>> so far, may extend in future. Along with IOMMUObject, a new iommu > >>> notifier mechanism is introduced. It would be used for virt-svm. > >>> Also IOMMUObject can further have a IOMMUObjectOps which is similar > >>> to MemoryRegionOps. The difference is IOMMUObjectOps is not relied > >>> on MemoryRegion. > >>> > >>> Signed-off-by: Peter Xu <peterx@redhat.com> > >>> Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com> > >> > >> Hi, sorry I didn't reply to the earlier postings of this after our > >> discussion in China. I've been sick several times and very busy. > > > > Hi David, > > > > Fully understood. I'll try my best to address your question. Also, > > feel free to input further questions, anyhow, the more we discuss the > > better work we done. > > > >> I still don't feel like there's an adequate explanation of exactly > >> what an IOMMUObject represents. Obviously it can represent more than > > > > IOMMUObject is aimed to represent the iommu itself. e.g. the iommu > > specific operations. One of the key purpose of IOMMUObject is to > > introduce a notifier framework to let iommu emulator to be able to > > do iommu operations other than MAP/UNMAP. As IOMMU grows more and > > more feature, MAP/UNMAP is not the only operation iommu emulator needs > > to deal. e.g. shared virtual memory. So far, as I know AMD/ARM also > > has it. may correct me on it. As my cover letter mentioned, MR based > > notifier framework doesn’t work for the newly added IOMMU operations. > > Like bind guest pasid table pointer to host and propagate guest's > > iotlb flush to host. > > > >> a single translation window - since that's represented by the > >> IOMMUMR. But what exactly do all the MRs - or whatever else - that > >> are represented by the IOMMUObject have in common, from a functional > >> point of view. > > > > Let me take virt-SVM as an example. As far as I know, for virt-SVM, > > the implementation of different vendors are similar. The key design > > is to have a nested translation(aka. two stage translation). It is to > > have guest maintain gVA->gPA mapping and hypervisor builds gPA->hPA > > mapping. Similar to EPT based virt-MMU solution. > > > > In Qemu, gPA->hPA mapping is done through MAP/UNMAP notifier, it can > > keep going. But for gVA->gPA mapping, only guest knows it, so hypervisor > > needs to trap specific guest iommu operation and pass the gVA->gPA > > mapping knowledge to host through a notifier(newly added one). 
In VT-d, > > it is called bind guest pasid table to host. > > What I don't get is the PASID table is per extended context entry. I > understand this latter is indexed by PCI device function. And today MR > are created per PCIe device if I am not wrong. In my understanding, MR is more related to AddressSpace not exactly tagged with PCIe device. > So why can't we have 1 > new MR notifier dedicated to PASID table passing? My understanding is > the MR, having a 1-1 correspondence with a PCIe device and thus a > context could be of right granularity. Then I understand the only flags I didn't quite get your point regards to the "granlarity" here. May talk a little bit more here? > we currently have are NONE, MAP and UNMAP but couldn't we add a new one > for PASID TABLE passing? So this is not crystal clear to me why MR > notifiers are not adapted to PASID table passing. This is also my initial plan. You may get some detail in the link below. https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg05295.html In brief, the new notifier I want to add is not really MR related and just more like a behaviour of a translation unit. Also, as my cover letter mentioned that the MR notifiers would not be registered for VT-d(virt-svm) in region_add(). Then it requires to register MR notifiers out of the region_add(). Also paste below. I think it more or less breaks the consistency of MR notifiers. Also, I think existing MR notifiers are more related IOVA address translation(on VT-d it is 2nd level translation, for ARM is it stage 2?), and there is some existing codes highly rely on this assumption. e.g. the memory_replay introduced by Peter, the notifier node would affect the replay. If adding a non MAP/UNMAP notifier, it would break the logic. So it's also a reason to add a separate framework instead of just adding a new flag to the existing MR notifier framework. https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg04931.html + /* Check if vIOMMU exists */ + QTAILQ_FOREACH(subregion, &as->root->subregions, subregions_link) { + if (memory_region_is_iommu(subregion)) { + IOMMUNotifier n1; + + /* + FIXME: current iommu notifier is actually designed for + IOMMUTLB MAP/UNMAP. However, vIOMMU emulator may need + notifiers other than MAP/UNMAP, so it'll be better to + split the non-IOMMUTLB notifier from the current IOMMUTLB + notifier framewrok. + */ + iommu_notifier_init(&n1, vfio_iommu_bind_pasid_tbl_notify, + IOMMU_NOTIFIER_SVM_PASIDT_BIND, + 0, + 0); + vfio_register_notifier(group->container, + subregion, + 0, + &n1); + } + } + > > > > Also, for the gVA iotlb flushing, only guest knows it. So hypervisor > > needs to propagate it to host. Here, MAP/UNMAP is not suitable since > > this gVA iotlb flush here doesn’t require to modify host iommu > > translation table. > I don't really get this argument. IOMMUNotifier just is a notifier that > is attached to an IOMMU MR and calls a an IOMMUNotify function, right? yes, it is. > Then the role of the function currently is attached to the currently > existing flags, MAP, UNMAP. This is not linked to an action on the > physical IOMMU, right? The MAP/UNMAP notifier finally talks to physical IOMMU. is it? My point here is MAP/UNMAP finally would talk to physical IOMMU change the translation page table in memory. However, for virt-svm case, the translation page table is the process vaddr page table(though the I/O page table is also used we don't need to talk it since hypervisor owns it). 
process vaddr page table is owned by guest, changes to the translation page table is by guest. So for such cache, just need to flush the cache in iommu side. no need to modify translation page table. > > At the time gVA iotlb flush is issued, the gVA->gPA > > mapping has already modified. Host iommu only needs to reference it when > > performing address translation. But before host iommu perform translation, > > it needs to flush the old gVA cache. In VT-d, it is called 1st level cache > > flushing. > > The fact MR notifiers may not be relevant could be linked to the nature > of the tagging of the structures you want to flush. My understanding is > if you want to flush by source-id, MR granularity could be fine. Please > could you clarify why do you need an iommu wide operation in that case. The flush is not limited to source-id granularity, it would be page selected and others. As I mentioned, it has no requirement to modify the translation page table, so it is more like a translation unit behavior. > > > > Both of the two notifiers(operations) has no direct relationship with MR, > > instead they highly depends on virt-iommu itself. > As described above this is not obvious to me. Please could you clarify > why source-id granularity (which I understand has a 1-1 granularity with > MR/AS is not the correct granularity). Of course, please correct me if > my understanding of MR mapping is not correct. It's correct that for the PCIe device, the iova address space(aka PCI address space) has kind of 1-1 relationship with MR. But, for virt-SVM, the address space is not limted to iova address space, it has process address space, how can such an address space relate to a MR... Thanks, Yi L > > Thanks > > Eric > > If virt-iommu exists, > > then the two notfiers are needed, if not, then it's not. > > > >> Even understanding the SVM stuff better than I did, I don't really see > >> why an AddressSpace is an obvious unit to have an IOMMUObject > >> associated with it. > > > > This will benefit the notifier registration. As my comments above, the > > IOMMUObject is to represent iommu. Associate an AddressSpace with an > > IOMMUObject makes it easy to check if it is necessary to register the > > notifiers. > If no IOMMUObject, means no virt-iommu exposed to guest, then > > no need to register notifiers. For this, I also considered to use the > > MemoryRegion.iommu_ops. e.g. for VT-d, it can be a loop to check all the > > subregions and register notfiers if it is an iommu MemoryRegion. Peter > > mentioned it may not work for SPAR. So he proposed associating an > > AddressSpace with an IOMMUObject. I think it wroks and easier, so I > > didn’t object it. > > > > https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg04931.html > > > > + /* Check if vIOMMU exists */ > > + QTAILQ_FOREACH(subregion, &as->root->subregions, subregions_link) { > > + if (memory_region_is_iommu(subregion)) { > > + IOMMUNotifier n1; > > + > > + /* > > + FIXME: current iommu notifier is actually designed for > > + IOMMUTLB MAP/UNMAP. However, vIOMMU emulator may need > > + notifiers other than MAP/UNMAP, so it'll be better to > > + split the non-IOMMUTLB notifier from the current IOMMUTLB > > + notifier framewrok. 
> > + */ > > + iommu_notifier_init(&n1, vfio_iommu_bind_pasid_tbl_notify, > > + IOMMU_NOTIFIER_SVM_PASIDT_BIND, > > + 0, > > + 0); > > + vfio_register_notifier(group->container, > > + subregion, > > + 0, > > + &n1); > > + } > > + } > > + > > > > Thanks, > > Yi L > > > >>> --- > >>> hw/core/Makefile.objs | 1 + > >>> hw/core/iommu.c | 58 +++++++++++++++++++++++++++++++++++++++ > >>> include/exec/memory.h | 22 +++++++++++++++ > >>> include/hw/core/iommu.h | 73 +++++++++++++++++++++++++++++++++++++++++++++++++ > >>> memory.c | 10 +++++-- > >>> 5 files changed, 162 insertions(+), 2 deletions(-) > >>> create mode 100644 hw/core/iommu.c > >>> create mode 100644 include/hw/core/iommu.h > >>> > >>> diff --git a/hw/core/Makefile.objs b/hw/core/Makefile.objs > >>> index f8d7a4a..d688412 100644 > >>> --- a/hw/core/Makefile.objs > >>> +++ b/hw/core/Makefile.objs > >>> @@ -5,6 +5,7 @@ common-obj-y += fw-path-provider.o > >>> # irq.o needed for qdev GPIO handling: > >>> common-obj-y += irq.o > >>> common-obj-y += hotplug.o > >>> +common-obj-y += iommu.o > >>> common-obj-y += nmi.o > >>> > >>> common-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o > >>> diff --git a/hw/core/iommu.c b/hw/core/iommu.c > >>> new file mode 100644 > >>> index 0000000..7c4fcfe > >>> --- /dev/null > >>> +++ b/hw/core/iommu.c > >>> @@ -0,0 +1,58 @@ > >>> +/* > >>> + * QEMU emulation of IOMMU logic > >>> + * > >>> + * Copyright (C) 2017 Red Hat Inc. > >>> + * > >>> + * Authors: Peter Xu <peterx@redhat.com>, > >>> + * > >>> + * This program is free software; you can redistribute it and/or modify > >>> + * it under the terms of the GNU General Public License as published by > >>> + * the Free Software Foundation; either version 2 of the License, or > >>> + * (at your option) any later version. > >>> + > >>> + * This program is distributed in the hope that it will be useful, > >>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of > >>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > >>> + * GNU General Public License for more details. > >>> + > >>> + * You should have received a copy of the GNU General Public License along > >>> + * with this program; if not, see <http://www.gnu.org/licenses/>. 
> >>> + */ > >>> + > >>> +#include "qemu/osdep.h" > >>> +#include "hw/core/iommu.h" > >>> + > >>> +void iommu_notifier_register(IOMMUObject *iommu, > >>> + IOMMUNotifier *n, > >>> + IOMMUNotifyFn fn, > >>> + IOMMUEvent event) > >>> +{ > >>> + n->event = event; > >>> + n->iommu_notify = fn; > >>> + QLIST_INSERT_HEAD(&iommu->iommu_notifiers, n, node); > >>> + return; > >>> +} > >>> + > >>> +void iommu_notifier_unregister(IOMMUObject *iommu, > >>> + IOMMUNotifier *notifier) > >>> +{ > >>> + IOMMUNotifier *cur, *next; > >>> + > >>> + QLIST_FOREACH_SAFE(cur, &iommu->iommu_notifiers, node, next) { > >>> + if (cur == notifier) { > >>> + QLIST_REMOVE(cur, node); > >>> + break; > >>> + } > >>> + } > >>> +} > >>> + > >>> +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data) > >>> +{ > >>> + IOMMUNotifier *cur; > >>> + > >>> + QLIST_FOREACH(cur, &iommu->iommu_notifiers, node) { > >>> + if ((cur->event == event_data->event) && cur->iommu_notify) { > >>> + cur->iommu_notify(cur, event_data); > >>> + } > >>> + } > >>> +} > >>> diff --git a/include/exec/memory.h b/include/exec/memory.h > >>> index 03595e3..8350973 100644 > >>> --- a/include/exec/memory.h > >>> +++ b/include/exec/memory.h > >>> @@ -26,6 +26,7 @@ > >>> #include "qom/object.h" > >>> #include "qemu/rcu.h" > >>> #include "hw/qdev-core.h" > >>> +#include "hw/core/iommu.h" > >>> > >>> #define RAM_ADDR_INVALID (~(ram_addr_t)0) > >>> > >>> @@ -301,6 +302,19 @@ struct MemoryListener { > >>> }; > >>> > >>> /** > >>> + * AddressSpaceOps: callbacks structure for address space specific operations > >>> + * > >>> + * @iommu_get: returns an IOMMU object that backs the address space. > >>> + * Normally this should be NULL for generic address > >>> + * spaces, and it's only used when there is one > >>> + * translation unit behind this address space. > >>> + */ > >>> +struct AddressSpaceOps { > >>> + IOMMUObject *(*iommu_get)(AddressSpace *as); > >>> +}; > >>> +typedef struct AddressSpaceOps AddressSpaceOps; > >>> + > >>> +/** > >>> * AddressSpace: describes a mapping of addresses to #MemoryRegion objects > >>> */ > >>> struct AddressSpace { > >>> @@ -316,6 +330,7 @@ struct AddressSpace { > >>> struct MemoryRegionIoeventfd *ioeventfds; > >>> QTAILQ_HEAD(memory_listeners_as, MemoryListener) listeners; > >>> QTAILQ_ENTRY(AddressSpace) address_spaces_link; > >>> + AddressSpaceOps as_ops; > >>> }; > >>> > >>> FlatView *address_space_to_flatview(AddressSpace *as); > >>> @@ -1988,6 +2003,13 @@ address_space_write_cached(MemoryRegionCache *cache, hwaddr addr, > >>> address_space_write(cache->as, cache->xlat + addr, MEMTXATTRS_UNSPECIFIED, buf, len); > >>> } > >>> > >>> +/** > >>> + * address_space_iommu_get: Get the backend IOMMU for the address space > >>> + * > >>> + * @as: the address space to fetch IOMMU from > >>> + */ > >>> +IOMMUObject *address_space_iommu_get(AddressSpace *as); > >>> + > >>> #endif > >>> > >>> #endif > >>> diff --git a/include/hw/core/iommu.h b/include/hw/core/iommu.h > >>> new file mode 100644 > >>> index 0000000..34387c0 > >>> --- /dev/null > >>> +++ b/include/hw/core/iommu.h > >>> @@ -0,0 +1,73 @@ > >>> +/* > >>> + * QEMU emulation of IOMMU logic > >>> + * > >>> + * Copyright (C) 2017 Red Hat Inc. 
> >>> + * > >>> + * Authors: Peter Xu <peterx@redhat.com>, > >>> + * > >>> + * This program is free software; you can redistribute it and/or modify > >>> + * it under the terms of the GNU General Public License as published by > >>> + * the Free Software Foundation; either version 2 of the License, or > >>> + * (at your option) any later version. > >>> + > >>> + * This program is distributed in the hope that it will be useful, > >>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of > >>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > >>> + * GNU General Public License for more details. > >>> + > >>> + * You should have received a copy of the GNU General Public License along > >>> + * with this program; if not, see <http://www.gnu.org/licenses/>. > >>> + */ > >>> + > >>> +#ifndef HW_CORE_IOMMU_H > >>> +#define HW_CORE_IOMMU_H > >>> + > >>> +#include "qemu/queue.h" > >>> + > >>> +enum IOMMUEvent { > >>> + IOMMU_EVENT_BIND_PASIDT, > >>> +}; > >>> +typedef enum IOMMUEvent IOMMUEvent; > >>> + > >>> +struct IOMMUEventData { > >>> + IOMMUEvent event; > >>> + uint64_t length; > >>> + void *data; > >>> +}; > >>> +typedef struct IOMMUEventData IOMMUEventData; > >>> + > >>> +typedef struct IOMMUNotifier IOMMUNotifier; > >>> + > >>> +typedef void (*IOMMUNotifyFn)(IOMMUNotifier *notifier, > >>> + IOMMUEventData *event_data); > >>> + > >>> +struct IOMMUNotifier { > >>> + IOMMUNotifyFn iommu_notify; > >>> + /* > >>> + * What events we are listening to. Let's allow multiple event > >>> + * registrations from beginning. > >>> + */ > >>> + IOMMUEvent event; > >>> + QLIST_ENTRY(IOMMUNotifier) node; > >>> +}; > >>> + > >>> +typedef struct IOMMUObject IOMMUObject; > >>> + > >>> +/* > >>> + * This stands for an IOMMU unit. Any translation device should have > >>> + * this struct inside its own structure to make sure it can leverage > >>> + * common IOMMU functionalities. > >>> + */ > >>> +struct IOMMUObject { > >>> + QLIST_HEAD(, IOMMUNotifier) iommu_notifiers; > >>> +}; > >>> + > >>> +void iommu_notifier_register(IOMMUObject *iommu, > >>> + IOMMUNotifier *n, > >>> + IOMMUNotifyFn fn, > >>> + IOMMUEvent event); > >>> +void iommu_notifier_unregister(IOMMUObject *iommu, > >>> + IOMMUNotifier *notifier); > >>> +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data); > >>> + > >>> +#endif > >>> diff --git a/memory.c b/memory.c > >>> index 77fb3ef..307f665 100644 > >>> --- a/memory.c > >>> +++ b/memory.c > >>> @@ -235,8 +235,6 @@ struct FlatView { > >>> MemoryRegion *root; > >>> }; > >>> > >>> -typedef struct AddressSpaceOps AddressSpaceOps; > >>> - > >>> #define FOR_EACH_FLAT_RANGE(var, view) \ > >>> for (var = (view)->ranges; var < (view)->ranges + (view)->nr; ++var) > >>> > >>> @@ -2793,6 +2791,14 @@ static void do_address_space_destroy(AddressSpace *as) > >>> memory_region_unref(as->root); > >>> } > >>> > >>> +IOMMUObject *address_space_iommu_get(AddressSpace *as) > >>> +{ > >>> + if (!as->as_ops.iommu_get) { > >>> + return NULL; > >>> + } > >>> + return as->as_ops.iommu_get(as); > >>> +} > >>> + > >>> void address_space_destroy(AddressSpace *as) > >>> { > >>> MemoryRegion *root = as->root; > >> > >> -- > >> David Gibson | I'll have my music baroque, and my code > >> david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ > >> | _way_ _around_! > >> http://www.ozlabs.org/~dgibson > > > > > > >
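For the bind-PASID-table path argued over in the exchange above, the producer side in the vIOMMU emulation would look roughly like the sketch below: when the guest programs its PASID table pointer, the emulator packages it into an IOMMUEventData and fires iommu_notify(). The GuestPasidtBind structure and its fields are assumptions for illustration; the real payload layout would follow whatever the host kernel interface expects.

    /* Illustration only: the payload layout is an assumption. */
    typedef struct GuestPasidtBind {
        uint64_t pasidt_base;  /* guest physical address of the table */
        uint32_t pasidt_size;  /* number of entries                   */
    } GuestPasidtBind;

    static void vtd_report_pasidt_bind(IOMMUObject *iommu,
                                       uint64_t base, uint32_t size)
    {
        GuestPasidtBind bind = {
            .pasidt_base = base,
            .pasidt_size = size,
        };
        IOMMUEventData event_data = {
            .event  = IOMMU_EVENT_BIND_PASIDT,
            .length = sizeof(bind),
            .data   = &bind,
        };

        /* Walks iommu->iommu_notifiers and calls every notifier
         * registered for IOMMU_EVENT_BIND_PASIDT, e.g. the VFIO one. */
        iommu_notify(iommu, &event_data);
    }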
Hi Eric, On Tue, Nov 14, 2017 at 11:21:59AM +0100, Auger Eric wrote: > Hi Yi L, > > On 03/11/2017 13:01, Liu, Yi L wrote: > > From: Peter Xu <peterx@redhat.com> > > > > AddressSpaceOps is similar to MemoryRegionOps, it's just for address > > spaces to store arch-specific hooks. > > > > The first hook I would like to introduce is iommu_get(). Return an > > IOMMUObject behind the AddressSpace. > > David had an objection in the past about this method, saying that > several IOMMUs could translate a single AS? > > https://lists.gnu.org/archive/html/qemu-devel/2017-05/msg01610.html > > On ARM I think it works in general: > In > https://github.com/torvalds/linux/blob/master/Documentation/devicetree/bindings/pci/pci-iommu.txt, > it is said > "a given PCI device can only master through one IOMMU" > > > > For systems that have IOMMUs, we will create a special address > > space per device which is different from system default address > > space for it (please refer to pci_device_iommu_address_space()). > > Normally when that happens, there will be one specific IOMMU (or > > say, translation unit) stands right behind that new address space. > standing > > > > This iommu_get() fetches that guy behind the address space. Here, > > the guy is defined as IOMMUObject, which includes a notifier_list > > so far, may extend in future. Along with IOMMUObject, a new iommu > > notifier mechanism is introduced. It would be used for virt-svm. > > Also IOMMUObject can further have a IOMMUObjectOps which is similar > > to MemoryRegionOps. The difference is IOMMUObjectOps is not relied > relying > > on MemoryRegion. > > > I think I would split this patch into a 1 first patch introducing the > iommu object and a second adding the AS ops and address_space_iommu_get() Good point. > > Signed-off-by: Peter Xu <peterx@redhat.com> > > Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com> > > --- > > hw/core/Makefile.objs | 1 + > > hw/core/iommu.c | 58 +++++++++++++++++++++++++++++++++++++++ > > include/exec/memory.h | 22 +++++++++++++++ > > include/hw/core/iommu.h | 73 +++++++++++++++++++++++++++++++++++++++++++++++++ > > memory.c | 10 +++++-- > > 5 files changed, 162 insertions(+), 2 deletions(-) > > create mode 100644 hw/core/iommu.c > > create mode 100644 include/hw/core/iommu.h > > > > diff --git a/hw/core/Makefile.objs b/hw/core/Makefile.objs > > index f8d7a4a..d688412 100644 > > --- a/hw/core/Makefile.objs > > +++ b/hw/core/Makefile.objs > > @@ -5,6 +5,7 @@ common-obj-y += fw-path-provider.o > > # irq.o needed for qdev GPIO handling: > > common-obj-y += irq.o > > common-obj-y += hotplug.o > > +common-obj-y += iommu.o > > common-obj-y += nmi.o > > > > common-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o > > diff --git a/hw/core/iommu.c b/hw/core/iommu.c > > new file mode 100644 > > index 0000000..7c4fcfe > > --- /dev/null > > +++ b/hw/core/iommu.c > > @@ -0,0 +1,58 @@ > > +/* > > + * QEMU emulation of IOMMU logic > May be rephrased as it does not really explain what the iommu object > exposes as an API yes, may need to rephrase to address it clear. > > + * > > + * Copyright (C) 2017 Red Hat Inc. > > + * > > + * Authors: Peter Xu <peterx@redhat.com>, > > + * > > + * This program is free software; you can redistribute it and/or modify > > + * it under the terms of the GNU General Public License as published by > > + * the Free Software Foundation; either version 2 of the License, or > > + * (at your option) any later version. 
> > + > > + * This program is distributed in the hope that it will be useful, > > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > > + * GNU General Public License for more details. > > + > > + * You should have received a copy of the GNU General Public License along > > + * with this program; if not, see <http://www.gnu.org/licenses/>. > > + */ > > + > > +#include "qemu/osdep.h" > > +#include "hw/core/iommu.h" > > > + IOMMUNotifier *n) > > + > > +void iommu_notifier_register(IOMMUObject *iommu, > > + IOMMUNotifier *n, > > + IOMMUNotifyFn fn, > > + IOMMUEvent event) > > +{ > > + n->event = event; > > + n->iommu_notify = fn; > > + QLIST_INSERT_HEAD(&iommu->iommu_notifiers, n, node); > > + return; > > +} > > + > > +void iommu_notifier_unregister(IOMMUObject *iommu, > > + IOMMUNotifier *notifier) > > +{ > > + IOMMUNotifier *cur, *next; > > + > > + QLIST_FOREACH_SAFE(cur, &iommu->iommu_notifiers, node, next) { > > + if (cur == notifier) { > > + QLIST_REMOVE(cur, node); > > + break; > > + } > > + } > > +} > > + > > +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data) > > +{ > > + IOMMUNotifier *cur; > > + > > + QLIST_FOREACH(cur, &iommu->iommu_notifiers, node) { > > + if ((cur->event == event_data->event) && cur->iommu_notify) { > can cur->iommu_notify be NULL if registered as above? the cur->event is actually initialized with a event and also a iommu_notify. So if the cur->event meets the requested event type, then it should be non-NULL. > > + cur->iommu_notify(cur, event_data); > > + } > > + } > > +} > > diff --git a/include/exec/memory.h b/include/exec/memory.h > > index 03595e3..8350973 100644 > > --- a/include/exec/memory.h > > +++ b/include/exec/memory.h > > @@ -26,6 +26,7 @@ > > #include "qom/object.h" > > #include "qemu/rcu.h" > > #include "hw/qdev-core.h" > > +#include "hw/core/iommu.h" > > > > #define RAM_ADDR_INVALID (~(ram_addr_t)0) > > > > @@ -301,6 +302,19 @@ struct MemoryListener { > > }; > > > > /** > > + * AddressSpaceOps: callbacks structure for address space specific operations > > + * > > + * @iommu_get: returns an IOMMU object that backs the address space. > > + * Normally this should be NULL for generic address > > + * spaces, and it's only used when there is one > > + * translation unit behind this address space. 
> > + */ > > +struct AddressSpaceOps { > > + IOMMUObject *(*iommu_get)(AddressSpace *as); > > +}; > > +typedef struct AddressSpaceOps AddressSpaceOps; > > + > > +/** > > * AddressSpace: describes a mapping of addresses to #MemoryRegion objects > > */ > > struct AddressSpace { > > @@ -316,6 +330,7 @@ struct AddressSpace { > > struct MemoryRegionIoeventfd *ioeventfds; > > QTAILQ_HEAD(memory_listeners_as, MemoryListener) listeners; > > QTAILQ_ENTRY(AddressSpace) address_spaces_link; > > + AddressSpaceOps as_ops; > > }; > > > > FlatView *address_space_to_flatview(AddressSpace *as); > > @@ -1988,6 +2003,13 @@ address_space_write_cached(MemoryRegionCache *cache, hwaddr addr, > > address_space_write(cache->as, cache->xlat + addr, MEMTXATTRS_UNSPECIFIED, buf, len); > > } > > > > +/** > > + * address_space_iommu_get: Get the backend IOMMU for the address space > > + * > > + * @as: the address space to fetch IOMMU from > > + */ > > +IOMMUObject *address_space_iommu_get(AddressSpace *as); > > + > > #endif > > > > #endif > > diff --git a/include/hw/core/iommu.h b/include/hw/core/iommu.h > > new file mode 100644 > > index 0000000..34387c0 > > --- /dev/null > > +++ b/include/hw/core/iommu.h > > @@ -0,0 +1,73 @@ > > +/* > > + * QEMU emulation of IOMMU logic > > + * > > + * Copyright (C) 2017 Red Hat Inc. > > + * > > + * Authors: Peter Xu <peterx@redhat.com>, > > + * > > + * This program is free software; you can redistribute it and/or modify > > + * it under the terms of the GNU General Public License as published by > > + * the Free Software Foundation; either version 2 of the License, or > > + * (at your option) any later version. > > + > > + * This program is distributed in the hope that it will be useful, > > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > > + * GNU General Public License for more details. > > + > > + * You should have received a copy of the GNU General Public License along > > + * with this program; if not, see <http://www.gnu.org/licenses/>. > > + */ > > + > > +#ifndef HW_CORE_IOMMU_H > > +#define HW_CORE_IOMMU_H > > + > > +#include "qemu/queue.h" > > + > > +enum IOMMUEvent { > > + IOMMU_EVENT_BIND_PASIDT, > > +}; > > +typedef enum IOMMUEvent IOMMUEvent; > > + > > +struct IOMMUEventData { > > + IOMMUEvent event; > /* length and opaque data passed to notifiers */ ? yes, it is. But the "void *data" would be replaced with well defined structure. No plan to do it as opaque. Once this patchset is accepted, later patchset would define it as; struct IOMMUEventData { IOMMUEvent event; uint64_t length; union { struct pasid_table_config pasidt_info; struct tlb_invalidate_info tlb_info; }; }; typedef struct IOMMUEventData IOMMUEventData; This is in my further patchset. Currently, we want to show the idea of introducing this new notifier framework. > > + uint64_t length; > > + void *data; > > +}; > > +typedef struct IOMMUEventData IOMMUEventData; > > + > > +typedef struct IOMMUNotifier IOMMUNotifier; > > + > > +typedef void (*IOMMUNotifyFn)(IOMMUNotifier *notifier, > > + IOMMUEventData *event_data); > > + > > +struct IOMMUNotifier { > > + IOMMUNotifyFn iommu_notify; > > + /* > > + * What events we are listening to. Let's allow multiple event > > + * registrations from beginning. > > + */ > > + IOMMUEvent event; > /* the event the notifier is sensitive to ? */ events(aka. operations) like bind pasid table/flush 1st level tlb. etc. 
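Written out as a compilable sketch, the non-opaque IOMMUEventData described in the reply above would look roughly like the following; the members of pasid_table_config and tlb_invalidate_info are placeholders here, since the real layouts would mirror the host kernel's IOMMU/SVM UAPI:

    /* Sketch of the follow-up definition; payload fields are placeholders. */
    struct pasid_table_config {
        uint64_t base_ptr;
        uint32_t pasid_bits;
    };

    struct tlb_invalidate_info {
        uint32_t granularity;
        uint64_t addr;
        uint64_t size;
    };

    struct IOMMUEventData {
        IOMMUEvent event;
        uint64_t length;
        union {
            struct pasid_table_config pasidt_info;
            struct tlb_invalidate_info tlb_info;
        };
    };
    typedef struct IOMMUEventData IOMMUEventData;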
> > + QLIST_ENTRY(IOMMUNotifier) node; > > +}; > > + > > +typedef struct IOMMUObject IOMMUObject; > > + > > +/* > > + * This stands for an IOMMU unit. Any translation device should have > > + * this struct inside its own structure to make sure it can leverage > > + * common IOMMU functionalities. > > + */ > > +struct IOMMUObject { > > + QLIST_HEAD(, IOMMUNotifier) iommu_notifiers; > where is the QLIST_INIT supposed to be done? yeah, need to add it accordingly. Thanks, Yi L > Thanks > > Eric > > +}; > > + > > +void iommu_notifier_register(IOMMUObject *iommu, > > + IOMMUNotifier *n, > > + IOMMUNotifyFn fn, > > + IOMMUEvent event); > > +void iommu_notifier_unregister(IOMMUObject *iommu, > > + IOMMUNotifier *notifier); > > +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data); > > + > > +#endif > > diff --git a/memory.c b/memory.c > > index 77fb3ef..307f665 100644 > > --- a/memory.c > > +++ b/memory.c > > @@ -235,8 +235,6 @@ struct FlatView { > > MemoryRegion *root; > > }; > > > > -typedef struct AddressSpaceOps AddressSpaceOps; > > - > > #define FOR_EACH_FLAT_RANGE(var, view) \ > > for (var = (view)->ranges; var < (view)->ranges + (view)->nr; ++var) > > > > @@ -2793,6 +2791,14 @@ static void do_address_space_destroy(AddressSpace *as) > > memory_region_unref(as->root); > > } > > > > +IOMMUObject *address_space_iommu_get(AddressSpace *as) > > +{ > > + if (!as->as_ops.iommu_get) { > > + return NULL; > > + } > > + return as->as_ops.iommu_get(as); > > +} > > + > > void address_space_destroy(AddressSpace *as) > > { > > MemoryRegion *root = as->root; > > >
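On the consumer side, the VFIO callback named in the FIXME snippet quoted earlier in this thread (vfio_iommu_bind_pasid_tbl_notify) would, with the new framework, be an IOMMUNotifyFn along the lines of the sketch below. The wrapper structure, the opaque container pointer and the vfio_container_bind_pasid_table() call are all assumptions; the actual kernel-facing interface is not part of this patch.

    /* Illustrative VFIO consumer; names below are placeholders. */
    typedef struct VFIOPasidtNotifier {
        IOMMUNotifier n;
        void *container;   /* the VFIO container this device sits in */
    } VFIOPasidtNotifier;

    static void vfio_iommu_bind_pasid_tbl_notify(IOMMUNotifier *n,
                                                 IOMMUEventData *event_data)
    {
        VFIOPasidtNotifier *pn = container_of(n, VFIOPasidtNotifier, n);

        if (event_data->event != IOMMU_EVENT_BIND_PASIDT) {
            return;
        }
        /* Hand the guest PASID table description to the host IOMMU
         * driver; vfio_container_bind_pasid_table() is a placeholder
         * for whatever interface the later patches add. */
        vfio_container_bind_pasid_table(pn->container,
                                        event_data->data,
                                        event_data->length);
    }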
Hi Yi L, On 14/11/2017 14:59, Liu, Yi L wrote: > On Tue, Nov 14, 2017 at 09:53:07AM +0100, Auger Eric wrote: > Hi Eric, > >> Hi Yi L, >> >> On 13/11/2017 10:58, Liu, Yi L wrote: >>> On Mon, Nov 13, 2017 at 04:56:01PM +1100, David Gibson wrote: >>>> On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote: >>>>> From: Peter Xu <peterx@redhat.com> >>>>> >>>>> AddressSpaceOps is similar to MemoryRegionOps, it's just for address >>>>> spaces to store arch-specific hooks. >>>>> >>>>> The first hook I would like to introduce is iommu_get(). Return an >>>>> IOMMUObject behind the AddressSpace. >>>>> >>>>> For systems that have IOMMUs, we will create a special address >>>>> space per device which is different from system default address >>>>> space for it (please refer to pci_device_iommu_address_space()). >>>>> Normally when that happens, there will be one specific IOMMU (or >>>>> say, translation unit) stands right behind that new address space. >>>>> >>>>> This iommu_get() fetches that guy behind the address space. Here, >>>>> the guy is defined as IOMMUObject, which includes a notifier_list >>>>> so far, may extend in future. Along with IOMMUObject, a new iommu >>>>> notifier mechanism is introduced. It would be used for virt-svm. >>>>> Also IOMMUObject can further have a IOMMUObjectOps which is similar >>>>> to MemoryRegionOps. The difference is IOMMUObjectOps is not relied >>>>> on MemoryRegion. >>>>> >>>>> Signed-off-by: Peter Xu <peterx@redhat.com> >>>>> Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com> >>>> >>>> Hi, sorry I didn't reply to the earlier postings of this after our >>>> discussion in China. I've been sick several times and very busy. >>> >>> Hi David, >>> >>> Fully understood. I'll try my best to address your question. Also, >>> feel free to input further questions, anyhow, the more we discuss the >>> better work we done. >>> >>>> I still don't feel like there's an adequate explanation of exactly >>>> what an IOMMUObject represents. Obviously it can represent more than >>> >>> IOMMUObject is aimed to represent the iommu itself. e.g. the iommu >>> specific operations. One of the key purpose of IOMMUObject is to >>> introduce a notifier framework to let iommu emulator to be able to >>> do iommu operations other than MAP/UNMAP. As IOMMU grows more and >>> more feature, MAP/UNMAP is not the only operation iommu emulator needs >>> to deal. e.g. shared virtual memory. So far, as I know AMD/ARM also >>> has it. may correct me on it. As my cover letter mentioned, MR based >>> notifier framework doesn’t work for the newly added IOMMU operations. >>> Like bind guest pasid table pointer to host and propagate guest's >>> iotlb flush to host. >>> >>>> a single translation window - since that's represented by the >>>> IOMMUMR. But what exactly do all the MRs - or whatever else - that >>>> are represented by the IOMMUObject have in common, from a functional >>>> point of view. >>> >>> Let me take virt-SVM as an example. As far as I know, for virt-SVM, >>> the implementation of different vendors are similar. The key design >>> is to have a nested translation(aka. two stage translation). It is to >>> have guest maintain gVA->gPA mapping and hypervisor builds gPA->hPA >>> mapping. Similar to EPT based virt-MMU solution. >>> >>> In Qemu, gPA->hPA mapping is done through MAP/UNMAP notifier, it can >>> keep going. 
But for gVA->gPA mapping, only guest knows it, so hypervisor >>> needs to trap specific guest iommu operation and pass the gVA->gPA >>> mapping knowledge to host through a notifier(newly added one). In VT-d, >>> it is called bind guest pasid table to host. >> >> What I don't get is the PASID table is per extended context entry. I >> understand this latter is indexed by PCI device function. And today MR >> are created per PCIe device if I am not wrong. > > In my understanding, MR is more related to AddressSpace not exactly tagged > with PCIe device. I meant, in the current intel_iommu code, vtd_find_add_as() creates 1 IOMMU MR and 1 AS per PCIe device, right? > >> So why can't we have 1 >> new MR notifier dedicated to PASID table passing? My understanding is >> the MR, having a 1-1 correspondence with a PCIe device and thus a >> context could be of right granularity. Then I understand the only flags > > I didn't quite get your point regards to the "granlarity" here. May talk > a little bit more here? The PASID table is per device (contained by extended context which is dev/fn indexed). The "record_device" notifier also is attached to a specific PCIe device. So we can't really say they have an iommu wide scope (PCIe device granularity would fit). However I understand from below explanation that TLB invalidate notifier is not especially tight to a given source-id as we are going to invalidate by PASID/page. I think the main justification behind introducing this new framework is that PT is set along with SVM and in this case the IOMMU MR notifiers are not registered since the IOMMU MR is disabled. So new notifiers do not fit nicely in current framework. > >> we currently have are NONE, MAP and UNMAP but couldn't we add a new one >> for PASID TABLE passing? So this is not crystal clear to me why MR >> notifiers are not adapted to PASID table passing. > > This is also my initial plan. You may get some detail in the link below. > https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg05295.html > > In brief, the new notifier I want to add is not really MR related and > just more like a behaviour of a translation unit. Also, as my cover letter > mentioned that the MR notifiers would not be registered for VT-d(virt-svm) > in region_add(). Then it requires to register MR notifiers out of the > region_add(). Yes to me this is the main issue: IOMMU MR are not registered in PT mode. Commit message of "[RFC PATCH 06/20] VFIO: add new notifier for binding PASID table" helped me to introduce the problem, although I did not quite understand what you call "vtd_address_space". Also paste below. I think it more or less breaks the > consistency of MR notifiers. Also, I think existing MR notifiers are more > related IOVA address translation(on VT-d it is 2nd level translation, for ARM > is it stage 2?), Well, level 1 input address is IOVA as well. On ARM, IOVA -> IPA*=GPA is stage1 and GPA -> HPA is stage2. So I would rather say that for ARM, MR relates to stage1 which is the one emulated by vSMMU along with VFIO. *IPA = intermediate physical address. and there is some existing codes highly rely on this > assumption. e.g. the memory_replay introduced by Peter, the notifier node > would affect the replay. If adding a non MAP/UNMAP notifier, it would break > the logic. So it's also a reason to add a separate framework instead of > just adding a new flag to the existing MR notifier framework. 
> > https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg04931.html > > + /* Check if vIOMMU exists */ > + QTAILQ_FOREACH(subregion, &as->root->subregions, subregions_link) { > + if (memory_region_is_iommu(subregion)) { > + IOMMUNotifier n1; > + > + /* > + FIXME: current iommu notifier is actually designed for > + IOMMUTLB MAP/UNMAP. However, vIOMMU emulator may need > + notifiers other than MAP/UNMAP, so it'll be better to > + split the non-IOMMUTLB notifier from the current IOMMUTLB > + notifier framewrok. > + */ > + iommu_notifier_init(&n1, vfio_iommu_bind_pasid_tbl_notify, > + IOMMU_NOTIFIER_SVM_PASIDT_BIND, > + 0, > + 0); > + vfio_register_notifier(group->container, > + subregion, > + 0, > + &n1); > + } > + } > + > > >>> >>> Also, for the gVA iotlb flushing, only guest knows it. So hypervisor >>> needs to propagate it to host. Here, MAP/UNMAP is not suitable since >>> this gVA iotlb flush here doesn’t require to modify host iommu >>> translation table. >> I don't really get this argument. IOMMUNotifier just is a notifier that >> is attached to an IOMMU MR and calls a an IOMMUNotify function, right? > > yes, it is. > >> Then the role of the function currently is attached to the currently >> existing flags, MAP, UNMAP. This is not linked to an action on the >> physical IOMMU, right? > > The MAP/UNMAP notifier finally talks to physical IOMMU. is it? My point > here is MAP/UNMAP finally would talk to physical IOMMU change the translation > page table in memory. yes it programs the pIOMMU with combined translation stages. However, for virt-svm case, the translation page > table is the process vaddr page table(though the I/O page table is also used > we don't need to talk it since hypervisor owns it). process vaddr page table > is owned by guest, changes to the translation page table is by guest. So for > such cache, just need to flush the cache in iommu side. no need to modify > translation page table. agreed but the MR notifier is not necessarily supposed to touch table, right? > >> >> At the time gVA iotlb flush is issued, the gVA->gPA >>> mapping has already modified. Host iommu only needs to reference it when >>> performing address translation. But before host iommu perform translation, >>> it needs to flush the old gVA cache. In VT-d, it is called 1st level cache >>> flushing. >> >> The fact MR notifiers may not be relevant could be linked to the nature >> of the tagging of the structures you want to flush. My understanding is >> if you want to flush by source-id, MR granularity could be fine. Please >> could you clarify why do you need an iommu wide operation in that case. > > The flush is not limited to source-id granularity, it would be page selected > and others. As I mentioned, it has no requirement to modify the translation > page table, so it is more like a translation unit behavior. we could also enumerate all MR notifiers - assuming they are registered! - and call notifiers for each of them, right? Maybe this is not the strongest argument. > >>> >>> Both of the two notifiers(operations) has no direct relationship with MR, >>> instead they highly depends on virt-iommu itself. >> As described above this is not obvious to me. Please could you clarify >> why source-id granularity (which I understand has a 1-1 granularity with >> MR/AS is not the correct granularity). Of course, please correct me if >> my understanding of MR mapping is not correct. > > It's correct that for the PCIe device, the iova address space(aka PCI address > space) has kind of 1-1 relationship with MR. 
But, for virt-SVM, the address > space is not limted to iova address space, it has process address space, how > can such an address space relate to a MR... So, this discussion, reading again the cover letter, and digging into your original series helped me understand the need for those new notifiers. What confused me originally was the "iommu" wide notifiers. Maybe you should clarify your cover letter by explicitly saying that - SVM works along with PT = 1 - if PT = 1 IOMMU MR are disabled so MR notifier are not registered - new notifiers do not fit nicely in this framework as they need to be registered even if PT = 1 - having those notifiers attached to the whole IOMMU object abstraction is not an issue Please correct me if I said anything wrong and please apologize if I brought any further confusion. Thanks Eric > > Thanks, > Yi L >> >> Thanks >> >> Eric >> >> If virt-iommu exists, >>> then the two notfiers are needed, if not, then it's not. >>> >>>> Even understanding the SVM stuff better than I did, I don't really see >>>> why an AddressSpace is an obvious unit to have an IOMMUObject >>>> associated with it. >>> >>> This will benefit the notifier registration. As my comments above, the >>> IOMMUObject is to represent iommu. Associate an AddressSpace with an >>> IOMMUObject makes it easy to check if it is necessary to register the >>> notifiers. >> If no IOMMUObject, means no virt-iommu exposed to guest, then >>> no need to register notifiers. For this, I also considered to use the >>> MemoryRegion.iommu_ops. e.g. for VT-d, it can be a loop to check all the >>> subregions and register notfiers if it is an iommu MemoryRegion. Peter >>> mentioned it may not work for SPAR. So he proposed associating an >>> AddressSpace with an IOMMUObject. I think it wroks and easier, so I >>> didn’t object it. >>> >>> https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg04931.html >>> >>> + /* Check if vIOMMU exists */ >>> + QTAILQ_FOREACH(subregion, &as->root->subregions, subregions_link) { >>> + if (memory_region_is_iommu(subregion)) { >>> + IOMMUNotifier n1; >>> + >>> + /* >>> + FIXME: current iommu notifier is actually designed for >>> + IOMMUTLB MAP/UNMAP. However, vIOMMU emulator may need >>> + notifiers other than MAP/UNMAP, so it'll be better to >>> + split the non-IOMMUTLB notifier from the current IOMMUTLB >>> + notifier framewrok. 
>>> + */ >>> + iommu_notifier_init(&n1, vfio_iommu_bind_pasid_tbl_notify, >>> + IOMMU_NOTIFIER_SVM_PASIDT_BIND, >>> + 0, >>> + 0); >>> + vfio_register_notifier(group->container, >>> + subregion, >>> + 0, >>> + &n1); >>> + } >>> + } >>> + >>> >>> Thanks, >>> Yi L >>> >>>>> --- >>>>> hw/core/Makefile.objs | 1 + >>>>> hw/core/iommu.c | 58 +++++++++++++++++++++++++++++++++++++++ >>>>> include/exec/memory.h | 22 +++++++++++++++ >>>>> include/hw/core/iommu.h | 73 +++++++++++++++++++++++++++++++++++++++++++++++++ >>>>> memory.c | 10 +++++-- >>>>> 5 files changed, 162 insertions(+), 2 deletions(-) >>>>> create mode 100644 hw/core/iommu.c >>>>> create mode 100644 include/hw/core/iommu.h >>>>> >>>>> diff --git a/hw/core/Makefile.objs b/hw/core/Makefile.objs >>>>> index f8d7a4a..d688412 100644 >>>>> --- a/hw/core/Makefile.objs >>>>> +++ b/hw/core/Makefile.objs >>>>> @@ -5,6 +5,7 @@ common-obj-y += fw-path-provider.o >>>>> # irq.o needed for qdev GPIO handling: >>>>> common-obj-y += irq.o >>>>> common-obj-y += hotplug.o >>>>> +common-obj-y += iommu.o >>>>> common-obj-y += nmi.o >>>>> >>>>> common-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o >>>>> diff --git a/hw/core/iommu.c b/hw/core/iommu.c >>>>> new file mode 100644 >>>>> index 0000000..7c4fcfe >>>>> --- /dev/null >>>>> +++ b/hw/core/iommu.c >>>>> @@ -0,0 +1,58 @@ >>>>> +/* >>>>> + * QEMU emulation of IOMMU logic >>>>> + * >>>>> + * Copyright (C) 2017 Red Hat Inc. >>>>> + * >>>>> + * Authors: Peter Xu <peterx@redhat.com>, >>>>> + * >>>>> + * This program is free software; you can redistribute it and/or modify >>>>> + * it under the terms of the GNU General Public License as published by >>>>> + * the Free Software Foundation; either version 2 of the License, or >>>>> + * (at your option) any later version. >>>>> + >>>>> + * This program is distributed in the hope that it will be useful, >>>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >>>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >>>>> + * GNU General Public License for more details. >>>>> + >>>>> + * You should have received a copy of the GNU General Public License along >>>>> + * with this program; if not, see <http://www.gnu.org/licenses/>. 
>>>>> + */ >>>>> + >>>>> +#include "qemu/osdep.h" >>>>> +#include "hw/core/iommu.h" >>>>> + >>>>> +void iommu_notifier_register(IOMMUObject *iommu, >>>>> + IOMMUNotifier *n, >>>>> + IOMMUNotifyFn fn, >>>>> + IOMMUEvent event) >>>>> +{ >>>>> + n->event = event; >>>>> + n->iommu_notify = fn; >>>>> + QLIST_INSERT_HEAD(&iommu->iommu_notifiers, n, node); >>>>> + return; >>>>> +} >>>>> + >>>>> +void iommu_notifier_unregister(IOMMUObject *iommu, >>>>> + IOMMUNotifier *notifier) >>>>> +{ >>>>> + IOMMUNotifier *cur, *next; >>>>> + >>>>> + QLIST_FOREACH_SAFE(cur, &iommu->iommu_notifiers, node, next) { >>>>> + if (cur == notifier) { >>>>> + QLIST_REMOVE(cur, node); >>>>> + break; >>>>> + } >>>>> + } >>>>> +} >>>>> + >>>>> +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data) >>>>> +{ >>>>> + IOMMUNotifier *cur; >>>>> + >>>>> + QLIST_FOREACH(cur, &iommu->iommu_notifiers, node) { >>>>> + if ((cur->event == event_data->event) && cur->iommu_notify) { >>>>> + cur->iommu_notify(cur, event_data); >>>>> + } >>>>> + } >>>>> +} >>>>> diff --git a/include/exec/memory.h b/include/exec/memory.h >>>>> index 03595e3..8350973 100644 >>>>> --- a/include/exec/memory.h >>>>> +++ b/include/exec/memory.h >>>>> @@ -26,6 +26,7 @@ >>>>> #include "qom/object.h" >>>>> #include "qemu/rcu.h" >>>>> #include "hw/qdev-core.h" >>>>> +#include "hw/core/iommu.h" >>>>> >>>>> #define RAM_ADDR_INVALID (~(ram_addr_t)0) >>>>> >>>>> @@ -301,6 +302,19 @@ struct MemoryListener { >>>>> }; >>>>> >>>>> /** >>>>> + * AddressSpaceOps: callbacks structure for address space specific operations >>>>> + * >>>>> + * @iommu_get: returns an IOMMU object that backs the address space. >>>>> + * Normally this should be NULL for generic address >>>>> + * spaces, and it's only used when there is one >>>>> + * translation unit behind this address space. >>>>> + */ >>>>> +struct AddressSpaceOps { >>>>> + IOMMUObject *(*iommu_get)(AddressSpace *as); >>>>> +}; >>>>> +typedef struct AddressSpaceOps AddressSpaceOps; >>>>> + >>>>> +/** >>>>> * AddressSpace: describes a mapping of addresses to #MemoryRegion objects >>>>> */ >>>>> struct AddressSpace { >>>>> @@ -316,6 +330,7 @@ struct AddressSpace { >>>>> struct MemoryRegionIoeventfd *ioeventfds; >>>>> QTAILQ_HEAD(memory_listeners_as, MemoryListener) listeners; >>>>> QTAILQ_ENTRY(AddressSpace) address_spaces_link; >>>>> + AddressSpaceOps as_ops; >>>>> }; >>>>> >>>>> FlatView *address_space_to_flatview(AddressSpace *as); >>>>> @@ -1988,6 +2003,13 @@ address_space_write_cached(MemoryRegionCache *cache, hwaddr addr, >>>>> address_space_write(cache->as, cache->xlat + addr, MEMTXATTRS_UNSPECIFIED, buf, len); >>>>> } >>>>> >>>>> +/** >>>>> + * address_space_iommu_get: Get the backend IOMMU for the address space >>>>> + * >>>>> + * @as: the address space to fetch IOMMU from >>>>> + */ >>>>> +IOMMUObject *address_space_iommu_get(AddressSpace *as); >>>>> + >>>>> #endif >>>>> >>>>> #endif >>>>> diff --git a/include/hw/core/iommu.h b/include/hw/core/iommu.h >>>>> new file mode 100644 >>>>> index 0000000..34387c0 >>>>> --- /dev/null >>>>> +++ b/include/hw/core/iommu.h >>>>> @@ -0,0 +1,73 @@ >>>>> +/* >>>>> + * QEMU emulation of IOMMU logic >>>>> + * >>>>> + * Copyright (C) 2017 Red Hat Inc. 
>>>>> + * >>>>> + * Authors: Peter Xu <peterx@redhat.com>, >>>>> + * >>>>> + * This program is free software; you can redistribute it and/or modify >>>>> + * it under the terms of the GNU General Public License as published by >>>>> + * the Free Software Foundation; either version 2 of the License, or >>>>> + * (at your option) any later version. >>>>> + >>>>> + * This program is distributed in the hope that it will be useful, >>>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >>>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >>>>> + * GNU General Public License for more details. >>>>> + >>>>> + * You should have received a copy of the GNU General Public License along >>>>> + * with this program; if not, see <http://www.gnu.org/licenses/>. >>>>> + */ >>>>> + >>>>> +#ifndef HW_CORE_IOMMU_H >>>>> +#define HW_CORE_IOMMU_H >>>>> + >>>>> +#include "qemu/queue.h" >>>>> + >>>>> +enum IOMMUEvent { >>>>> + IOMMU_EVENT_BIND_PASIDT, >>>>> +}; >>>>> +typedef enum IOMMUEvent IOMMUEvent; >>>>> + >>>>> +struct IOMMUEventData { >>>>> + IOMMUEvent event; >>>>> + uint64_t length; >>>>> + void *data; >>>>> +}; >>>>> +typedef struct IOMMUEventData IOMMUEventData; >>>>> + >>>>> +typedef struct IOMMUNotifier IOMMUNotifier; >>>>> + >>>>> +typedef void (*IOMMUNotifyFn)(IOMMUNotifier *notifier, >>>>> + IOMMUEventData *event_data); >>>>> + >>>>> +struct IOMMUNotifier { >>>>> + IOMMUNotifyFn iommu_notify; >>>>> + /* >>>>> + * What events we are listening to. Let's allow multiple event >>>>> + * registrations from beginning. >>>>> + */ >>>>> + IOMMUEvent event; >>>>> + QLIST_ENTRY(IOMMUNotifier) node; >>>>> +}; >>>>> + >>>>> +typedef struct IOMMUObject IOMMUObject; >>>>> + >>>>> +/* >>>>> + * This stands for an IOMMU unit. Any translation device should have >>>>> + * this struct inside its own structure to make sure it can leverage >>>>> + * common IOMMU functionalities. >>>>> + */ >>>>> +struct IOMMUObject { >>>>> + QLIST_HEAD(, IOMMUNotifier) iommu_notifiers; >>>>> +}; >>>>> + >>>>> +void iommu_notifier_register(IOMMUObject *iommu, >>>>> + IOMMUNotifier *n, >>>>> + IOMMUNotifyFn fn, >>>>> + IOMMUEvent event); >>>>> +void iommu_notifier_unregister(IOMMUObject *iommu, >>>>> + IOMMUNotifier *notifier); >>>>> +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data); >>>>> + >>>>> +#endif >>>>> diff --git a/memory.c b/memory.c >>>>> index 77fb3ef..307f665 100644 >>>>> --- a/memory.c >>>>> +++ b/memory.c >>>>> @@ -235,8 +235,6 @@ struct FlatView { >>>>> MemoryRegion *root; >>>>> }; >>>>> >>>>> -typedef struct AddressSpaceOps AddressSpaceOps; >>>>> - >>>>> #define FOR_EACH_FLAT_RANGE(var, view) \ >>>>> for (var = (view)->ranges; var < (view)->ranges + (view)->nr; ++var) >>>>> >>>>> @@ -2793,6 +2791,14 @@ static void do_address_space_destroy(AddressSpace *as) >>>>> memory_region_unref(as->root); >>>>> } >>>>> >>>>> +IOMMUObject *address_space_iommu_get(AddressSpace *as) >>>>> +{ >>>>> + if (!as->as_ops.iommu_get) { >>>>> + return NULL; >>>>> + } >>>>> + return as->as_ops.iommu_get(as); >>>>> +} >>>>> + >>>>> void address_space_destroy(AddressSpace *as) >>>>> { >>>>> MemoryRegion *root = as->root; >>>> >>>> -- >>>> David Gibson | I'll have my music baroque, and my code >>>> david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ >>>> | _way_ _around_! >>>> http://www.ozlabs.org/~dgibson >>> >>> >>> >> >
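To make the summary above concrete: with PT = 1 there is no IOMMU MR notifier to piggy-back on, so the bind-PASID-table hook has to be reachable through the address space itself. Below is a minimal sketch of how a consumer such as VFIO might use the API from the patch quoted above; only address_space_iommu_get(), iommu_notifier_register() and IOMMU_EVENT_BIND_PASIDT are taken from the series, the callback and helper names are invented for illustration.

#include "qemu/osdep.h"
#include "exec/memory.h"
#include "hw/core/iommu.h"

/* Illustrative callback: the series would have VFIO forward the guest
 * PASID table pointer carried in event_data->data to the host IOMMU. */
static void example_bind_pasid_tbl_notify(IOMMUNotifier *n,
                                          IOMMUEventData *event_data)
{
    /* hand event_data->data / event_data->length down to the host here */
}

/* Register behind a device's DMA address space, whether or not PT = 1. */
static void example_register_svm_notifier(AddressSpace *as, IOMMUNotifier *n)
{
    IOMMUObject *iommu = address_space_iommu_get(as);

    if (!iommu) {
        return;     /* no vIOMMU behind this address space */
    }
    iommu_notifier_register(iommu, n, example_bind_pasid_tbl_notify,
                            IOMMU_EVENT_BIND_PASIDT);
}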
Hi Eric, On Tue, Nov 14, 2017 at 10:52:54PM +0100, Auger Eric wrote: > Hi Yi L, > > On 14/11/2017 14:59, Liu, Yi L wrote: > > On Tue, Nov 14, 2017 at 09:53:07AM +0100, Auger Eric wrote: > > Hi Eric, > > > >> Hi Yi L, > >> > >> On 13/11/2017 10:58, Liu, Yi L wrote: > >>> On Mon, Nov 13, 2017 at 04:56:01PM +1100, David Gibson wrote: > >>>> On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote: > >>>>> From: Peter Xu <peterx@redhat.com> > >>>>> > >>>>> AddressSpaceOps is similar to MemoryRegionOps, it's just for address > >>>>> spaces to store arch-specific hooks. > >>>>> > >>>>> The first hook I would like to introduce is iommu_get(). Return an > >>>>> IOMMUObject behind the AddressSpace. > >>>>> > >>>>> For systems that have IOMMUs, we will create a special address > >>>>> space per device which is different from system default address > >>>>> space for it (please refer to pci_device_iommu_address_space()). > >>>>> Normally when that happens, there will be one specific IOMMU (or > >>>>> say, translation unit) stands right behind that new address space. > >>>>> > >>>>> This iommu_get() fetches that guy behind the address space. Here, > >>>>> the guy is defined as IOMMUObject, which includes a notifier_list > >>>>> so far, may extend in future. Along with IOMMUObject, a new iommu > >>>>> notifier mechanism is introduced. It would be used for virt-svm. > >>>>> Also IOMMUObject can further have a IOMMUObjectOps which is similar > >>>>> to MemoryRegionOps. The difference is IOMMUObjectOps is not relied > >>>>> on MemoryRegion. > >>>>> > >>>>> Signed-off-by: Peter Xu <peterx@redhat.com> > >>>>> Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com> > >>>> > >>>> Hi, sorry I didn't reply to the earlier postings of this after our > >>>> discussion in China. I've been sick several times and very busy. > >>> > >>> Hi David, > >>> > >>> Fully understood. I'll try my best to address your question. Also, > >>> feel free to input further questions, anyhow, the more we discuss the > >>> better work we done. > >>> > >>>> I still don't feel like there's an adequate explanation of exactly > >>>> what an IOMMUObject represents. Obviously it can represent more than > >>> > >>> IOMMUObject is aimed to represent the iommu itself. e.g. the iommu > >>> specific operations. One of the key purpose of IOMMUObject is to > >>> introduce a notifier framework to let iommu emulator to be able to > >>> do iommu operations other than MAP/UNMAP. As IOMMU grows more and > >>> more feature, MAP/UNMAP is not the only operation iommu emulator needs > >>> to deal. e.g. shared virtual memory. So far, as I know AMD/ARM also > >>> has it. may correct me on it. As my cover letter mentioned, MR based > >>> notifier framework doesn’t work for the newly added IOMMU operations. > >>> Like bind guest pasid table pointer to host and propagate guest's > >>> iotlb flush to host. > >>> > >>>> a single translation window - since that's represented by the > >>>> IOMMUMR. But what exactly do all the MRs - or whatever else - that > >>>> are represented by the IOMMUObject have in common, from a functional > >>>> point of view. > >>> > >>> Let me take virt-SVM as an example. As far as I know, for virt-SVM, > >>> the implementation of different vendors are similar. The key design > >>> is to have a nested translation(aka. two stage translation). It is to > >>> have guest maintain gVA->gPA mapping and hypervisor builds gPA->hPA > >>> mapping. Similar to EPT based virt-MMU solution. 
> >>> > >>> In Qemu, gPA->hPA mapping is done through MAP/UNMAP notifier, it can > >>> keep going. But for gVA->gPA mapping, only guest knows it, so hypervisor > >>> needs to trap specific guest iommu operation and pass the gVA->gPA > >>> mapping knowledge to host through a notifier(newly added one). In VT-d, > >>> it is called bind guest pasid table to host. > >> > >> What I don't get is the PASID table is per extended context entry. I > >> understand this latter is indexed by PCI device function. And today MR > >> are created per PCIe device if I am not wrong. > > > > In my understanding, MR is more related to AddressSpace not exactly tagged > > with PCIe device. > I meant, in the current intel_iommu code, vtd_find_add_as() creates 1 > IOMMU MR and 1 AS per PCIe device, right? yes, it is. This is the PCIe device address space, it's can be guest's physical address space if no vIOMMU exposed. Or it can be the guset IOVA address space if vIOMMU is epxosed. Both the two address space is treated as 2nd level transaltion in VT-d, which is different from the 1st level translation which is for process vaddr space. > > > >> So why can't we have 1 > >> new MR notifier dedicated to PASID table passing? My understanding is > >> the MR, having a 1-1 correspondence with a PCIe device and thus a > >> context could be of right granularity. Then I understand the only flags > > > > I didn't quite get your point regards to the "granlarity" here. May talk > > a little bit more here? > The PASID table is per device (contained by extended context which is > dev/fn indexed). The "record_device" notifier also is attached to a > specific PCIe device. So we can't really say they have an iommu wide > scope (PCIe device granularity would fit). However I understand from > below explanation that TLB invalidate notifier is not especially tight > to a given source-id as we are going to invalidate by PASID/page. > correct, it has no tight relationship to the "granlarity" here for the discussion to introduce this new notifer framework. > I think the main justification behind introducing this new framework is > that PT is set along with SVM and in this case the IOMMU MR notifiers > are not registered since the IOMMU MR is disabled. > > So new notifiers do not fit nicely in current framework. exactly, that's what I mean. Although we have tricky to fix it, but the trick is still a trick. Better to have a new and dedicate framework. > > > >> we currently have are NONE, MAP and UNMAP but couldn't we add a new one > >> for PASID TABLE passing? So this is not crystal clear to me why MR > >> notifiers are not adapted to PASID table passing. > > > > This is also my initial plan. You may get some detail in the link below. > > https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg05295.html > > > > In brief, the new notifier I want to add is not really MR related and > > just more like a behaviour of a translation unit. Also, as my cover letter > > mentioned that the MR notifiers would not be registered for VT-d(virt-svm) > > in region_add(). Then it requires to register MR notifiers out of the > > region_add(). > > Yes to me this is the main issue: IOMMU MR are not registered in PT > mode. Commit message of "[RFC PATCH 06/20] VFIO: add new notifier for > binding PASID table" helped me to introduce the problem, although I did > not quite understand what you call "vtd_address_space". "vtd_address_space" is a wrapper of the DMA AddressSpace of a device behind a virtual VT-d. > Also paste below. 
I think it more or less breaks the > > consistency of MR notifiers. Also, I think existing MR notifiers are more > > related IOVA address translation(on VT-d it is 2nd level translation, for ARM > > is it stage 2?), > Well, level 1 input address is IOVA as well. > > On ARM, IOVA -> IPA*=GPA is stage1 and GPA -> HPA is stage2. So I would > rather say that for ARM, MR relates to stage1 which is the one emulated > by vSMMU along with VFIO. > *IPA = intermediate physical address. thx for correction. BTW. if you have virt-SVM on ARM, will stage 1 cover the gVA->gPA/IPA translation? > and there is some existing codes highly rely on this > > assumption. e.g. the memory_replay introduced by Peter, the notifier node > > would affect the replay. If adding a non MAP/UNMAP notifier, it would break > > the logic. So it's also a reason to add a separate framework instead of > > just adding a new flag to the existing MR notifier framework. > > > > > > https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg04931.html > > > > + /* Check if vIOMMU exists */ > > + QTAILQ_FOREACH(subregion, &as->root->subregions, subregions_link) { > > + if (memory_region_is_iommu(subregion)) { > > + IOMMUNotifier n1; > > + > > + /* > > + FIXME: current iommu notifier is actually designed for > > + IOMMUTLB MAP/UNMAP. However, vIOMMU emulator may need > > + notifiers other than MAP/UNMAP, so it'll be better to > > + split the non-IOMMUTLB notifier from the current IOMMUTLB > > + notifier framewrok. > > + */ > > + iommu_notifier_init(&n1, vfio_iommu_bind_pasid_tbl_notify, > > + IOMMU_NOTIFIER_SVM_PASIDT_BIND, > > + 0, > > + 0); > > + vfio_register_notifier(group->container, > > + subregion, > > + 0, > > + &n1); > > + } > > + } > > + > > > > > >>> > >>> Also, for the gVA iotlb flushing, only guest knows it. So hypervisor > >>> needs to propagate it to host. Here, MAP/UNMAP is not suitable since > >>> this gVA iotlb flush here doesn’t require to modify host iommu > >>> translation table. > >> I don't really get this argument. IOMMUNotifier just is a notifier that > >> is attached to an IOMMU MR and calls a an IOMMUNotify function, right? > > > > yes, it is. > > > >> Then the role of the function currently is attached to the currently > >> existing flags, MAP, UNMAP. This is not linked to an action on the > >> physical IOMMU, right? > > > > The MAP/UNMAP notifier finally talks to physical IOMMU. is it? My point > > here is MAP/UNMAP finally would talk to physical IOMMU change the translation > > page table in memory. > yes it programs the pIOMMU with combined translation stages. > However, for virt-svm case, the translation page > > table is the process vaddr page table(though the I/O page table is also used > > we don't need to talk it since hypervisor owns it). process vaddr page table > > is owned by guest, changes to the translation page table is by guest. So for > > such cache, just need to flush the cache in iommu side. no need to modify > > translation page table. > agreed but the MR notifier is not necessarily supposed to touch table, > right? Not sure for other platform, for VT-d, the existing MAP/UNMAP notifier would not touch the page table in Qemu, but they would touch through VFIO APIs. And I think it's necessary when it's registered. e.g. when virtual VT-d is exposed, guest can use guset IOVA, requires MAP/UNMAP APIs to shadow the mapping guest iommu driver builds. > > > >> > >> At the time gVA iotlb flush is issued, the gVA->gPA > >>> mapping has already modified. 
Host iommu only needs to reference it when > >>> performing address translation. But before host iommu perform translation, > >>> it needs to flush the old gVA cache. In VT-d, it is called 1st level cache > >>> flushing. > >> > >> The fact MR notifiers may not be relevant could be linked to the nature > >> of the tagging of the structures you want to flush. My understanding is > >> if you want to flush by source-id, MR granularity could be fine. Please > >> could you clarify why do you need an iommu wide operation in that case. > > > > The flush is not limited to source-id granularity, it would be page selected > > and others. As I mentioned, it has no requirement to modify the translation > > page table, so it is more like a translation unit behavior. > we could also enumerate all MR notifiers - assuming they are registered! > - and call notifiers for each of them, right? Maybe this is not the > strongest argument. yes, it's not the strongest argument for the implementaion. But for design, it is a strong argument to address the necessarity across different platforms. > > > >>> > >>> Both of the two notifiers(operations) has no direct relationship with MR, > >>> instead they highly depends on virt-iommu itself. > >> As described above this is not obvious to me. Please could you clarify > >> why source-id granularity (which I understand has a 1-1 granularity with > >> MR/AS is not the correct granularity). Of course, please correct me if > >> my understanding of MR mapping is not correct. > > > > It's correct that for the PCIe device, the iova address space(aka PCI address > > space) has kind of 1-1 relationship with MR. But, for virt-SVM, the address > > space is not limted to iova address space, it has process address space, how > > can such an address space relate to a MR... > > So, this discussion, reading again the cover letter, and digging into > your original series helped me understand the need for those new > notifiers. What confused me originally was the "iommu" wide notifiers. > Maybe you should clarify your cover letter by explicitly saying that > - SVM works along with PT = 1 > - if PT = 1 IOMMU MR are disabled so MR notifier are not registered > - new notifiers do not fit nicely in this framework as they need to be > registered even if PT = 1 > - having those notifiers attached to the whole IOMMU object abstraction > is not an issue > > Please correct me if I said anything wrong and please apologize if I > brought any further confusion. yes, you're right. should have written it in a better manner. Would update the cover-letter accordingly in next version. Also, feel free to let me know if it meets the requirements of the pv work you're doing. Thanks, Yi L > > Thanks > > Eric > > > > Thanks, > > Yi L > >> > >> Thanks > >> > >> Eric > >> > >> If virt-iommu exists, > >>> then the two notfiers are needed, if not, then it's not. > >>> > >>>> Even understanding the SVM stuff better than I did, I don't really see > >>>> why an AddressSpace is an obvious unit to have an IOMMUObject > >>>> associated with it. > >>> > >>> This will benefit the notifier registration. As my comments above, the > >>> IOMMUObject is to represent iommu. Associate an AddressSpace with an > >>> IOMMUObject makes it easy to check if it is necessary to register the > >>> notifiers. > >> If no IOMMUObject, means no virt-iommu exposed to guest, then > >>> no need to register notifiers. For this, I also considered to use the > >>> MemoryRegion.iommu_ops. e.g. 
for VT-d, it can be a loop to check all the > >>> subregions and register notfiers if it is an iommu MemoryRegion. Peter > >>> mentioned it may not work for SPAR. So he proposed associating an > >>> AddressSpace with an IOMMUObject. I think it wroks and easier, so I > >>> didn’t object it. > >>> > >>> https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg04931.html > >>> > >>> + /* Check if vIOMMU exists */ > >>> + QTAILQ_FOREACH(subregion, &as->root->subregions, subregions_link) { > >>> + if (memory_region_is_iommu(subregion)) { > >>> + IOMMUNotifier n1; > >>> + > >>> + /* > >>> + FIXME: current iommu notifier is actually designed for > >>> + IOMMUTLB MAP/UNMAP. However, vIOMMU emulator may need > >>> + notifiers other than MAP/UNMAP, so it'll be better to > >>> + split the non-IOMMUTLB notifier from the current IOMMUTLB > >>> + notifier framewrok. > >>> + */ > >>> + iommu_notifier_init(&n1, vfio_iommu_bind_pasid_tbl_notify, > >>> + IOMMU_NOTIFIER_SVM_PASIDT_BIND, > >>> + 0, > >>> + 0); > >>> + vfio_register_notifier(group->container, > >>> + subregion, > >>> + 0, > >>> + &n1); > >>> + } > >>> + } > >>> + > >>> > >>> Thanks, > >>> Yi L > >>> > >>>>> --- > >>>>> hw/core/Makefile.objs | 1 + > >>>>> hw/core/iommu.c | 58 +++++++++++++++++++++++++++++++++++++++ > >>>>> include/exec/memory.h | 22 +++++++++++++++ > >>>>> include/hw/core/iommu.h | 73 +++++++++++++++++++++++++++++++++++++++++++++++++ > >>>>> memory.c | 10 +++++-- > >>>>> 5 files changed, 162 insertions(+), 2 deletions(-) > >>>>> create mode 100644 hw/core/iommu.c > >>>>> create mode 100644 include/hw/core/iommu.h > >>>>> > >>>>> diff --git a/hw/core/Makefile.objs b/hw/core/Makefile.objs > >>>>> index f8d7a4a..d688412 100644 > >>>>> --- a/hw/core/Makefile.objs > >>>>> +++ b/hw/core/Makefile.objs > >>>>> @@ -5,6 +5,7 @@ common-obj-y += fw-path-provider.o > >>>>> # irq.o needed for qdev GPIO handling: > >>>>> common-obj-y += irq.o > >>>>> common-obj-y += hotplug.o > >>>>> +common-obj-y += iommu.o > >>>>> common-obj-y += nmi.o > >>>>> > >>>>> common-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o > >>>>> diff --git a/hw/core/iommu.c b/hw/core/iommu.c > >>>>> new file mode 100644 > >>>>> index 0000000..7c4fcfe > >>>>> --- /dev/null > >>>>> +++ b/hw/core/iommu.c > >>>>> @@ -0,0 +1,58 @@ > >>>>> +/* > >>>>> + * QEMU emulation of IOMMU logic > >>>>> + * > >>>>> + * Copyright (C) 2017 Red Hat Inc. > >>>>> + * > >>>>> + * Authors: Peter Xu <peterx@redhat.com>, > >>>>> + * > >>>>> + * This program is free software; you can redistribute it and/or modify > >>>>> + * it under the terms of the GNU General Public License as published by > >>>>> + * the Free Software Foundation; either version 2 of the License, or > >>>>> + * (at your option) any later version. > >>>>> + > >>>>> + * This program is distributed in the hope that it will be useful, > >>>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of > >>>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > >>>>> + * GNU General Public License for more details. > >>>>> + > >>>>> + * You should have received a copy of the GNU General Public License along > >>>>> + * with this program; if not, see <http://www.gnu.org/licenses/>. 
> >>>>> + */ > >>>>> + > >>>>> +#include "qemu/osdep.h" > >>>>> +#include "hw/core/iommu.h" > >>>>> + > >>>>> +void iommu_notifier_register(IOMMUObject *iommu, > >>>>> + IOMMUNotifier *n, > >>>>> + IOMMUNotifyFn fn, > >>>>> + IOMMUEvent event) > >>>>> +{ > >>>>> + n->event = event; > >>>>> + n->iommu_notify = fn; > >>>>> + QLIST_INSERT_HEAD(&iommu->iommu_notifiers, n, node); > >>>>> + return; > >>>>> +} > >>>>> + > >>>>> +void iommu_notifier_unregister(IOMMUObject *iommu, > >>>>> + IOMMUNotifier *notifier) > >>>>> +{ > >>>>> + IOMMUNotifier *cur, *next; > >>>>> + > >>>>> + QLIST_FOREACH_SAFE(cur, &iommu->iommu_notifiers, node, next) { > >>>>> + if (cur == notifier) { > >>>>> + QLIST_REMOVE(cur, node); > >>>>> + break; > >>>>> + } > >>>>> + } > >>>>> +} > >>>>> + > >>>>> +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data) > >>>>> +{ > >>>>> + IOMMUNotifier *cur; > >>>>> + > >>>>> + QLIST_FOREACH(cur, &iommu->iommu_notifiers, node) { > >>>>> + if ((cur->event == event_data->event) && cur->iommu_notify) { > >>>>> + cur->iommu_notify(cur, event_data); > >>>>> + } > >>>>> + } > >>>>> +} > >>>>> diff --git a/include/exec/memory.h b/include/exec/memory.h > >>>>> index 03595e3..8350973 100644 > >>>>> --- a/include/exec/memory.h > >>>>> +++ b/include/exec/memory.h > >>>>> @@ -26,6 +26,7 @@ > >>>>> #include "qom/object.h" > >>>>> #include "qemu/rcu.h" > >>>>> #include "hw/qdev-core.h" > >>>>> +#include "hw/core/iommu.h" > >>>>> > >>>>> #define RAM_ADDR_INVALID (~(ram_addr_t)0) > >>>>> > >>>>> @@ -301,6 +302,19 @@ struct MemoryListener { > >>>>> }; > >>>>> > >>>>> /** > >>>>> + * AddressSpaceOps: callbacks structure for address space specific operations > >>>>> + * > >>>>> + * @iommu_get: returns an IOMMU object that backs the address space. > >>>>> + * Normally this should be NULL for generic address > >>>>> + * spaces, and it's only used when there is one > >>>>> + * translation unit behind this address space. > >>>>> + */ > >>>>> +struct AddressSpaceOps { > >>>>> + IOMMUObject *(*iommu_get)(AddressSpace *as); > >>>>> +}; > >>>>> +typedef struct AddressSpaceOps AddressSpaceOps; > >>>>> + > >>>>> +/** > >>>>> * AddressSpace: describes a mapping of addresses to #MemoryRegion objects > >>>>> */ > >>>>> struct AddressSpace { > >>>>> @@ -316,6 +330,7 @@ struct AddressSpace { > >>>>> struct MemoryRegionIoeventfd *ioeventfds; > >>>>> QTAILQ_HEAD(memory_listeners_as, MemoryListener) listeners; > >>>>> QTAILQ_ENTRY(AddressSpace) address_spaces_link; > >>>>> + AddressSpaceOps as_ops; > >>>>> }; > >>>>> > >>>>> FlatView *address_space_to_flatview(AddressSpace *as); > >>>>> @@ -1988,6 +2003,13 @@ address_space_write_cached(MemoryRegionCache *cache, hwaddr addr, > >>>>> address_space_write(cache->as, cache->xlat + addr, MEMTXATTRS_UNSPECIFIED, buf, len); > >>>>> } > >>>>> > >>>>> +/** > >>>>> + * address_space_iommu_get: Get the backend IOMMU for the address space > >>>>> + * > >>>>> + * @as: the address space to fetch IOMMU from > >>>>> + */ > >>>>> +IOMMUObject *address_space_iommu_get(AddressSpace *as); > >>>>> + > >>>>> #endif > >>>>> > >>>>> #endif > >>>>> diff --git a/include/hw/core/iommu.h b/include/hw/core/iommu.h > >>>>> new file mode 100644 > >>>>> index 0000000..34387c0 > >>>>> --- /dev/null > >>>>> +++ b/include/hw/core/iommu.h > >>>>> @@ -0,0 +1,73 @@ > >>>>> +/* > >>>>> + * QEMU emulation of IOMMU logic > >>>>> + * > >>>>> + * Copyright (C) 2017 Red Hat Inc. 
> >>>>> + * > >>>>> + * Authors: Peter Xu <peterx@redhat.com>, > >>>>> + * > >>>>> + * This program is free software; you can redistribute it and/or modify > >>>>> + * it under the terms of the GNU General Public License as published by > >>>>> + * the Free Software Foundation; either version 2 of the License, or > >>>>> + * (at your option) any later version. > >>>>> + > >>>>> + * This program is distributed in the hope that it will be useful, > >>>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of > >>>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > >>>>> + * GNU General Public License for more details. > >>>>> + > >>>>> + * You should have received a copy of the GNU General Public License along > >>>>> + * with this program; if not, see <http://www.gnu.org/licenses/>. > >>>>> + */ > >>>>> + > >>>>> +#ifndef HW_CORE_IOMMU_H > >>>>> +#define HW_CORE_IOMMU_H > >>>>> + > >>>>> +#include "qemu/queue.h" > >>>>> + > >>>>> +enum IOMMUEvent { > >>>>> + IOMMU_EVENT_BIND_PASIDT, > >>>>> +}; > >>>>> +typedef enum IOMMUEvent IOMMUEvent; > >>>>> + > >>>>> +struct IOMMUEventData { > >>>>> + IOMMUEvent event; > >>>>> + uint64_t length; > >>>>> + void *data; > >>>>> +}; > >>>>> +typedef struct IOMMUEventData IOMMUEventData; > >>>>> + > >>>>> +typedef struct IOMMUNotifier IOMMUNotifier; > >>>>> + > >>>>> +typedef void (*IOMMUNotifyFn)(IOMMUNotifier *notifier, > >>>>> + IOMMUEventData *event_data); > >>>>> + > >>>>> +struct IOMMUNotifier { > >>>>> + IOMMUNotifyFn iommu_notify; > >>>>> + /* > >>>>> + * What events we are listening to. Let's allow multiple event > >>>>> + * registrations from beginning. > >>>>> + */ > >>>>> + IOMMUEvent event; > >>>>> + QLIST_ENTRY(IOMMUNotifier) node; > >>>>> +}; > >>>>> + > >>>>> +typedef struct IOMMUObject IOMMUObject; > >>>>> + > >>>>> +/* > >>>>> + * This stands for an IOMMU unit. Any translation device should have > >>>>> + * this struct inside its own structure to make sure it can leverage > >>>>> + * common IOMMU functionalities. > >>>>> + */ > >>>>> +struct IOMMUObject { > >>>>> + QLIST_HEAD(, IOMMUNotifier) iommu_notifiers; > >>>>> +}; > >>>>> + > >>>>> +void iommu_notifier_register(IOMMUObject *iommu, > >>>>> + IOMMUNotifier *n, > >>>>> + IOMMUNotifyFn fn, > >>>>> + IOMMUEvent event); > >>>>> +void iommu_notifier_unregister(IOMMUObject *iommu, > >>>>> + IOMMUNotifier *notifier); > >>>>> +void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data); > >>>>> + > >>>>> +#endif > >>>>> diff --git a/memory.c b/memory.c > >>>>> index 77fb3ef..307f665 100644 > >>>>> --- a/memory.c > >>>>> +++ b/memory.c > >>>>> @@ -235,8 +235,6 @@ struct FlatView { > >>>>> MemoryRegion *root; > >>>>> }; > >>>>> > >>>>> -typedef struct AddressSpaceOps AddressSpaceOps; > >>>>> - > >>>>> #define FOR_EACH_FLAT_RANGE(var, view) \ > >>>>> for (var = (view)->ranges; var < (view)->ranges + (view)->nr; ++var) > >>>>> > >>>>> @@ -2793,6 +2791,14 @@ static void do_address_space_destroy(AddressSpace *as) > >>>>> memory_region_unref(as->root); > >>>>> } > >>>>> > >>>>> +IOMMUObject *address_space_iommu_get(AddressSpace *as) > >>>>> +{ > >>>>> + if (!as->as_ops.iommu_get) { > >>>>> + return NULL; > >>>>> + } > >>>>> + return as->as_ops.iommu_get(as); > >>>>> +} > >>>>> + > >>>>> void address_space_destroy(AddressSpace *as) > >>>>> { > >>>>> MemoryRegion *root = as->root; > >>>> > >>>> -- > >>>> David Gibson | I'll have my music baroque, and my code > >>>> david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ > >>>> | _way_ _around_! 
> >>>> http://www.ozlabs.org/~dgibson > >>> > >>> > >>> > >> > > >
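The same distinction applies to the first-level (gVA) TLB flushes discussed above: they change no second-level mapping, so there is nothing for a MAP/UNMAP notifier to do, yet the invalidation still has to reach the host. A sketch of how the vIOMMU emulator could fan such a flush out through the posted iommu_notify() follows; IOMMU_EVENT_SVM_TLB_INV is a hypothetical addition, the series as posted only defines IOMMU_EVENT_BIND_PASIDT.

/* Hypothetical: propagate a guest first-level TLB invalidation to whoever
 * registered for it, without touching any second-level page table. */
static void example_propagate_svm_tlb_inv(IOMMUObject *iommu,
                                          void *inv_desc, uint64_t len)
{
    IOMMUEventData event_data = {
        .event  = IOMMU_EVENT_SVM_TLB_INV,  /* invented event name */
        .length = len,
        .data   = inv_desc,
    };

    iommu_notify(iommu, &event_data);
}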
On Tue, Nov 14, 2017 at 10:52:54PM +0100, Auger Eric wrote: [...] > I meant, in the current intel_iommu code, vtd_find_add_as() creates 1 > IOMMU MR and 1 AS per PCIe device, right? I think this is the trickiest point - in QEMU an IOMMU MR does not really have a 1:1 relationship to devices. For Intel it's true; for Power it's not. On Power guests, one device's DMA address space can be split into different translation windows, with each window corresponding to one IOMMU MR. So IMHO the real 1:1 mapping is between the device and its DMA address space, rather than the MRs. It's been a long time since I drafted the patches. I think at the least this should be a more general notifier mechanism compared to the current IOMMUNotifier, which is bound to IOTLB notifications only. AFAICT if we want to trap first-level translation changes, the current notifier is not even close to that interface - just see the definition of IOMMUTLBEntry: it is tailored only for MAP/UNMAP of translated addresses, nothing else. And IMHO that's why it's tightly bound to MemoryRegions, and that's the root problem. The dynamic IOMMU MR switching problem is related to this issue as well. I am not sure the current "get IOMMU object from address space" solution is the best one - maybe it's too big a scope - I think it depends on whether in the future we'll have some requirement at such a bigger scope (say, something we want to trap from the vIOMMU and deliver to the host IOMMU which may not even be device-related? I don't know). Another alternative I am thinking of is whether we can provide a per-device notifier, so it can be bound to a PCIDevice rather than MemoryRegions; then it would be in device scope. Thanks,
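For reference, the per-device alternative mentioned at the end of the mail above could look roughly like this; none of these names exist in QEMU, the sketch only illustrates the change of scope from an address-space-level IOMMUObject to the PCIDevice itself.

/* Hypothetical per-device notifier interface (device scope, not AS scope). */
void pci_device_iommu_notifier_register(PCIDevice *dev,
                                        IOMMUNotifier *n,
                                        IOMMUNotifyFn fn,
                                        IOMMUEvent event);
void pci_device_iommu_notifier_unregister(PCIDevice *dev, IOMMUNotifier *n);
void pci_device_iommu_notify(PCIDevice *dev, IOMMUEventData *event_data);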
Hi David, On Tue, Nov 14, 2017 at 11:59:34AM +1100, David Gibson wrote: > On Mon, Nov 13, 2017 at 04:28:45PM +0800, Peter Xu wrote: > > On Mon, Nov 13, 2017 at 04:56:01PM +1100, David Gibson wrote: > > > On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote: > > > > From: Peter Xu <peterx@redhat.com> > > > > > > > > AddressSpaceOps is similar to MemoryRegionOps, it's just for address > > > > spaces to store arch-specific hooks. > > > > > > > > The first hook I would like to introduce is iommu_get(). Return an > > > > IOMMUObject behind the AddressSpace. > > > > > > > > For systems that have IOMMUs, we will create a special address > > > > space per device which is different from system default address > > > > space for it (please refer to pci_device_iommu_address_space()). > > > > Normally when that happens, there will be one specific IOMMU (or > > > > say, translation unit) stands right behind that new address space. > > > > > > > > This iommu_get() fetches that guy behind the address space. Here, > > > > the guy is defined as IOMMUObject, which includes a notifier_list > > > > so far, may extend in future. Along with IOMMUObject, a new iommu > > > > notifier mechanism is introduced. It would be used for virt-svm. > > > > Also IOMMUObject can further have a IOMMUObjectOps which is similar > > > > to MemoryRegionOps. The difference is IOMMUObjectOps is not relied > > > > on MemoryRegion. > > > > > > > > Signed-off-by: Peter Xu <peterx@redhat.com> > > > > Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com> > > > > > > Hi, sorry I didn't reply to the earlier postings of this after our > > > discussion in China. I've been sick several times and very busy. > > > > > > I still don't feel like there's an adequate explanation of exactly > > > what an IOMMUObject represents. Obviously it can represent more than > > > a single translation window - since that's represented by the > > > IOMMUMR. But what exactly do all the MRs - or whatever else - that > > > are represented by the IOMMUObject have in common, from a functional > > > point of view. > > > > > > Even understanding the SVM stuff better than I did, I don't really see > > > why an AddressSpace is an obvious unit to have an IOMMUObject > > > associated with it. > > > > Here's what I thought about it: IOMMUObject was planned to be the > > abstraction of the hardware translation unit, which is a higher level > > of the translated address spaces. Say, for each PCI device, it can > > have its own translated address space. However for multiple PCI > > devices, they can be sharing the same translation unit that handles > > the translation requests from different devices. That's the case for > > Intel platforms. We introduced this IOMMUObject because sometimes we > > want to do something with that translation unit rather than a specific > > device, in which we need a general IOMMU device handle. > > Ok, but what does "hardware translation unit" mean in practice. The > guest neither knows nor cares, which bits of IOMMU translation happen > to be included in the same bundle of silicon. It only cares what the > behaviour is. What behavioural characteristics does a single > IOMMUObject have? > > > IIRC one issue left over during last time's discussion was that there > > could be more complicated IOMMU models. E.g., one device's DMA request > > can be translated nestedly by two or multiple IOMMUs, and current > > proposal cannot really handle that complicated hierachy. 
I'm just > > thinking whether we can start from a simple model (say, we don't allow > > nested IOMMUs, and actually we don't even allow multiple IOMMUs so > > far), then we can evolve from that point in the future. > > > > Also, I thought there was something you mentioned that this approach > > is not correct for Power systems, but I can't really remember the > > details... Anyways, I think this is not the only approach to solve > > the problem, and I believe any new better idea would be greatly > > welcomed as well. :) > > So, some of my initial comments were based on a misunderstanding of > what was proposed here - since discussing this with Yi at LinuxCon > Beijing, I have a better idea of what's going on. > > On POWER - or rather the "pseries" platform, which is paravirtualized. > We can have multiple vIOMMU windows (usually 2) for a single virtual On POWER, is the DMA isolation done by allocating different DMA windows to different isolation domains? And can a single isolation domain include multiple DMA windows? So with or without an IOMMU, is there only a single DMA address space shared by all the devices in the system? Is the isolation mechanism as described above? > PCI host bridge. Because of the paravirtualization, the mapping to > hardware is fuzzy, but for passthrough devices they will both be > implemented by the IOMMU built into the physical host bridge. That > isn't important to the guest, though - all operations happen at the > window level. On VT-d, with an IOMMU present, each isolation domain has its own address space. That's why we talk more at the address space level, and the IOMMU is what makes the difference. Those are the behavioural characteristics a single IOMMU translation unit has, and thus what an IOMMUObject is going to have. > > The other thing that bothers me here is the way it's attached to an > AddressSpace. My view is that the IOMMU handles AddressSpaces; a DMA address space is also an address space managed by the IOMMU. That's why we believe it is fine to associate a DMA address space with an IOMMUObject. > IIUC how SVM works, the whole point is that the device > no longer writes into a specific PCI address space. Instead, it > writes directly into a process address space. So it seems to me more > that SVM should operate at the PCI level, and disassociate the device > from the normal PCI address space entirely, rather than hooking up > something via that address space. As Peter replied, we still need the PCI address space; it would be used to build up the 2nd-level page table used in nested translation. Thanks, Yi L > > -- > David Gibson | I'll have my music baroque, and my code > david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ > | _way_ _around_! > http://www.ozlabs.org/~dgibson
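Since the argument above keeps returning to "the IOMMU standing behind a device's DMA address space", a sketch of how the VT-d emulation could wire up the proposed iommu_get() hook may help. VTDAddressSpace and iommu_state match intel_iommu.c of that time, but the embedded iommu_object field and the exact hook-up point are assumptions, not taken from the series.

/* Assumed wiring for VT-d: report the translation unit behind the
 * per-device DMA address space created in vtd_find_add_as(). */
static IOMMUObject *vtd_as_iommu_get(AddressSpace *as)
{
    VTDAddressSpace *vtd_as = container_of(as, VTDAddressSpace, as);

    return &vtd_as->iommu_state->iommu_object;  /* field name assumed */
}

/* ... and in vtd_find_add_as(), after address_space_init():
 *     vtd_dev_as->as.as_ops.iommu_get = vtd_as_iommu_get;
 */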
Sorry I've taken so long to reply, I've been super busy with other things. On Tue, Nov 14, 2017 at 11:31:00AM +0800, Peter Xu wrote: > On Tue, Nov 14, 2017 at 11:59:34AM +1100, David Gibson wrote: > > On Mon, Nov 13, 2017 at 04:28:45PM +0800, Peter Xu wrote: > > > On Mon, Nov 13, 2017 at 04:56:01PM +1100, David Gibson wrote: > > > > On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote: > > > > > From: Peter Xu <peterx@redhat.com> > > > > > > > > > > AddressSpaceOps is similar to MemoryRegionOps, it's just for address > > > > > spaces to store arch-specific hooks. > > > > > > > > > > The first hook I would like to introduce is iommu_get(). Return an > > > > > IOMMUObject behind the AddressSpace. > > > > > > > > > > For systems that have IOMMUs, we will create a special address > > > > > space per device which is different from system default address > > > > > space for it (please refer to pci_device_iommu_address_space()). > > > > > Normally when that happens, there will be one specific IOMMU (or > > > > > say, translation unit) stands right behind that new address space. > > > > > > > > > > This iommu_get() fetches that guy behind the address space. Here, > > > > > the guy is defined as IOMMUObject, which includes a notifier_list > > > > > so far, may extend in future. Along with IOMMUObject, a new iommu > > > > > notifier mechanism is introduced. It would be used for virt-svm. > > > > > Also IOMMUObject can further have a IOMMUObjectOps which is similar > > > > > to MemoryRegionOps. The difference is IOMMUObjectOps is not relied > > > > > on MemoryRegion. > > > > > > > > > > Signed-off-by: Peter Xu <peterx@redhat.com> > > > > > Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com> > > > > > > > > Hi, sorry I didn't reply to the earlier postings of this after our > > > > discussion in China. I've been sick several times and very busy. > > > > > > > > I still don't feel like there's an adequate explanation of exactly > > > > what an IOMMUObject represents. Obviously it can represent more than > > > > a single translation window - since that's represented by the > > > > IOMMUMR. But what exactly do all the MRs - or whatever else - that > > > > are represented by the IOMMUObject have in common, from a functional > > > > point of view. > > > > > > > > Even understanding the SVM stuff better than I did, I don't really see > > > > why an AddressSpace is an obvious unit to have an IOMMUObject > > > > associated with it. > > > > > > Here's what I thought about it: IOMMUObject was planned to be the > > > abstraction of the hardware translation unit, which is a higher level > > > of the translated address spaces. Say, for each PCI device, it can > > > have its own translated address space. However for multiple PCI > > > devices, they can be sharing the same translation unit that handles > > > the translation requests from different devices. That's the case for > > > Intel platforms. We introduced this IOMMUObject because sometimes we > > > want to do something with that translation unit rather than a specific > > > device, in which we need a general IOMMU device handle. > > > > Ok, but what does "hardware translation unit" mean in practice. The > > guest neither knows nor cares, which bits of IOMMU translation happen > > to be included in the same bundle of silicon. It only cares what the > > behaviour is. What behavioural characteristics does a single > > IOMMUObject have? 
> > In VT-d (I believe the same to ARM SMMUs), IMHO the special thing is > that the translation windows (and device address spaces in QEMU) are > only talking about second level translations, but not first level, > while virt-svm needs to play with first level translations. Until > now, AFAIU we don't really have a good interface for first level > translations at all (aka. the process address space). Ok, that explains why you need some kind of different model than the existing one, but that wasn't really disputed. It still doesn't say why separating the unclearly defined IOMMUObject from the IOMMU MR is a good way of modelling this. If the issue is 1st vs. 2nd level translation, it really seems you should be explicitly modelling the different 1st level vs. 2nd level address spaces, rather than just splitting functions between the MR and AS level with no clear rationale behind what goes where. > > > IIRC one issue left over during last time's discussion was that there > > > could be more complicated IOMMU models. E.g., one device's DMA request > > > can be translated nestedly by two or multiple IOMMUs, and current > > > proposal cannot really handle that complicated hierachy. I'm just > > > thinking whether we can start from a simple model (say, we don't allow > > > nested IOMMUs, and actually we don't even allow multiple IOMMUs so > > > far), then we can evolve from that point in the future. > > > > > > Also, I thought there were something you mentioned that this approach > > > is not correct for Power systems, but I can't really remember the > > > details... Anyways, I think this is not the only approach to solve > > > the problem, and I believe any new better idea would be greatly > > > welcomed as well. :) > > > > So, some of my initial comments were based on a misunderstanding of > > what was proposed here - since discussing this with Yi at LinuxCon > > Beijing, I have a better idea of what's going on. > > > > On POWER - or rather the "pseries" platform, which is paravirtualized. > > We can have multiple vIOMMU windows (usually 2) for a single virtual > > PCI host bridge. Because of the paravirtualization, the mapping to > > hardware is fuzzy, but for passthrough devices they will both be > > implemented by the IOMMU built into the physical host bridge. That > > isn't importat to the guest, though - all operations happen at the > > window level. > > Now I know that for Power it may not have anything like a "translation > unit" but everything is defined as "translation windows" in the > guests. However the problem still exist for some other platforms. > Say, for Intel we have emulated VT-d; for ARM, we have vSMMU. AFAIU > these platforms do have their translation units, and even for ARM it > should need such an interface (or any better interfaces, which are > always welcomed) for virt-svm to work. Otherwise I don't know a way > to configure the first level translation tables. > > Meanwhile, IMO this abstraction should not really affect pseries - it > should be only useful for those platforms who would like to use it. > For pseries, we can just ignore that new interface if we don't really > even have such a translation unit. But it's _still_ not clear to me what a "translation unit" means. What is common across a translation unit that is not common across different translation units? > > > > > The other thing that bothers me here is the way it's attached to an > > AddressSpace. IIUC how SVM works, the whole point is that the device > > no longer writes into a specific PCI address space. 
Instead, it > > writes directly into a process address space. So it seems to me more > > that SVM should operate at the PCI level, and disassociate the device > > from the normal PCI address space entirely, rather than hooking up > > something via that address space. > > IMO the PCI address space is still used. For virt-svm, host IOMMU > will be working in nested translation mode, so we should be having two > mappings working in parallel: > > 1. DPDK process (in guest) address space mapping (GVA -> GPA) > 2. guest direct memory mappings (GPA -> HPA) > > And here AFAIU the 2nd mapping is working exactly like general PCI > devices, the only difference is that the 2nd level mapping is always > static, just like when IOMMU passthrough is enabled for that device. Ok. IIUC the 2nd level mapping isn't visible to the guest, is that right? But this doesn't really affect my point - from the guest's point of view, the "usual" address space for a PCI device is equal to the GPA (i.e. no guest visible translation), but for SVM devices they instead use an SVM address space depending on the process id, which is not the same as the GPA space. > So, IMHO virt-SVM is not really in parallel with PCI subsystem. Well, it's not clear to me if it's in parallel to, or on top of. But the crucial point is that an SVM device does not access the "main" PCI address space (whether or not that has a traditional IOMMU). Instead it has access to a bunch of different address spaces, one for each process ID. > For > the SVM in guest, it may be different, since it should only be using > first level translations. However to implement virt-SVM, IMHO we not > only need existing PCI address space translation logic, we also need > an extra way to configure the first level mappings, as discussed. Right. I'm not disputing that a way to configure those first level mappings is necessary. The proposed model just doesn't seem a good fit. It still only represents a single "PCI address space", then attaches things about the process table mapping to that, when in fact they affect the process table and therefore *different* address spaces from that which the methods are attached to.
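A purely hypothetical interface sketch of the modelling argued for above: an SVM transaction would resolve its target address space from the PASID instead of using the device's "usual" PCI address space. Nothing below exists in QEMU.

/* Hypothetical: one AddressSpace per (device, PASID) pair for SVM traffic. */
AddressSpace *pci_device_svm_address_space(PCIDevice *dev, uint32_t pasid);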
On Thu, Nov 16, 2017 at 04:57:09PM +0800, Liu, Yi L wrote: > Hi David, > > On Tue, Nov 14, 2017 at 11:59:34AM +1100, David Gibson wrote: > > On Mon, Nov 13, 2017 at 04:28:45PM +0800, Peter Xu wrote: > > > On Mon, Nov 13, 2017 at 04:56:01PM +1100, David Gibson wrote: > > > > On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote: > > > > > From: Peter Xu <peterx@redhat.com> > > > > > > > > > > AddressSpaceOps is similar to MemoryRegionOps, it's just for address > > > > > spaces to store arch-specific hooks. > > > > > > > > > > The first hook I would like to introduce is iommu_get(). Return an > > > > > IOMMUObject behind the AddressSpace. > > > > > > > > > > For systems that have IOMMUs, we will create a special address > > > > > space per device which is different from system default address > > > > > space for it (please refer to pci_device_iommu_address_space()). > > > > > Normally when that happens, there will be one specific IOMMU (or > > > > > say, translation unit) stands right behind that new address space. > > > > > > > > > > This iommu_get() fetches that guy behind the address space. Here, > > > > > the guy is defined as IOMMUObject, which includes a notifier_list > > > > > so far, may extend in future. Along with IOMMUObject, a new iommu > > > > > notifier mechanism is introduced. It would be used for virt-svm. > > > > > Also IOMMUObject can further have a IOMMUObjectOps which is similar > > > > > to MemoryRegionOps. The difference is IOMMUObjectOps is not relied > > > > > on MemoryRegion. > > > > > > > > > > Signed-off-by: Peter Xu <peterx@redhat.com> > > > > > Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com> > > > > > > > > Hi, sorry I didn't reply to the earlier postings of this after our > > > > discussion in China. I've been sick several times and very busy. > > > > > > > > I still don't feel like there's an adequate explanation of exactly > > > > what an IOMMUObject represents. Obviously it can represent more than > > > > a single translation window - since that's represented by the > > > > IOMMUMR. But what exactly do all the MRs - or whatever else - that > > > > are represented by the IOMMUObject have in common, from a functional > > > > point of view. > > > > > > > > Even understanding the SVM stuff better than I did, I don't really see > > > > why an AddressSpace is an obvious unit to have an IOMMUObject > > > > associated with it. > > > > > > Here's what I thought about it: IOMMUObject was planned to be the > > > abstraction of the hardware translation unit, which is a higher level > > > of the translated address spaces. Say, for each PCI device, it can > > > have its own translated address space. However for multiple PCI > > > devices, they can be sharing the same translation unit that handles > > > the translation requests from different devices. That's the case for > > > Intel platforms. We introduced this IOMMUObject because sometimes we > > > want to do something with that translation unit rather than a specific > > > device, in which we need a general IOMMU device handle. > > > > Ok, but what does "hardware translation unit" mean in practice. The > > guest neither knows nor cares, which bits of IOMMU translation happen > > to be included in the same bundle of silicon. It only cares what the > > behaviour is. What behavioural characteristics does a single > > IOMMUObject have? > > > > > IIRC one issue left over during last time's discussion was that there > > > could be more complicated IOMMU models. 
E.g., one device's DMA request > > > can be translated nestedly by two or multiple IOMMUs, and current > > > proposal cannot really handle that complicated hierachy. I'm just > > > thinking whether we can start from a simple model (say, we don't allow > > > nested IOMMUs, and actually we don't even allow multiple IOMMUs so > > > far), then we can evolve from that point in the future. > > > > > > Also, I thought there were something you mentioned that this approach > > > is not correct for Power systems, but I can't really remember the > > > details... Anyways, I think this is not the only approach to solve > > > the problem, and I believe any new better idea would be greatly > > > welcomed as well. :) > > > > So, some of my initial comments were based on a misunderstanding of > > what was proposed here - since discussing this with Yi at LinuxCon > > Beijing, I have a better idea of what's going on. > > > > On POWER - or rather the "pseries" platform, which is paravirtualized. > > We can have multiple vIOMMU windows (usually 2) for a single virtual > > On POWER, the DMA isolation is done by allocating different DMA window > to different isolation domains? And a single isolation domain may include > multiple dma windows? So with or withou IOMMU, there is only a single > DMA address shared by all the devices in the system? The isolation > mechanism is as what described above? No, the multiple windows are completely unrelated to how things are isolated. Just like on x86, each IOMMU domain has independent IOMMU mappings. The only difference is that IBM calls the domains "partitionable endpoints" (PEs) and they tend to be statically created at boot time, rather than runtime generated. The windows are about what addresses in PCI space are translated by the IOMMU. If the device generates a PCI cycle, only certain addresses will be mapped by the IOMMU to DMA - other addresses will correspond to other devices MMIOs, MSI vectors, maybe other things. The set of addresses translated by the IOMMU need not be contiguous. Or, there could be two IOMMUs on the bus, each accepting different address ranges. These two situations are not distinguishable from the guest's point of view. So for a typical PAPR setup, the device can access system RAM either via DMA in the range 0..1GiB (the "32-bit window") or in the range 2^59..2^59+<something> (the "64-bit window"). Typically the 32-bit window has mappings dynamically created by drivers, and the 64-bit window has all of system RAM mapped 1:1, but that's entirely up to the OS, it can map each window however it wants. 32-bit devices (or "64 bit" devices which don't actually implement enough the address bits) will only be able to use the 32-bit window, of course. MMIOs of other devices, the "magic" MSI-X addresses belonging to the host bridge and other things exist outside those ranges. Those are just the ranges which are used to DMA to RAM. Each PE (domain) can see a different version of what's in each window. In fact, if I understand the "IO hole" correctly, the situation on x86 isn't very different. It has a window below the IO hole and a second window above the IO hole. The addresses within the IO hole go to (32-bit) devices on the PCI bus, rather than being translated by the IOMMU to RAM addresses. Because the gap is smaller between the two windows, I think we get away without really modelling this detail in qemu though. > > PCI host bridge. 
Because of the paravirtualization, the mapping to > > hardware is fuzzy, but for passthrough devices they will both be > > implemented by the IOMMU built into the physical host bridge. That > > isn't importat to the guest, though - all operations happen at the > > window level. > > On VT-d, with IOMMU presented, each isolation domain has its own address > space. That's why we talked more on address space level. And iommu makes > the difference. That's the behavioural characteristics a single iommu > translation unit has. And thus an IOMMUObject going to have. Right, that's the same on POWER. But the IOMMU only translates *some* addresses within the address space, not all of them. The rest will go to other PCI devices or be unmapped, but won't go to RAM. That's why the IOMMU should really be associated with an MR (or several MRs), not an AddressSpace, it only translates some addresses. > > The other thing that bothers me here is the way it's attached to an > > AddressSpace. > > My consideration is iommu handles AddressSpaces. dma address space is also > an address space managed by iommu. No, it's not. It's a region (or several) within the overall PCI address space. Other things in the addressspace, such as other device's BARs exist independent of the IOMMU. It's not something that could really work with PCI-E, I think, but with a more traditional PCI bus there's no reason you couldn't have multiple IOMMUs listening on different regions of the PCI address space. > That's why we believe it is fine to > associate dma address space with an IOMMUObject. > > IIUC how SVM works, the whole point is that the device > > no longer writes into a specific PCI address space. Instead, it > > writes directly into a process address space. So it seems to me more > > that SVM should operate at the PCI level, and disassociate the device > > from the normal PCI address space entirely, rather than hooking up > > something via that address space. > > As Peter replied, we still need the PCI address space, it would be used > to build up the 2nd level page table which would be used in nested > translation. > > Thanks, > Yi L > > > >
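The two-window layout described above is, roughly, what QEMU's spapr PHB code already does (simplified, and the variable names are approximate): each window is its own IOMMU MR mapped at its bus offset inside the PCI address space root, and addresses outside the windows are simply not IOMMU-translated.

/* Simplified excerpt-style sketch of the PAPR window setup. */
memory_region_add_subregion(&sphb->iommu_root, 0,
                            spapr_tce_get_iommu(tcet_32bit));  /* 0..1 GiB */
memory_region_add_subregion(&sphb->iommu_root, 1ULL << 59,
                            spapr_tce_get_iommu(tcet_64bit));  /* 64-bit window */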
On Tue, Nov 14, 2017 at 09:53:07AM +0100, Auger Eric wrote: > Hi Yi L, > > On 13/11/2017 10:58, Liu, Yi L wrote: > > On Mon, Nov 13, 2017 at 04:56:01PM +1100, David Gibson wrote: > >> On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote: > >>> From: Peter Xu <peterx@redhat.com> > >>> > >>> AddressSpaceOps is similar to MemoryRegionOps, it's just for address > >>> spaces to store arch-specific hooks. > >>> > >>> The first hook I would like to introduce is iommu_get(). Return an > >>> IOMMUObject behind the AddressSpace. > >>> > >>> For systems that have IOMMUs, we will create a special address > >>> space per device which is different from system default address > >>> space for it (please refer to pci_device_iommu_address_space()). > >>> Normally when that happens, there will be one specific IOMMU (or > >>> say, translation unit) stands right behind that new address space. > >>> > >>> This iommu_get() fetches that guy behind the address space. Here, > >>> the guy is defined as IOMMUObject, which includes a notifier_list > >>> so far, may extend in future. Along with IOMMUObject, a new iommu > >>> notifier mechanism is introduced. It would be used for virt-svm. > >>> Also IOMMUObject can further have a IOMMUObjectOps which is similar > >>> to MemoryRegionOps. The difference is IOMMUObjectOps is not relied > >>> on MemoryRegion. > >>> > >>> Signed-off-by: Peter Xu <peterx@redhat.com> > >>> Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com> > >> > >> Hi, sorry I didn't reply to the earlier postings of this after our > >> discussion in China. I've been sick several times and very busy. > > > > Hi David, > > > > Fully understood. I'll try my best to address your question. Also, > > feel free to input further questions, anyhow, the more we discuss the > > better work we done. > > > >> I still don't feel like there's an adequate explanation of exactly > >> what an IOMMUObject represents. Obviously it can represent more than > > > > IOMMUObject is aimed to represent the iommu itself. e.g. the iommu > > specific operations. One of the key purpose of IOMMUObject is to > > introduce a notifier framework to let iommu emulator to be able to > > do iommu operations other than MAP/UNMAP. As IOMMU grows more and > > more feature, MAP/UNMAP is not the only operation iommu emulator needs > > to deal. e.g. shared virtual memory. So far, as I know AMD/ARM also > > has it. may correct me on it. As my cover letter mentioned, MR based > > notifier framework doesn’t work for the newly added IOMMU operations. > > Like bind guest pasid table pointer to host and propagate guest's > > iotlb flush to host. > > > >> a single translation window - since that's represented by the > >> IOMMUMR. But what exactly do all the MRs - or whatever else - that > >> are represented by the IOMMUObject have in common, from a functional > >> point of view. > > > > Let me take virt-SVM as an example. As far as I know, for virt-SVM, > > the implementation of different vendors are similar. The key design > > is to have a nested translation(aka. two stage translation). It is to > > have guest maintain gVA->gPA mapping and hypervisor builds gPA->hPA > > mapping. Similar to EPT based virt-MMU solution. > > > > In Qemu, gPA->hPA mapping is done through MAP/UNMAP notifier, it can > > keep going. But for gVA->gPA mapping, only guest knows it, so hypervisor > > needs to trap specific guest iommu operation and pass the gVA->gPA > > mapping knowledge to host through a notifier(newly added one). 
In VT-d, > > it is called bind guest pasid table to host. > > What I don't get is the PASID table is per extended context entry. I > understand this latter is indexed by PCI device function. And today MR > are created per PCIe device if I am not wrong. So why can't we have 1 > new MR notifier dedicated to PASID table passing? My understanding is > the MR, having a 1-1 correspondence with a PCIe device and thus a > context could be of right granularity. Not really. The MR(s) and AS is created per a group of devices which will always see the same mappings. On Intel that's the IOMMU domain. On PAPR that's a partitionable-endpoint - except that we choose to only have one PE per guest host bridge (but multiple host bridges is standard for POWER). There's a qemu hook to get the right AS for a device, which takes the devfn as a parameter. Depending on the host bridge implementation, though, it won't necessary return a different AS for every device though. > Then I understand the only flags > we currently have are NONE, MAP and UNMAP but couldn't we add a new one > for PASID TABLE passing? So this is not crystal clear to me why MR > notifiers are not adapted to PASID table passing. Right, to me either. Things get more complicated if both the 1st level (per PASID) and 2nd level translations (per PCI RID) are visible to the guest. Having level 1 owned by the guest and 2nd level owned by the host is the typical mode of operation, but if we want to model bare metal machines we do need to handle the case of both. Similarly, implementing virt-SVM can't go and break our modelling of "traditional" non-PASID aware IOMMUs. Those are not usually present in x86 guests, although they can be, and they are *always* present for PAPR guests. > > Also, for the gVA iotlb flushing, only guest knows it. So hypervisor > > needs to propagate it to host. Here, MAP/UNMAP is not suitable since > > this gVA iotlb flush here doesn’t require to modify host iommu > > translation table. > I don't really get this argument. IOMMUNotifier just is a notifier that > is attached to an IOMMU MR and calls a an IOMMUNotify function, right? > Then the role of the function currently is attached to the currently > existing flags, MAP, UNMAP. This is not linked to an action on the > physical IOMMU, right? Maybe, maybe not. In the case of emulated devices, it need not touch the host MMU. However, for the case of VFIO devices, we need to mirror mappings in the guest IOMMU to the host IOMMU.
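To make the payload mismatch concrete, here is a minimal, self-contained sketch (simplified stand-ins, not the real QEMU structures): the first struct approximates what the existing IOMMUTLBEntry-based MAP/UNMAP notifications carry, the second is a made-up example of what a "bind guest PASID table" operation would have to carry instead.

/* Illustrative sketch only -- simplified stand-ins, not QEMU structures. */
#include <stdint.h>

/* What the existing MR notifiers deliver: one guest-IOVA -> guest-PA
 * translation being added or removed.  For VFIO devices these are
 * mirrored into the host IOMMU. */
typedef struct map_unmap_event {
    uint64_t iova;            /* start of the affected IOVA range */
    uint64_t translated_addr; /* guest physical address it now maps to */
    uint64_t addr_mask;       /* range size - 1 */
    int      perm;            /* none / read / write */
} map_unmap_event;

/* What a "bind guest PASID table" operation needs to hand to the host:
 * no IOVA range at all, just where the guest keeps its per-process
 * (first-level) tables and in which format. */
typedef struct pasid_table_event {
    uint64_t pasid_table_gpa; /* guest physical address of the PASID table */
    uint32_t pasid_bits;      /* table size */
    uint32_t format;          /* vendor-specific layout identifier */
} pasid_table_event;

Both kinds of event could in principle travel over some notifier list, but the payloads share nothing, which is the mismatch Peter and Yi point at when they say the existing MR notifier interface is tailored to MAP/UNMAP.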
On Mon, Dec 18, 2017 at 05:14:42PM +1100, David Gibson wrote: > On Thu, Nov 16, 2017 at 04:57:09PM +0800, Liu, Yi L wrote: > > Hi David, > > > > On Tue, Nov 14, 2017 at 11:59:34AM +1100, David Gibson wrote: > > > On Mon, Nov 13, 2017 at 04:28:45PM +0800, Peter Xu wrote: > > > > On Mon, Nov 13, 2017 at 04:56:01PM +1100, David Gibson wrote: > > > > > On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote: > > > > > > From: Peter Xu <peterx@redhat.com> > > > > > > > > > > > > AddressSpaceOps is similar to MemoryRegionOps, it's just for address > > > > > > spaces to store arch-specific hooks. > > > > > > > > > > > > The first hook I would like to introduce is iommu_get(). Return an > > > > > > IOMMUObject behind the AddressSpace. > > > > > > > > > > > > For systems that have IOMMUs, we will create a special address > > > > > > space per device which is different from system default address > > > > > > space for it (please refer to pci_device_iommu_address_space()). > > > > > > Normally when that happens, there will be one specific IOMMU (or > > > > > > say, translation unit) stands right behind that new address space. > > > > > > > > > > > > This iommu_get() fetches that guy behind the address space. Here, > > > > > > the guy is defined as IOMMUObject, which includes a notifier_list > > > > > > so far, may extend in future. Along with IOMMUObject, a new iommu > > > > > > notifier mechanism is introduced. It would be used for virt-svm. > > > > > > Also IOMMUObject can further have a IOMMUObjectOps which is similar > > > > > > to MemoryRegionOps. The difference is IOMMUObjectOps is not relied > > > > > > on MemoryRegion. > > > > > > > > > > > > Signed-off-by: Peter Xu <peterx@redhat.com> > > > > > > Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com> > > > > > > > > > > Hi, sorry I didn't reply to the earlier postings of this after our > > > > > discussion in China. I've been sick several times and very busy. > > > > > > > > > > I still don't feel like there's an adequate explanation of exactly > > > > > what an IOMMUObject represents. Obviously it can represent more than > > > > > a single translation window - since that's represented by the > > > > > IOMMUMR. But what exactly do all the MRs - or whatever else - that > > > > > are represented by the IOMMUObject have in common, from a functional > > > > > point of view. > > > > > > > > > > Even understanding the SVM stuff better than I did, I don't really see > > > > > why an AddressSpace is an obvious unit to have an IOMMUObject > > > > > associated with it. > > > > > > > > Here's what I thought about it: IOMMUObject was planned to be the > > > > abstraction of the hardware translation unit, which is a higher level > > > > of the translated address spaces. Say, for each PCI device, it can > > > > have its own translated address space. However for multiple PCI > > > > devices, they can be sharing the same translation unit that handles > > > > the translation requests from different devices. That's the case for > > > > Intel platforms. We introduced this IOMMUObject because sometimes we > > > > want to do something with that translation unit rather than a specific > > > > device, in which we need a general IOMMU device handle. > > > > > > Ok, but what does "hardware translation unit" mean in practice. The > > > guest neither knows nor cares, which bits of IOMMU translation happen > > > to be included in the same bundle of silicon. It only cares what the > > > behaviour is. What behavioural characteristics does a single > > > IOMMUObject have? 
> > > > > > > IIRC one issue left over during last time's discussion was that there > > > > could be more complicated IOMMU models. E.g., one device's DMA request > > > > can be translated nestedly by two or multiple IOMMUs, and current > > > > proposal cannot really handle that complicated hierachy. I'm just > > > > thinking whether we can start from a simple model (say, we don't allow > > > > nested IOMMUs, and actually we don't even allow multiple IOMMUs so > > > > far), then we can evolve from that point in the future. > > > > > > > > Also, I thought there were something you mentioned that this approach > > > > is not correct for Power systems, but I can't really remember the > > > > details... Anyways, I think this is not the only approach to solve > > > > the problem, and I believe any new better idea would be greatly > > > > welcomed as well. :) > > > > > > So, some of my initial comments were based on a misunderstanding of > > > what was proposed here - since discussing this with Yi at LinuxCon > > > Beijing, I have a better idea of what's going on. > > > > > > On POWER - or rather the "pseries" platform, which is paravirtualized. > > > We can have multiple vIOMMU windows (usually 2) for a single virtual > > > > On POWER, the DMA isolation is done by allocating different DMA window > > to different isolation domains? And a single isolation domain may include > > multiple dma windows? So with or withou IOMMU, there is only a single > > DMA address shared by all the devices in the system? The isolation > > mechanism is as what described above? > > No, the multiple windows are completely unrelated to how things are > isolated. I'm afraid I chose a wrong word by using "DMA window".. Actually, when mentioning "DMA window", I mean address ranges in an iova address space. Anyhow, let me re-shape my understanding of POWER IOMMU and make sure we are in the same page. > > Just like on x86, each IOMMU domain has independent IOMMU mappings. > The only difference is that IBM calls the domains "partitionable > endpoints" (PEs) and they tend to be statically created at boot time, > rather than runtime generated. Does POWER IOMMU also have iova concept? Device can use an iova to access memory, and IOMMU translates the iova to an address within the system physical address? > > The windows are about what addresses in PCI space are translated by > the IOMMU. If the device generates a PCI cycle, only certain > addresses will be mapped by the IOMMU to DMA - other addresses will > correspond to other devices MMIOs, MSI vectors, maybe other things. I guess the windows you mentioned here is the address ranges within the system physical address space as you also mentioned MMIOs and etc. > The set of addresses translated by the IOMMU need not be contiguous. I suppose you mean the output addresses of the IOMMU need not be contiguous? > Or, there could be two IOMMUs on the bus, each accepting different > address ranges. These two situations are not distinguishable from the > guest's point of view. > > So for a typical PAPR setup, the device can access system RAM either > via DMA in the range 0..1GiB (the "32-bit window") or in the range > 2^59..2^59+<something> (the "64-bit window"). Typically the 32-bit > window has mappings dynamically created by drivers, and the 64-bit > window has all of system RAM mapped 1:1, but that's entirely up to the > OS, it can map each window however it wants. 
> > 32-bit devices (or "64 bit" devices which don't actually implement > enough the address bits) will only be able to use the 32-bit window, > of course. > > MMIOs of other devices, the "magic" MSI-X addresses belonging to the > host bridge and other things exist outside those ranges. Those are > just the ranges which are used to DMA to RAM. > > Each PE (domain) can see a different version of what's in each > window. If I'm correct so far. PE actually defines a mapping between an address range of an address space(aka. iova address space) and an address range of the system physical address space. Then my question is: does each PE define a separate iova address sapce which is flat from 0 - 2^AW -1, AW is address width? As a reference, VT-d domain defines a flat address space for each domain. > > In fact, if I understand the "IO hole" correctly, the situation on x86 > isn't very different. It has a window below the IO hole and a second > window above the IO hole. The addresses within the IO hole go to > (32-bit) devices on the PCI bus, rather than being translated by the If you mean the "IO hole" within the system physcial address space, I think it's yes. > IOMMU to RAM addresses. Because the gap is smaller between the two > windows, I think we get away without really modelling this detail in > qemu though. > > > > PCI host bridge. Because of the paravirtualization, the mapping to > > > hardware is fuzzy, but for passthrough devices they will both be > > > implemented by the IOMMU built into the physical host bridge. That > > > isn't importat to the guest, though - all operations happen at the > > > window level. > > > > On VT-d, with IOMMU presented, each isolation domain has its own address > > space. That's why we talked more on address space level. And iommu makes > > the difference. That's the behavioural characteristics a single iommu > > translation unit has. And thus an IOMMUObject going to have. > > Right, that's the same on POWER. But the IOMMU only translates *some* > addresses within the address space, not all of them. The rest will go > to other PCI devices or be unmapped, but won't go to RAM. > > That's why the IOMMU should really be associated with an MR (or > several MRs), not an AddressSpace, it only translates some addresses. If I'm correct so far, I do believe the major difference between VT-d and POWER IOMMU is that VT-d isolation domain is a flat address space while PE of POWER is something different(need your input here as I'm not sure about it). Maybe it's like there is a flat address space, each PE takes some address ranges and maps the address ranges to different system physcial address ranges. > > > > The other thing that bothers me here is the way it's attached to an > > > AddressSpace. > > > > My consideration is iommu handles AddressSpaces. dma address space is also > > an address space managed by iommu. > > No, it's not. It's a region (or several) within the overall PCI > address space. Other things in the addressspace, such as other > device's BARs exist independent of the IOMMU. > > It's not something that could really work with PCI-E, I think, but > with a more traditional PCI bus there's no reason you couldn't have > multiple IOMMUs listening on different regions of the PCI address > space. I think the point here is on POWER, the input addresses of IOMMUs are actaully in the same address space? What IOMMU does is mapping the different ranges to different system physcial address ranges. 
So it's as you mentioned, multiple IOMMUs listen on different regions of a PCI address space. While for VT-d, it's not the case. The input addresses of IOMMUs may not in the same address sapce. As I mentioned, each IOMMU domain on VT-d is a separate address space. So for VT-d, IOMMUs are actually listening to different address spaces. That's the point why we want to have address space level abstraction of IOMMU. > > > That's why we believe it is fine to > > associate dma address space with an IOMMUObject. > > > > IIUC how SVM works, the whole point is that the device > > > no longer writes into a specific PCI address space. Instead, it > > > writes directly into a process address space. So it seems to me more > > > that SVM should operate at the PCI level, and disassociate the device > > > from the normal PCI address space entirely, rather than hooking up > > > something via that address space. > > > > As Peter replied, we still need the PCI address space, it would be used > > to build up the 2nd level page table which would be used in nested > > translation. > > > > Thanks, > > Yi L > > > > > > > > > -- > David Gibson | I'll have my music baroque, and my code > david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ > | _way_ _around_! > http://www.ozlabs.org/~dgibson Regards, Yi L
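A compact, self-contained sketch of the VT-d-style model Yi describes (hypothetical and heavily simplified, not VT-d emulation code): the (bus, devfn) selects a context entry, the context entry selects a domain, and every in-range IOVA of that domain goes through the same per-domain second-level table.

/* Illustrative sketch only -- hypothetical, heavily simplified. */
#include <stdbool.h>
#include <stdint.h>

typedef struct vtd_domain {
    uint16_t  domain_id;
    uint64_t *second_level_pt;   /* translates IOVA 0 .. 2^AW - 1 */
    unsigned  address_width;
} vtd_domain;

typedef struct vtd_context_entry {
    vtd_domain *domain;          /* unset if the device has no domain yet */
} vtd_context_entry;

/* 256 buses x 256 devfn, as indexed by the root/context tables. */
static vtd_context_entry context_table[256][256];

static bool vtd_translate(uint8_t bus, uint8_t devfn,
                          uint64_t iova, uint64_t *gpa)
{
    vtd_domain *dom = context_table[bus][devfn].domain;

    if (!dom || iova >= (1ULL << dom->address_width)) {
        return false;
    }
    /* Real hardware walks a multi-level table; a flat array stands in
     * for it here just to show that every in-range IOVA goes through
     * the same per-domain table. */
    *gpa = dom->second_level_pt[iova >> 12] | (iova & 0xfff);
    return true;
}

Compared with the PAPR sketch earlier, the translated region here is one flat per-domain range rather than a couple of windows carved out of a shared layout; whether that amounts to a real structural difference is exactly what David questions below.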
On Mon, Dec 18, 2017 at 05:17:35PM +0800, Liu, Yi L wrote: > On Mon, Dec 18, 2017 at 05:14:42PM +1100, David Gibson wrote: > > On Thu, Nov 16, 2017 at 04:57:09PM +0800, Liu, Yi L wrote: > > > Hi David, > > > > > > On Tue, Nov 14, 2017 at 11:59:34AM +1100, David Gibson wrote: > > > > On Mon, Nov 13, 2017 at 04:28:45PM +0800, Peter Xu wrote: > > > > > On Mon, Nov 13, 2017 at 04:56:01PM +1100, David Gibson wrote: > > > > > > On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote: > > > > > > > From: Peter Xu <peterx@redhat.com> > > > > > > > > > > > > > > AddressSpaceOps is similar to MemoryRegionOps, it's just for address > > > > > > > spaces to store arch-specific hooks. > > > > > > > > > > > > > > The first hook I would like to introduce is iommu_get(). Return an > > > > > > > IOMMUObject behind the AddressSpace. > > > > > > > > > > > > > > For systems that have IOMMUs, we will create a special address > > > > > > > space per device which is different from system default address > > > > > > > space for it (please refer to pci_device_iommu_address_space()). > > > > > > > Normally when that happens, there will be one specific IOMMU (or > > > > > > > say, translation unit) stands right behind that new address space. > > > > > > > > > > > > > > This iommu_get() fetches that guy behind the address space. Here, > > > > > > > the guy is defined as IOMMUObject, which includes a notifier_list > > > > > > > so far, may extend in future. Along with IOMMUObject, a new iommu > > > > > > > notifier mechanism is introduced. It would be used for virt-svm. > > > > > > > Also IOMMUObject can further have a IOMMUObjectOps which is similar > > > > > > > to MemoryRegionOps. The difference is IOMMUObjectOps is not relied > > > > > > > on MemoryRegion. > > > > > > > > > > > > > > Signed-off-by: Peter Xu <peterx@redhat.com> > > > > > > > Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com> > > > > > > > > > > > > Hi, sorry I didn't reply to the earlier postings of this after our > > > > > > discussion in China. I've been sick several times and very busy. > > > > > > > > > > > > I still don't feel like there's an adequate explanation of exactly > > > > > > what an IOMMUObject represents. Obviously it can represent more than > > > > > > a single translation window - since that's represented by the > > > > > > IOMMUMR. But what exactly do all the MRs - or whatever else - that > > > > > > are represented by the IOMMUObject have in common, from a functional > > > > > > point of view. > > > > > > > > > > > > Even understanding the SVM stuff better than I did, I don't really see > > > > > > why an AddressSpace is an obvious unit to have an IOMMUObject > > > > > > associated with it. > > > > > > > > > > Here's what I thought about it: IOMMUObject was planned to be the > > > > > abstraction of the hardware translation unit, which is a higher level > > > > > of the translated address spaces. Say, for each PCI device, it can > > > > > have its own translated address space. However for multiple PCI > > > > > devices, they can be sharing the same translation unit that handles > > > > > the translation requests from different devices. That's the case for > > > > > Intel platforms. We introduced this IOMMUObject because sometimes we > > > > > want to do something with that translation unit rather than a specific > > > > > device, in which we need a general IOMMU device handle. > > > > > > > > Ok, but what does "hardware translation unit" mean in practice. 
The > > > > guest neither knows nor cares, which bits of IOMMU translation happen > > > > to be included in the same bundle of silicon. It only cares what the > > > > behaviour is. What behavioural characteristics does a single > > > > IOMMUObject have? > > > > > > > > > IIRC one issue left over during last time's discussion was that there > > > > > could be more complicated IOMMU models. E.g., one device's DMA request > > > > > can be translated nestedly by two or multiple IOMMUs, and current > > > > > proposal cannot really handle that complicated hierachy. I'm just > > > > > thinking whether we can start from a simple model (say, we don't allow > > > > > nested IOMMUs, and actually we don't even allow multiple IOMMUs so > > > > > far), then we can evolve from that point in the future. > > > > > > > > > > Also, I thought there were something you mentioned that this approach > > > > > is not correct for Power systems, but I can't really remember the > > > > > details... Anyways, I think this is not the only approach to solve > > > > > the problem, and I believe any new better idea would be greatly > > > > > welcomed as well. :) > > > > > > > > So, some of my initial comments were based on a misunderstanding of > > > > what was proposed here - since discussing this with Yi at LinuxCon > > > > Beijing, I have a better idea of what's going on. > > > > > > > > On POWER - or rather the "pseries" platform, which is paravirtualized. > > > > We can have multiple vIOMMU windows (usually 2) for a single virtual > > > > > > On POWER, the DMA isolation is done by allocating different DMA window > > > to different isolation domains? And a single isolation domain may include > > > multiple dma windows? So with or withou IOMMU, there is only a single > > > DMA address shared by all the devices in the system? The isolation > > > mechanism is as what described above? > > > > No, the multiple windows are completely unrelated to how things are > > isolated. > > I'm afraid I chose a wrong word by using "DMA window".. > Actually, when mentioning "DMA window", I mean address ranges in an iova > address space. Yes, so did I. My one window I mean one contiguous range of IOVA addresses. > Anyhow, let me re-shape my understanding of POWER IOMMU and > make sure we are in the same page. > > > > > Just like on x86, each IOMMU domain has independent IOMMU mappings. > > The only difference is that IBM calls the domains "partitionable > > endpoints" (PEs) and they tend to be statically created at boot time, > > rather than runtime generated. > > Does POWER IOMMU also have iova concept? Device can use an iova to > access memory, and IOMMU translates the iova to an address within the > system physical address? Yes. When I say the "PCI address space" I mean the IOVA space. > > The windows are about what addresses in PCI space are translated by > > the IOMMU. If the device generates a PCI cycle, only certain > > addresses will be mapped by the IOMMU to DMA - other addresses will > > correspond to other devices MMIOs, MSI vectors, maybe other things. > > I guess the windows you mentioned here is the address ranges within the > system physical address space as you also mentioned MMIOs and etc. No. I mean ranges within the PCI space == IOVA space. It's simplest to understand with traditional PCI. A cycle on the bus doesn't know whether the destination is a device or memory, it just has an address - a PCI memory address. Part of that address range is mapped to system RAM, optionally with an IOMMU translating it. 
Other parts of that address space are used for devices. With PCI-E things get more complicated, but the conceptual model is the same. > > The set of addresses translated by the IOMMU need not be contiguous. > > I suppose you mean the output addresses of the IOMMU need not be > contiguous? No. I mean the input addresses of the IOMMU. > > Or, there could be two IOMMUs on the bus, each accepting different > > address ranges. These two situations are not distinguishable from the > > guest's point of view. > > > > So for a typical PAPR setup, the device can access system RAM either > > via DMA in the range 0..1GiB (the "32-bit window") or in the range > > 2^59..2^59+<something> (the "64-bit window"). Typically the 32-bit > > window has mappings dynamically created by drivers, and the 64-bit > > window has all of system RAM mapped 1:1, but that's entirely up to the > > OS, it can map each window however it wants. > > > > 32-bit devices (or "64 bit" devices which don't actually implement > > enough the address bits) will only be able to use the 32-bit window, > > of course. > > > > MMIOs of other devices, the "magic" MSI-X addresses belonging to the > > host bridge and other things exist outside those ranges. Those are > > just the ranges which are used to DMA to RAM. > > > > Each PE (domain) can see a different version of what's in each > > window. > > If I'm correct so far. PE actually defines a mapping between an address > range of an address space(aka. iova address space) and an address range > of the system physical address space. No. A PE means several things, but basically it is an isolation domain, like an Intel IOMMU domain. Each PE has an independent set of IOMMU mappings which translate part of the PCI address space to system memory space. > Then my question is: does each PE define a separate iova address sapce > which is flat from 0 - 2^AW -1, AW is address width? As a reference, > VT-d domain defines a flat address space for each domain. Partly. Each PE has an address space which all devices in the PE see. Only some of that address space is mapped to system memory though, other parts are occupied by devices, others are unmapped. Only the parts mapped by the IOMMU vary between PEs - the other parts of the address space will be identical for all PEs on the host bridge. However for POWER guests (not for hosts) there is exactly one PE for each virtual host bridge. > > In fact, if I understand the "IO hole" correctly, the situation on x86 > > isn't very different. It has a window below the IO hole and a second > > window above the IO hole. The addresses within the IO hole go to > > (32-bit) devices on the PCI bus, rather than being translated by the > > If you mean the "IO hole" within the system physcial address space, I think > it's yes. Well, really I mean the IO hole in PCI address space. Because system address space and PCI memory space were traditionally identity mapped on x86 this is easy to confuse though. > > IOMMU to RAM addresses. Because the gap is smaller between the two > > windows, I think we get away without really modelling this detail in > > qemu though. > > > > > > PCI host bridge. Because of the paravirtualization, the mapping to > > > > hardware is fuzzy, but for passthrough devices they will both be > > > > implemented by the IOMMU built into the physical host bridge. That > > > > isn't importat to the guest, though - all operations happen at the > > > > window level. > > > > > > On VT-d, with IOMMU presented, each isolation domain has its own address > > > space. 
That's why we talked more on address space level. And iommu makes > > > the difference. That's the behavioural characteristics a single iommu > > > translation unit has. And thus an IOMMUObject going to have. > > > > Right, that's the same on POWER. But the IOMMU only translates *some* > > addresses within the address space, not all of them. The rest will go > > to other PCI devices or be unmapped, but won't go to RAM. > > > > That's why the IOMMU should really be associated with an MR (or > > several MRs), not an AddressSpace, it only translates some addresses. > > If I'm correct so far, I do believe the major difference between VT-d and > POWER IOMMU is that VT-d isolation domain is a flat address space while > PE of POWER is something different(need your input here as I'm not sure about > it). Maybe it's like there is a flat address space, each PE takes some address > ranges and maps the address ranges to different system physcial address ranges. No, it's really not that different. In both cases (without virt-SVM) there's a system memory address space, and a PCI address space for each domain / PE. There are one or more "outbound" windows in system memory space that map system memory cycles to PCI cycles (used by the CPU to access MMIO) and one or more "inbound" (DMA) windows in PCI memory space which map PCI cycles onto system memory cycles (used by devices to access system memory). On old-style PCs, both inbound and outbound windows were (mostly) identity maps. On POWER they are not. > > > > The other thing that bothers me here is the way it's attached to an > > > > AddressSpace. > > > > > > My consideration is iommu handles AddressSpaces. dma address space is also > > > an address space managed by iommu. > > > > No, it's not. It's a region (or several) within the overall PCI > > address space. Other things in the addressspace, such as other > > device's BARs exist independent of the IOMMU. > > > > It's not something that could really work with PCI-E, I think, but > > with a more traditional PCI bus there's no reason you couldn't have > > multiple IOMMUs listening on different regions of the PCI address > > space. > > I think the point here is on POWER, the input addresses of IOMMUs are actaully > in the same address space? I'm not sure what you mean, but I don't think so. Each PE has its own IOMMU input address space. > What IOMMU does is mapping the different ranges to > different system physcial address ranges. So it's as you mentioned, multiple > IOMMUs listen on different regions of a PCI address space. No. That could be the case in theory, but it's not the usual case. Or rather it depends what you mean by "an IOMMU". For PAPR guests, both IOVA 0..1GiB and 2^59..(somewhere) are mapped to system memory, but with separate page tables. You could consider that two IOMMUs (we mostly treat it that way in qemu). However, all the mapping is handled by the same host bridge with 2 sets of page tables per PE, so you could also call it one IOMMU. This is what I'm getting at when I say that "one IOMMU" is not a clearly defined unit. > While for VT-d, it's not the case. The input addresses of IOMMUs may not > in the same address sapce. As I mentioned, each IOMMU domain on VT-d is a > separate address space. So for VT-d, IOMMUs are actually listening to different > address spaces. That's the point why we want to have address space level > abstraction of IOMMU. > > > > > > That's why we believe it is fine to > > > associate dma address space with an IOMMUObject. 
> > > > > > IIUC how SVM works, the whole point is that the device > > > > no longer writes into a specific PCI address space. Instead, it > > > > writes directly into a process address space. So it seems to me more > > > > that SVM should operate at the PCI level, and disassociate the device > > > > from the normal PCI address space entirely, rather than hooking up > > > > something via that address space. > > > > > > As Peter replied, we still need the PCI address space, it would be used > > > to build up the 2nd level page table which would be used in nested > > > translation. > > > > > > Thanks, > > > Yi L > > > > > > > > > > > > > > Regards, > Yi L >
On Wed, Nov 15, 2017 at 03:16:32PM +0800, Peter Xu wrote: > On Tue, Nov 14, 2017 at 10:52:54PM +0100, Auger Eric wrote: > > [...] > > > I meant, in the current intel_iommu code, vtd_find_add_as() creates 1 > > IOMMU MR and 1 AS per PCIe device, right? > > I think this is the most tricky point - in QEMU IOMMU MR is not really > a 1:1 relationship to devices. For Intel, it's true; for Power, it's > not. On Power guests, one device's DMA address space can be splited > into different translation windows, while each window corresponds to > one IOMMU MR. Right. > So IMHO the real 1:1 mapping is between the device and its DMA address > space, rather than MRs. That's not true either. With both POWER and Intel, several devices can share a DMA address space: on POWER if they are in the same PE, on Intel if they are place in the same IOMMU domain. On x86 and on POWER bare metal we generally try to make the minimum granularity for each PE/domain be a single function. However, that may not be possible in the case of PCIe to PCI bridges, or multifunction devices where the functions aren't properly isolated from each other (e.g. function 0 debug registers which can affect other functions are quite common). For POWER guests we only have one PE/domain per virtual host bridge. That's just a matter of implementation simplicity - if you want fine grained isolation you can just create more virtual host bridges. > It's been a long time since when I drafted the patches. I think at > least that should be a more general notifier mechanism comparing to > current IOMMUNotifier thing, which was bound to IOTLB notifies only. > AFAICT if we want to trap first-level translation changes, current > notifier is not even close to that interface - just see the definition > of IOMMUTLBEntry, it is tailored only for MAP/UNMAP of translation > addresses, not anything else. And IMHO that's why it's tightly bound > to MemoryRegions, and that's the root problem. The dynamic IOMMU MR > switching problem is related to this issue as well. So, having read and thought a bunch more, I think I know where you need to start hooking this in. The thing is the current qemu PCI DMA structure assumes that each device belongs to just a single PCI address space - that's what pci_device_iommu_address_space() returns. For virt-SVM that's just not true. IIUC, a virt-SVM capable device could simultaneously write to multiple process address spaces, since the process IDs actually go over the bus. So trying to hook notifiers at the AddressSpace OR MemoryRegion level just doesn't make sense - if we've picked a single addresss space for the device, we've already made a wrong step. Instead what you need I think is something like: pci_device_virtsvm_context(). virt-SVM capable devices would need to call that *before* calling pci_device_iommu_address_space (). Well rather the virt-SVM capable DMA helpers would need to call that. That would return a new VirtSVMContext (or something) object, which would roughly correspond to a single PASID table. That's where the methods and notifiers for managing that would need to go. > I am not sure current "get IOMMU object from address space" solution > would be best, maybe it's "too bigger a scope", I think it depends on > whether in the future we'll have some requirement in such a bigger > scope (say, something we want to trap from vIOMMU and deliver it to > host IOMMU which may not even be device-related? I don't know). 
Now > another alternative I am thinking of is whether we can provide a > per-device notifier, which can then be bound to the PCIDevice rather than > MemoryRegions, so it will be in device scope. I think that sounds like a version of what I've suggested above.
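One possible reading of that suggestion, as a header-style sketch (every identifier below - VirtSVMContext, VirtSVMEvent, pci_device_virtsvm_context() - is hypothetical, echoing David's "or something"; none of it is existing QEMU API):

/* Illustrative sketch only -- hypothetical, not existing QEMU API. */
#include <stdint.h>

typedef struct VirtSVMContext VirtSVMContext;

/* Events a vIOMMU would raise on the context; VFIO (or another
 * consumer) registers for them and forwards them to the host IOMMU. */
typedef enum {
    VIRTSVM_EVENT_BIND_PASID_TABLE,   /* guest installed its PASID table */
    VIRTSVM_EVENT_UNBIND_PASID_TABLE,
    VIRTSVM_EVENT_FLUSH_FIRST_LEVEL,  /* guest flushed gVA mappings */
} VirtSVMEventType;

typedef struct VirtSVMEvent {
    VirtSVMEventType type;
    uint64_t pasid_table_gpa;         /* valid for BIND */
    uint32_t pasid;                   /* valid for FLUSH, or a wildcard */
} VirtSVMEvent;

typedef void virtsvm_notify_fn(VirtSVMContext *ctx,
                               const VirtSVMEvent *event, void *opaque);

struct VirtSVMContext {
    /* Roughly one PASID table's worth of state, per device (RID). */
    virtsvm_notify_fn *notify;
    void *notify_opaque;
};

/* A virt-SVM capable device (or its DMA helpers) would look this up
 * before falling back to pci_device_iommu_address_space() for the
 * non-PASID (second-level only) case. */
VirtSVMContext *pci_device_virtsvm_context(void *pci_device /* stand-in */);

Yi's reply further down picks this up: vfio_realize() would fetch the context and hook its forwarding callback, while pci_device_iommu_address_space() keeps serving the non-PASID, second-level-only path.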
On Tue, Nov 14, 2017 at 11:21:59AM +0100, Auger Eric wrote: > Hi Yi L, > > On 03/11/2017 13:01, Liu, Yi L wrote: > > From: Peter Xu <peterx@redhat.com> > > > > AddressSpaceOps is similar to MemoryRegionOps, it's just for address > > spaces to store arch-specific hooks. > > > > The first hook I would like to introduce is iommu_get(). Return an > > IOMMUObject behind the AddressSpace. > > David had an objection in the past about this method, saying that > several IOMMUs could translate a single AS? > > https://lists.gnu.org/archive/html/qemu-devel/2017-05/msg01610.html > > On ARM I think it works in general: > In > https://github.com/torvalds/linux/blob/master/Documentation/devicetree/bindings/pci/pci-iommu.txt, > it is said > "a given PCI device can only master through one IOMMU" That's using a platform specific meaning of what "one IOMMU" means. In general what's several IOMMUs and what's one IOMMU which responds to several address regions is not distinguishable from the device's point of view.
On Mon, Dec 18, 2017 at 10:22:18PM +1100, David Gibson wrote: > On Mon, Dec 18, 2017 at 05:17:35PM +0800, Liu, Yi L wrote: > > On Mon, Dec 18, 2017 at 05:14:42PM +1100, David Gibson wrote: > > > On Thu, Nov 16, 2017 at 04:57:09PM +0800, Liu, Yi L wrote: > > > > Hi David, > > > > > > > > On Tue, Nov 14, 2017 at 11:59:34AM +1100, David Gibson wrote: > > > > > On Mon, Nov 13, 2017 at 04:28:45PM +0800, Peter Xu wrote: > > > > > > On Mon, Nov 13, 2017 at 04:56:01PM +1100, David Gibson wrote: > > > > > > > On Fri, Nov 03, 2017 at 08:01:52PM +0800, Liu, Yi L wrote: > > > > > > > > From: Peter Xu <peterx@redhat.com> > > > > > > > > > > > > > > > > AddressSpaceOps is similar to MemoryRegionOps, it's just for address > > > > > > > > spaces to store arch-specific hooks. > > > > > > > > > > > > > > > > The first hook I would like to introduce is iommu_get(). Return an > > > > > > > > IOMMUObject behind the AddressSpace. > > > > > > > > > > > > > > > > For systems that have IOMMUs, we will create a special address > > > > > > > > space per device which is different from system default address > > > > > > > > space for it (please refer to pci_device_iommu_address_space()). > > > > > > > > Normally when that happens, there will be one specific IOMMU (or > > > > > > > > say, translation unit) stands right behind that new address space. > > > > > > > > > > > > > > > > This iommu_get() fetches that guy behind the address space. Here, > > > > > > > > the guy is defined as IOMMUObject, which includes a notifier_list > > > > > > > > so far, may extend in future. Along with IOMMUObject, a new iommu > > > > > > > > notifier mechanism is introduced. It would be used for virt-svm. > > > > > > > > Also IOMMUObject can further have a IOMMUObjectOps which is similar > > > > > > > > to MemoryRegionOps. The difference is IOMMUObjectOps is not relied > > > > > > > > on MemoryRegion. > > > > > > > > > > > > > > > > Signed-off-by: Peter Xu <peterx@redhat.com> > > > > > > > > Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com> > > > > > > > > > > > > > > Hi, sorry I didn't reply to the earlier postings of this after our > > > > > > > discussion in China. I've been sick several times and very busy. > > > > > > > > > > > > > > I still don't feel like there's an adequate explanation of exactly > > > > > > > what an IOMMUObject represents. Obviously it can represent more than > > > > > > > a single translation window - since that's represented by the > > > > > > > IOMMUMR. But what exactly do all the MRs - or whatever else - that > > > > > > > are represented by the IOMMUObject have in common, from a functional > > > > > > > point of view. > > > > > > > > > > > > > > Even understanding the SVM stuff better than I did, I don't really see > > > > > > > why an AddressSpace is an obvious unit to have an IOMMUObject > > > > > > > associated with it. > > > > > > > > > > > > Here's what I thought about it: IOMMUObject was planned to be the > > > > > > abstraction of the hardware translation unit, which is a higher level > > > > > > of the translated address spaces. Say, for each PCI device, it can > > > > > > have its own translated address space. However for multiple PCI > > > > > > devices, they can be sharing the same translation unit that handles > > > > > > the translation requests from different devices. That's the case for > > > > > > Intel platforms. 
We introduced this IOMMUObject because sometimes we > > > > > > want to do something with that translation unit rather than a specific > > > > > > device, in which we need a general IOMMU device handle. > > > > > > > > > > Ok, but what does "hardware translation unit" mean in practice. The > > > > > guest neither knows nor cares, which bits of IOMMU translation happen > > > > > to be included in the same bundle of silicon. It only cares what the > > > > > behaviour is. What behavioural characteristics does a single > > > > > IOMMUObject have? > > > > > > > > > > > IIRC one issue left over during last time's discussion was that there > > > > > > could be more complicated IOMMU models. E.g., one device's DMA request > > > > > > can be translated nestedly by two or multiple IOMMUs, and current > > > > > > proposal cannot really handle that complicated hierachy. I'm just > > > > > > thinking whether we can start from a simple model (say, we don't allow > > > > > > nested IOMMUs, and actually we don't even allow multiple IOMMUs so > > > > > > far), then we can evolve from that point in the future. > > > > > > > > > > > > Also, I thought there were something you mentioned that this approach > > > > > > is not correct for Power systems, but I can't really remember the > > > > > > details... Anyways, I think this is not the only approach to solve > > > > > > the problem, and I believe any new better idea would be greatly > > > > > > welcomed as well. :) > > > > > > > > > > So, some of my initial comments were based on a misunderstanding of > > > > > what was proposed here - since discussing this with Yi at LinuxCon > > > > > Beijing, I have a better idea of what's going on. > > > > > > > > > > On POWER - or rather the "pseries" platform, which is paravirtualized. > > > > > We can have multiple vIOMMU windows (usually 2) for a single virtual > > > > > > > > On POWER, the DMA isolation is done by allocating different DMA window > > > > to different isolation domains? And a single isolation domain may include > > > > multiple dma windows? So with or withou IOMMU, there is only a single > > > > DMA address shared by all the devices in the system? The isolation > > > > mechanism is as what described above? > > > > > > No, the multiple windows are completely unrelated to how things are > > > isolated. > > > > I'm afraid I chose a wrong word by using "DMA window".. > > Actually, when mentioning "DMA window", I mean address ranges in an iova > > address space. > > Yes, so did I. My one window I mean one contiguous range of IOVA addresses. > > > Anyhow, let me re-shape my understanding of POWER IOMMU and > > make sure we are in the same page. > > > > > > > > Just like on x86, each IOMMU domain has independent IOMMU mappings. > > > The only difference is that IBM calls the domains "partitionable > > > endpoints" (PEs) and they tend to be statically created at boot time, > > > rather than runtime generated. > > > > Does POWER IOMMU also have iova concept? Device can use an iova to > > access memory, and IOMMU translates the iova to an address within the > > system physical address? > > Yes. When I say the "PCI address space" I mean the IOVA space. > > > > The windows are about what addresses in PCI space are translated by > > > the IOMMU. If the device generates a PCI cycle, only certain > > > addresses will be mapped by the IOMMU to DMA - other addresses will > > > correspond to other devices MMIOs, MSI vectors, maybe other things. 
> > > > I guess the windows you mentioned here is the address ranges within the > > system physical address space as you also mentioned MMIOs and etc. > > No. I mean ranges within the PCI space == IOVA space. It's simplest > to understand with traditional PCI. A cycle on the bus doesn't know > whether the destination is a device or memory, it just has an address > - a PCI memory address. Part of that address range is mapped to > system RAM, optionally with an IOMMU translating it. Other parts of > that address space are used for devices. > > With PCI-E things get more complicated, but the conceptual model is > the same. > > > > The set of addresses translated by the IOMMU need not be contiguous. > > > > I suppose you mean the output addresses of the IOMMU need not be > > contiguous? > > No. I mean the input addresses of the IOMMU. > > > > Or, there could be two IOMMUs on the bus, each accepting different > > > address ranges. These two situations are not distinguishable from the > > > guest's point of view. > > > > > > So for a typical PAPR setup, the device can access system RAM either > > > via DMA in the range 0..1GiB (the "32-bit window") or in the range > > > 2^59..2^59+<something> (the "64-bit window"). Typically the 32-bit > > > window has mappings dynamically created by drivers, and the 64-bit > > > window has all of system RAM mapped 1:1, but that's entirely up to the > > > OS, it can map each window however it wants. > > > > > > 32-bit devices (or "64 bit" devices which don't actually implement > > > enough the address bits) will only be able to use the 32-bit window, > > > of course. > > > > > > MMIOs of other devices, the "magic" MSI-X addresses belonging to the > > > host bridge and other things exist outside those ranges. Those are > > > just the ranges which are used to DMA to RAM. > > > > > > Each PE (domain) can see a different version of what's in each > > > window. > > > > If I'm correct so far. PE actually defines a mapping between an address > > range of an address space(aka. iova address space) and an address range > > of the system physical address space. > > No. A PE means several things, but basically it is an isolation > domain, like an Intel IOMMU domain. Each PE has an independent set of > IOMMU mappings which translate part of the PCI address space to system > memory space. > > > Then my question is: does each PE define a separate iova address sapce > > which is flat from 0 - 2^AW -1, AW is address width? As a reference, > > VT-d domain defines a flat address space for each domain. > > Partly. Each PE has an address space which all devices in the PE see. > Only some of that address space is mapped to system memory though, > other parts are occupied by devices, others are unmapped. > > Only the parts mapped by the IOMMU vary between PEs - the other parts > of the address space will be identical for all PEs on the host Thx, this comment addressed me well. This is different from what we have on VT-d. > bridge. However for POWER guests (not for hosts) there is exactly one > PE for each virtual host bridge. > > > > In fact, if I understand the "IO hole" correctly, the situation on x86 > > > isn't very different. It has a window below the IO hole and a second > > > window above the IO hole. The addresses within the IO hole go to > > > (32-bit) devices on the PCI bus, rather than being translated by the > > > > If you mean the "IO hole" within the system physcial address space, I think > > it's yes. > > Well, really I mean the IO hole in PCI address space. 
Because system > address space and PCI memory space were traditionally identity mapped > on x86 this is easy to confuse though. > > > > IOMMU to RAM addresses. Because the gap is smaller between the two > > > windows, I think we get away without really modelling this detail in > > > qemu though. > > > > > > > > PCI host bridge. Because of the paravirtualization, the mapping to > > > > > hardware is fuzzy, but for passthrough devices they will both be > > > > > implemented by the IOMMU built into the physical host bridge. That > > > > > isn't importat to the guest, though - all operations happen at the > > > > > window level. > > > > > > > > On VT-d, with IOMMU presented, each isolation domain has its own address > > > > space. That's why we talked more on address space level. And iommu makes > > > > the difference. That's the behavioural characteristics a single iommu > > > > translation unit has. And thus an IOMMUObject going to have. > > > > > > Right, that's the same on POWER. But the IOMMU only translates *some* > > > addresses within the address space, not all of them. The rest will go > > > to other PCI devices or be unmapped, but won't go to RAM. > > > > > > That's why the IOMMU should really be associated with an MR (or > > > several MRs), not an AddressSpace, it only translates some addresses. > > > > If I'm correct so far, I do believe the major difference between VT-d and > > POWER IOMMU is that VT-d isolation domain is a flat address space while > > PE of POWER is something different(need your input here as I'm not sure about > > it). Maybe it's like there is a flat address space, each PE takes some address > > ranges and maps the address ranges to different system physcial address ranges. > > No, it's really not that different. In both cases (without virt-SVM) > there's a system memory address space, and a PCI address space for > each domain / PE. There are one or more "outbound" windows in system > memory space that map system memory cycles to PCI cycles (used by the > CPU to access MMIO) and one or more "inbound" (DMA) windows in PCI > memory space which map PCI cycles onto system memory cycles (used by > devices to access system memory). > > On old-style PCs, both inbound and outbound windows were (mostly) > identity maps. On POWER they are not. > > > > > > The other thing that bothers me here is the way it's attached to an > > > > > AddressSpace. > > > > > > > > My consideration is iommu handles AddressSpaces. dma address space is also > > > > an address space managed by iommu. > > > > > > No, it's not. It's a region (or several) within the overall PCI > > > address space. Other things in the addressspace, such as other > > > device's BARs exist independent of the IOMMU. > > > > > > It's not something that could really work with PCI-E, I think, but > > > with a more traditional PCI bus there's no reason you couldn't have > > > multiple IOMMUs listening on different regions of the PCI address > > > space. > > > > I think the point here is on POWER, the input addresses of IOMMUs are actaully > > in the same address space? > > I'm not sure what you mean, but I don't think so. Each PE has its own > IOMMU input address space. > > > What IOMMU does is mapping the different ranges to > > different system physcial address ranges. So it's as you mentioned, multiple > > IOMMUs listen on different regions of a PCI address space. > > No. That could be the case in theory, but it's not the usual case. > > Or rather it depends what you mean by "an IOMMU". 
For PAPR guests, > both IOVA 0..1GiB and 2^59..(somewhere) are mapped to system memory, > but with separate page tables. You could consider that two IOMMUs (we > mostly treat it that way in qemu). However, all the mapping is > handled by the same host bridge with 2 sets of page tables per PE, so > you could also call it one IOMMU. > > This is what I'm getting at when I say that "one IOMMU" is not a > clearly defined unit. > > > While for VT-d, it's not the case. The input addresses of IOMMUs may not > > in the same address sapce. As I mentioned, each IOMMU domain on VT-d is a > > separate address space. So for VT-d, IOMMUs are actually listening to different > > address spaces. That's the point why we want to have address space level > > abstraction of IOMMU. > > > > > > > > > That's why we believe it is fine to > > > > associate dma address space with an IOMMUObject. > > > > > > > > IIUC how SVM works, the whole point is that the device > > > > > no longer writes into a specific PCI address space. Instead, it > > > > > writes directly into a process address space. So it seems to me more > > > > > that SVM should operate at the PCI level, and disassociate the device > > > > > from the normal PCI address space entirely, rather than hooking up > > > > > something via that address space. After thinking more, I agree that it is not suitable to hook up something for 1st level via the PCI address space. In the time 1st and 2nd level translation is exposed to guest, a device would write to multiple address spaces. PCI address space is only one of them. I think your reply in another email is a good start, let me reply my thoughts under that email. Regards, Yi L > > > > > > > > As Peter replied, we still need the PCI address space, it would be used > > > > to build up the 2nd level page table which would be used in nested > > > > translation. > > > > > > > > Thanks, > > > > Yi L > > > > > > > > > > > > > > > > > > > > Regards, > > Yi L > > > > -- > David Gibson | I'll have my music baroque, and my code > david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ > | _way_ _around_! > http://www.ozlabs.org/~dgibson
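A tiny, self-contained sketch of the point being agreed on here (names are invented for illustration): once PASIDs go over the bus, the same device targets either the per-device PCI address space (requests without a PASID) or one of several process address spaces (requests carrying a PASID), so picking a single AddressSpace for the device up front is already the wrong step.

/* Illustrative sketch only -- invented names and types. */
#include <stdint.h>

#define NO_PASID   UINT32_MAX
#define MAX_PASID  1024            /* made-up table size for the sketch */

typedef struct addr_space addr_space;          /* opaque stand-in */

typedef struct svm_device {
    addr_space *pci_as;                        /* second-level only path */
    addr_space *process_as[MAX_PASID];         /* first+second level path */
} svm_device;

static addr_space *dma_target_as(svm_device *dev, uint32_t pasid)
{
    if (pasid == NO_PASID) {
        return dev->pci_as;                    /* classic DMA */
    }
    if (pasid >= MAX_PASID || !dev->process_as[pasid]) {
        return 0;                              /* unbound PASID: fault */
    }
    return dev->process_as[pasid];             /* SVM DMA */
}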
On Mon, Dec 18, 2017 at 10:35:31PM +1100, David Gibson wrote: > On Wed, Nov 15, 2017 at 03:16:32PM +0800, Peter Xu wrote: > > On Tue, Nov 14, 2017 at 10:52:54PM +0100, Auger Eric wrote: > > > > [...] > > > > > I meant, in the current intel_iommu code, vtd_find_add_as() creates 1 > > > IOMMU MR and 1 AS per PCIe device, right? > > > > I think this is the most tricky point - in QEMU IOMMU MR is not really > > a 1:1 relationship to devices. For Intel, it's true; for Power, it's > > not. On Power guests, one device's DMA address space can be splited > > into different translation windows, while each window corresponds to > > one IOMMU MR. > > Right. > > > So IMHO the real 1:1 mapping is between the device and its DMA address > > space, rather than MRs. > > That's not true either. With both POWER and Intel, several devices > can share a DMA address space: on POWER if they are in the same PE, on > Intel if they are place in the same IOMMU domain. > > On x86 and on POWER bare metal we generally try to make the minimum > granularity for each PE/domain be a single function. However, that > may not be possible in the case of PCIe to PCI bridges, or > multifunction devices where the functions aren't properly isolated > from each other (e.g. function 0 debug registers which can affect > other functions are quite common). > > For POWER guests we only have one PE/domain per virtual host bridge. > That's just a matter of implementation simplicity - if you want fine > grained isolation you can just create more virtual host bridges. > > > It's been a long time since when I drafted the patches. I think at > > least that should be a more general notifier mechanism comparing to > > current IOMMUNotifier thing, which was bound to IOTLB notifies only. > > AFAICT if we want to trap first-level translation changes, current > > notifier is not even close to that interface - just see the definition > > of IOMMUTLBEntry, it is tailored only for MAP/UNMAP of translation > > addresses, not anything else. And IMHO that's why it's tightly bound > > to MemoryRegions, and that's the root problem. The dynamic IOMMU MR > > switching problem is related to this issue as well. > > So, having read and thought a bunch more, I think I know where you > need to start hooking this in. The thing is the current qemu PCI DMA > structure assumes that each device belongs to just a single PCI > address space - that's what pci_device_iommu_address_space() returns. > > For virt-SVM that's just not true. IIUC, a virt-SVM capable device > could simultaneously write to multiple process address spaces, since > the process IDs actually go over the bus. Correct. > > So trying to hook notifiers at the AddressSpace OR MemoryRegion level > just doesn't make sense - if we've picked a single addresss space for > the device, we've already made a wrong step. That's also why we want to have notifiers based on IOMMUObject(may be not a suitable name, let me use it as the patch named). > > Instead what you need I think is something like: > pci_device_virtsvm_context(). virt-SVM capable devices would need to > call that *before* calling pci_device_iommu_address_space (). Well > rather the virt-SVM capable DMA helpers would need to call that. > > That would return a new VirtSVMContext (or something) object, which > would roughly correspond to a single PASID table. That's where the > methods and notifiers for managing that would need to go. Correct, pci_device_iommu_address_space() returns an AS and it is a PCI address space. 
And if pci_device_virtsvm_context() is also called in vfio_realize(), it may not return an AS since there may be no 1st level translation page table bound. So as you said, return a new VirtSVMContext, this VirtSVMContext can hook some new notifiers. I think the IOMMUObject introduced in this patch can meet the requirement. But it may be re-named. So here it addressed the concern you raised before which is hook IOMMUObject via a PCI address space. Regards to VirtSVMContext, it may be a replacement of IOMMUObject. As it is related to PASID, I'm considering to name it as IOMMUPasidContext or IOMMUPasidObject. So it would be an abstraction of all the IOMMU PASID related operations. Regards, Yi L > > > I am not sure current "get IOMMU object from address space" solution > > would be best, maybe it's "too bigger a scope", I think it depends on > > whether in the future we'll have some requirement in such a bigger > > scope (say, something we want to trap from vIOMMU and deliver it to > > host IOMMU which may not even be device-related? I don't know). Now > > another alternative I am thinking is, whether we can provide a > > per-device notifier, then it can be bound to PCIDevice rather than > > MemoryRegions, then it will be in device scope. > > I think that sounds like a version of what I've suggested above. > > -- > David Gibson | I'll have my music baroque, and my code > david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ > | _way_ _around_! > http://www.ozlabs.org/~dgibson
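Continuing the hypothetical VirtSVMContext sketch from above (still invented names, and the "bind ioctl" mentioned in the comment is an assumption, not an existing VFIO interface): a VFIO-like consumer would grab the per-device context at realize time and hook a callback that forwards bind/flush events to the host, while second-level mappings keep flowing through the existing address-space path.

/* Illustrative sketch only -- builds on the hypothetical VirtSVMContext
 * definitions shown earlier in the thread. */
static void example_virtsvm_notify(VirtSVMContext *ctx,
                                   const VirtSVMEvent *event, void *opaque)
{
    switch (event->type) {
    case VIRTSVM_EVENT_BIND_PASID_TABLE:
        /* forward event->pasid_table_gpa to the host IOMMU driver,
         * e.g. via a (hypothetical) VFIO bind ioctl */
        break;
    case VIRTSVM_EVENT_FLUSH_FIRST_LEVEL:
        /* propagate the gVA TLB flush for event->pasid to the host */
        break;
    default:
        break;
    }
}

static void example_vfio_realize(void *pci_device, void *vfio_state)
{
    VirtSVMContext *ctx = pci_device_virtsvm_context(pci_device);

    if (ctx) {
        ctx->notify = example_virtsvm_notify;
        ctx->notify_opaque = vfio_state;
    }
    /* ... second-level (gPA->hPA) mappings still come from
     * pci_device_iommu_address_space() and the MAP/UNMAP notifiers ... */
}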
On Wed, Dec 20, 2017 at 02:32:42PM +0800, Liu, Yi L wrote: > On Mon, Dec 18, 2017 at 10:22:18PM +1100, David Gibson wrote: > > On Mon, Dec 18, 2017 at 05:17:35PM +0800, Liu, Yi L wrote: > > > On Mon, Dec 18, 2017 at 05:14:42PM +1100, David Gibson wrote: > > > > On Thu, Nov 16, 2017 at 04:57:09PM +0800, Liu, Yi L wrote: [snip] > > > > So for a typical PAPR setup, the device can access system RAM either > > > > via DMA in the range 0..1GiB (the "32-bit window") or in the range > > > > 2^59..2^59+<something> (the "64-bit window"). Typically the 32-bit > > > > window has mappings dynamically created by drivers, and the 64-bit > > > > window has all of system RAM mapped 1:1, but that's entirely up to the > > > > OS, it can map each window however it wants. > > > > > > > > 32-bit devices (or "64 bit" devices which don't actually implement > > > > enough the address bits) will only be able to use the 32-bit window, > > > > of course. > > > > > > > > MMIOs of other devices, the "magic" MSI-X addresses belonging to the > > > > host bridge and other things exist outside those ranges. Those are > > > > just the ranges which are used to DMA to RAM. > > > > > > > > Each PE (domain) can see a different version of what's in each > > > > window. > > > > > > If I'm correct so far. PE actually defines a mapping between an address > > > range of an address space(aka. iova address space) and an address range > > > of the system physical address space. > > > > No. A PE means several things, but basically it is an isolation > > domain, like an Intel IOMMU domain. Each PE has an independent set of > > IOMMU mappings which translate part of the PCI address space to system > > memory space. > > > > > Then my question is: does each PE define a separate iova address sapce > > > which is flat from 0 - 2^AW -1, AW is address width? As a reference, > > > VT-d domain defines a flat address space for each domain. > > > > Partly. Each PE has an address space which all devices in the PE see. > > Only some of that address space is mapped to system memory though, > > other parts are occupied by devices, others are unmapped. > > > > Only the parts mapped by the IOMMU vary between PEs - the other parts > > of the address space will be identical for all PEs on the host > > Thx, this comment addressed me well. This is different from what we have > on VT-d. Really? That's hard to believe. I'm pretty sure the VT-d IOMMU must have a range < 2^64, and anything on the bus outside that range I expect would be common between all domains. In particular I'd expect the BARs for other devices not to be remapped by the IOMMU (though they may be inaccessible on PCI-E due peer to peer transactions being blocked). As well as things above the IOMMU's range, I'd expect the region for 32-bit BARs to be common between all domains. [snip] > > > > > > IIUC how SVM works, the whole point is that the device > > > > > > no longer writes into a specific PCI address space. Instead, it > > > > > > writes directly into a process address space. So it seems to me more > > > > > > that SVM should operate at the PCI level, and disassociate the device > > > > > > from the normal PCI address space entirely, rather than hooking up > > > > > > something via that address space. > > After thinking more, I agree that it is not suitable to hook up something for > 1st level via the PCI address space. In the time 1st and 2nd level translation > is exposed to guest, a device would write to multiple address spaces. PCI address > space is only one of them. 
I think your reply in another email is a good start, > let me reply my thoughts under that email. > > Regards, > Yi L > > > > > > > > > > > As Peter replied, we still need the PCI address space, it would be used > > > > > to build up the 2nd level page table which would be used in nested > > > > > translation. > > > > > > > > > > Thanks, > > > > > Yi L > > > > > > > > > > > > > > > > > > > > > > > > > > Regards, > > > Yi L > > > > > > >
On Wed, Dec 20, 2017 at 02:47:30PM +0800, Liu, Yi L wrote: > On Mon, Dec 18, 2017 at 10:35:31PM +1100, David Gibson wrote: > > On Wed, Nov 15, 2017 at 03:16:32PM +0800, Peter Xu wrote: > > > On Tue, Nov 14, 2017 at 10:52:54PM +0100, Auger Eric wrote: > > > > > > [...] > > > > > > > I meant, in the current intel_iommu code, vtd_find_add_as() creates 1 > > > > IOMMU MR and 1 AS per PCIe device, right? > > > > > > I think this is the most tricky point - in QEMU IOMMU MR is not really > > > a 1:1 relationship to devices. For Intel, it's true; for Power, it's > > > not. On Power guests, one device's DMA address space can be splited > > > into different translation windows, while each window corresponds to > > > one IOMMU MR. > > > > Right. > > > > > So IMHO the real 1:1 mapping is between the device and its DMA address > > > space, rather than MRs. > > > > That's not true either. With both POWER and Intel, several devices > > can share a DMA address space: on POWER if they are in the same PE, on > > Intel if they are place in the same IOMMU domain. > > > > On x86 and on POWER bare metal we generally try to make the minimum > > granularity for each PE/domain be a single function. However, that > > may not be possible in the case of PCIe to PCI bridges, or > > multifunction devices where the functions aren't properly isolated > > from each other (e.g. function 0 debug registers which can affect > > other functions are quite common). > > > > For POWER guests we only have one PE/domain per virtual host bridge. > > That's just a matter of implementation simplicity - if you want fine > > grained isolation you can just create more virtual host bridges. > > > > > It's been a long time since when I drafted the patches. I think at > > > least that should be a more general notifier mechanism comparing to > > > current IOMMUNotifier thing, which was bound to IOTLB notifies only. > > > AFAICT if we want to trap first-level translation changes, current > > > notifier is not even close to that interface - just see the definition > > > of IOMMUTLBEntry, it is tailored only for MAP/UNMAP of translation > > > addresses, not anything else. And IMHO that's why it's tightly bound > > > to MemoryRegions, and that's the root problem. The dynamic IOMMU MR > > > switching problem is related to this issue as well. > > > > So, having read and thought a bunch more, I think I know where you > > need to start hooking this in. The thing is the current qemu PCI DMA > > structure assumes that each device belongs to just a single PCI > > address space - that's what pci_device_iommu_address_space() returns. > > > > For virt-SVM that's just not true. IIUC, a virt-SVM capable device > > could simultaneously write to multiple process address spaces, since > > the process IDs actually go over the bus. > > Correct. > > > So trying to hook notifiers at the AddressSpace OR MemoryRegion level > > just doesn't make sense - if we've picked a single addresss space for > > the device, we've already made a wrong step. > > That's also why we want to have notifiers based on IOMMUObject(may be > not a suitable name, let me use it as the patch named). Right, I think "IOMMUObject" is a misleading name. > > Instead what you need I think is something like: > > pci_device_virtsvm_context(). virt-SVM capable devices would need to > > call that *before* calling pci_device_iommu_address_space (). Well > > rather the virt-SVM capable DMA helpers would need to call that. 
> > > > That would return a new VirtSVMContext (or something) object, which > > would roughly correspond to a single PASID table. That's where the > > methods and notifiers for managing that would need to go. > > Correct, pci_device_iommu_address_space() returns an AS and it is > a PCI address space. And if pci_device_virtsvm_context() is also > called in vfio_realize(), it may not return an AS since there may > be no 1st level translation page table bound. > > So as you said, return a new VirtSVMContext, this VirtSVMContext can > hook some new notifiers. I think the IOMMUObject introduced in this patch > can meet the requirement. But it may be re-named. Ok. > So here it addressed the concern you raised before which is hook IOMMUObject > via a PCI address space. Regards to VirtSVMContext, it may be a replacement > of IOMMUObject. As it is related to PASID, I'm considering to name it as > IOMMUPasidContext or IOMMUPasidObject. So it would be an abstraction of all > the IOMMU PASID related operations. I'm ok with calling it a "PASID context". Thinking about this some more, here are some extra observations: * I think each device needs both a PASID context and an ordinary address space. The PASID context would be used for bus transactions which include a process id, the address space for those that don't. * Theoretically, the PASID context could be modelled as an array/map of AddressSpace objects for each process ID. However, creating all those AddressSpace objects in advance might be too expensive. I can see a couple of options to avoid this: 1) Have the PASID context class include a 'translate' method similar to the one in IOMMUMemoryRegionClass, but taking a process ID as well as an address. This would avoid creating extra AddressSpace objects, but might require duplicating a bunch of the translation code that already exists for AddressSpace. 2) "Lazily" create AddressSpace objects. The generic part of the PASID aware DMA helper functions would use a cache of AddressSpace's for particular process IDs, using the AddressSpace (and MemoryRegion within) to translate accesses for a particular process ID. However, these AddressSpace and MemoryRegion objects would only be created when the device first accesses that address space. In the common case, where a single device is just being used by a single process or a small number, this should keep the number of AddressSpace objects relatively small. Obviously the cache would need to be invalidated, cleaning up the AddressSpace objects, when the PASID table is altered. * I realize that the expected case here is with KVM, where the guest controls the first level translation, but the host controls the second level translation. However, we should also be able to model the case where the guest controls both levels for the sake of full system emulation. I think understanding this case will lead to a better design even for the simpler case. Do you have a plan for what the virt-SVM aware DMA functions will look like?
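To illustrate option (1) from the message above: a rough sketch of a PASID-context class whose translate hook takes a process ID in addition to the address, loosely modelled on IOMMUMemoryRegionClass::translate(). The IOMMUPasidContext/IOMMUPasidContextClass names and the exact signature are only assumptions for the sake of the example.

/* Hypothetical sketch of option (1): translate (pasid, addr) pairs
 * directly, instead of instantiating an AddressSpace per process. */
typedef struct IOMMUPasidContext IOMMUPasidContext;

typedef struct IOMMUPasidContextClass {
    /* Return the mapping for 'addr' within process 'pasid'; 'flag'
     * distinguishes read and write accesses, as it does for the
     * existing IOMMU memory region translate callback. */
    IOMMUTLBEntry (*translate)(IOMMUPasidContext *ctx, uint32_t pasid,
                               hwaddr addr, IOMMUAccessFlags flag);
} IOMMUPasidContextClass;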
On Wed, Dec 20, 2017 at 10:18:16PM +1100, David Gibson wrote: > On Wed, Dec 20, 2017 at 02:47:30PM +0800, Liu, Yi L wrote: > > On Mon, Dec 18, 2017 at 10:35:31PM +1100, David Gibson wrote: > > > On Wed, Nov 15, 2017 at 03:16:32PM +0800, Peter Xu wrote: > > > > On Tue, Nov 14, 2017 at 10:52:54PM +0100, Auger Eric wrote: > > > > > > > > [...] > > > So, having read and thought a bunch more, I think I know where you > > > need to start hooking this in. The thing is the current qemu PCI DMA > > > structure assumes that each device belongs to just a single PCI > > > address space - that's what pci_device_iommu_address_space() returns. > > > > > > For virt-SVM that's just not true. IIUC, a virt-SVM capable device > > > could simultaneously write to multiple process address spaces, since > > > the process IDs actually go over the bus. > > > > Correct. > > > > > So trying to hook notifiers at the AddressSpace OR MemoryRegion level > > > just doesn't make sense - if we've picked a single addresss space for > > > the device, we've already made a wrong step. > > > > That's also why we want to have notifiers based on IOMMUObject(may be > > not a suitable name, let me use it as the patch named). > > Right, I think "IOMMUObject" is a misleading name. > > > > Instead what you need I think is something like: > > > pci_device_virtsvm_context(). virt-SVM capable devices would need to > > > call that *before* calling pci_device_iommu_address_space (). Well > > > rather the virt-SVM capable DMA helpers would need to call that. > > > > > > That would return a new VirtSVMContext (or something) object, which > > > would roughly correspond to a single PASID table. That's where the > > > methods and notifiers for managing that would need to go. > > > > Correct, pci_device_iommu_address_space() returns an AS and it is > > a PCI address space. And if pci_device_virtsvm_context() is also > > called in vfio_realize(), it may not return an AS since there may > > be no 1st level translation page table bound. > > > > So as you said, return a new VirtSVMContext, this VirtSVMContext can > > hook some new notifiers. I think the IOMMUObject introduced in this patch > > can meet the requirement. But it may be re-named. > > Ok. > > > So here it addressed the concern you raised before which is hook IOMMUObject > > via a PCI address space. Regards to VirtSVMContext, it may be a replacement > > of IOMMUObject. As it is related to PASID, I'm considering to name it as > > IOMMUPasidContext or IOMMUPasidObject. So it would be an abstraction of all > > the IOMMU PASID related operations. > > I'm ok with calling it a "PASID context". > > Thinking about this some more, here are some extra observations: > > * I think each device needs both a PASID context and an ordinary > address space. The PASID context would be used for bus > transactions which include a process id, the address space for > those that don't. Correct. Also virt-SVM still needs the PCI Address space. And the PCI Address space == Guest physical Address space. For virt-SVM, requires pt mode to ensure the nested translation. > > * Theoretically, the PASID context could be modelled as an array/map > of AddressSpace objects for each process ID. However, creating all I also thought about creating AddressSpace objects for each process ID. But I don't think we need to do it. My reason as below: In theory, it is correct to have AddressSpace object for each process virtual address space in Qemu, and this is what we are doing for PCI address space so far. 
However, that is only necessary when we want to mirror the mapping to the host: each time a mapping changes within the PCI address space, we need to mirror it to the host.

For virt-SVM, on the other hand, we don't need to mirror the changes within the guest process address space. The nested translation capability in HW brings us a convenience: in nested translation, HW can access a guest PASID table with a GPA (this is what Intel and AMD do; not sure about ARM, maybe or maybe not). For VT-d and AMD-IOMMU, even when the guest PASID table changes, it is not necessary to mirror the change to the host. Based on the statements above, there is a candidate function to be included in PASIDContext: bind_guest_pasid_table(), used to set the guest PASID table into the host translation structure when the guest finds that a device has SVM capability.

> those AddressSpace objects in advance might be too expensive. I
> can see a couple of options to avoid this:
>
> 1) Have the PASID context class include a 'translate' method similar
> to the one in IOMMUMemoryRegionClass, but taking a process ID as well
> as an address. This would avoid creating extra AddressSpace objects,
> but might require duplicating a bunch of the translation code that
> already exists for AddressSpace.

The translation code would actually walk the guest CR3 table to get a GVA->GPA map, but that is not really required so far thanks to the HW capability.

>
> 2) "Lazily" create AddressSpace objects. The generic part of the
> PASID aware DMA helper functions would use a cache of AddressSpace's
> for particular process IDs, using the AddressSpace (and MemoryRegion
> within) to translate accesses for a particular process ID. However,
> these AddressSpace and MemoryRegion objects would only be created when
> the device first accesses that address space. In the common case,
> where a single device is just being used by a single process or a
> small number, this should keep the number of AddressSpace objects
> relatively small. Obviously the cache would need to be invalidated,
> cleaning up the AddressSpace objects, when the PASID table is altered.

This "lazy" mechanism is not required, but I agree with the last statement: when the PASID table is altered, the cache should be invalidated. Additionally, the cache for the mappings (GVA) within the guest process address space should also be invalidated when there is a change. This also calls for a candidate function in PASIDContext: svm_invalidate_tlb(), used to invalidate the GVA cache and also the PASID entry cache.

>
> * I realize that the expected case here is with KVM, where the guest
> controls the first level translation, but the host controls the
> second level translation. However, we should also be able to model

Yeah, this is the case. And to be accurate, both the 1st level and 2nd level translation here happen on the HW IOMMU. The 1st level page table is actually "linked" from the guest, and the 2nd level page table is created/controlled by QEMU (with MAP/UNMAP).

> the case where the guest controls both levels for the sake of full
> system emulation. I think understanding this case will lead to a
> better design even for the simpler case.

Not sure if I got your point accurately; let me try to reply based on what I got. If the guest enables virt-IOVA (DMA isolation in guest scope) and virt-SVM, I think this is a case where the guest controls both the 1st and 2nd level translation.

For virt-IOVA, we need to mirror all the guest IOVA mappings to the host (aka shadowing). The current MemoryRegion based notifiers (MAP/UNMAP) should be enough.
Peter has already upstreamed it.

>
> Do you have a plan for what the virt-SVM aware DMA functions will look
> like?

I think there are two candidates so far. These should be enough for VT-d and AMD-IOMMU; for ARM-SMMU, an extra function may be needed.

* bind_guest_pasid_table()
* svm_invalidate_tlb()

>
> --
> David Gibson                   | I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
>                                | _way_ _around_!
> http://www.ozlabs.org/~dgibson

Thanks,
Yi L
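A sketch of how the two candidates above could be grouped into an ops table on the PASID context. All names and argument lists below are guesses based on this thread; the real interface would depend on what VT-d, AMD-IOMMU and the SMMU actually need.

/* Hypothetical ops for the PASID context discussed above. */
typedef struct PASIDContext PASIDContext;

typedef struct PASIDContextOps {
    /* Hand the GPA of the guest PASID table to the host IOMMU; with
     * nested translation the hardware walks it in place, so guest
     * updates to the table need not be shadowed. */
    int (*bind_guest_pasid_table)(PASIDContext *ctx,
                                  hwaddr pasidt_gpa, uint32_t entries);

    /* Propagate guest-initiated invalidations of first-level (GVA)
     * mappings and of cached PASID entries to the host IOMMU. */
    int (*svm_invalidate_tlb)(PASIDContext *ctx, uint32_t pasid,
                              hwaddr addr, uint64_t npages);
} PASIDContextOps;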
On Wed, Dec 20, 2017 at 10:01:10PM +1100, David Gibson wrote:
> On Wed, Dec 20, 2017 at 02:32:42PM +0800, Liu, Yi L wrote:
> > On Mon, Dec 18, 2017 at 10:22:18PM +1100, David Gibson wrote:
> > > On Mon, Dec 18, 2017 at 05:17:35PM +0800, Liu, Yi L wrote:
> > > > On Mon, Dec 18, 2017 at 05:14:42PM +1100, David Gibson wrote:
> > > > > On Thu, Nov 16, 2017 at 04:57:09PM +0800, Liu, Yi L wrote:
[snip]
> > > Partly. Each PE has an address space which all devices in the PE see.
> > > Only some of that address space is mapped to system memory though,
> > > other parts are occupied by devices, others are unmapped.
> > >
> > > Only the parts mapped by the IOMMU vary between PEs - the other parts
> > > of the address space will be identical for all PEs on the host
> >
> > Thx, this comment addressed me well. This is different from what we have
> > on VT-d.
>
> Really? That's hard to believe. I'm pretty sure the VT-d IOMMU must
> have a range < 2^64, and anything on the bus outside that range I
> expect would be common between all domains. In particular I'd expect
> the BARs for other devices not to be remapped by the IOMMU (though
> they may be inaccessible on PCI-E due peer to peer transactions being
> blocked). As well as things above the IOMMU's range, I'd expect the
> region for 32-bit BARs to be common between all domains.

Sorry, I misunderstood you. In each IOVA space there is a reserved range, which is the BARs' MMIO range. Such a reservation is there to avoid unexpected peer-to-peer transactions. So with regard to the IOVA space, all vendors should be similar - you are right.

Thanks,
Yi L
On Thu, Dec 21, 2017 at 04:40:19PM +0800, Liu, Yi L wrote: > On Wed, Dec 20, 2017 at 10:18:16PM +1100, David Gibson wrote: > > On Wed, Dec 20, 2017 at 02:47:30PM +0800, Liu, Yi L wrote: > > > On Mon, Dec 18, 2017 at 10:35:31PM +1100, David Gibson wrote: > > > > On Wed, Nov 15, 2017 at 03:16:32PM +0800, Peter Xu wrote: > > > > > On Tue, Nov 14, 2017 at 10:52:54PM +0100, Auger Eric wrote: > > > > > > > > > > > [...] > > > > So, having read and thought a bunch more, I think I know where you > > > > need to start hooking this in. The thing is the current qemu PCI DMA > > > > structure assumes that each device belongs to just a single PCI > > > > address space - that's what pci_device_iommu_address_space() returns. > > > > > > > > For virt-SVM that's just not true. IIUC, a virt-SVM capable device > > > > could simultaneously write to multiple process address spaces, since > > > > the process IDs actually go over the bus. > > > > > > Correct. > > > > > > > So trying to hook notifiers at the AddressSpace OR MemoryRegion level > > > > just doesn't make sense - if we've picked a single addresss space for > > > > the device, we've already made a wrong step. > > > > > > That's also why we want to have notifiers based on IOMMUObject(may be > > > not a suitable name, let me use it as the patch named). > > > > Right, I think "IOMMUObject" is a misleading name. > > > > > > Instead what you need I think is something like: > > > > pci_device_virtsvm_context(). virt-SVM capable devices would need to > > > > call that *before* calling pci_device_iommu_address_space (). Well > > > > rather the virt-SVM capable DMA helpers would need to call that. > > > > > > > > That would return a new VirtSVMContext (or something) object, which > > > > would roughly correspond to a single PASID table. That's where the > > > > methods and notifiers for managing that would need to go. > > > > > > Correct, pci_device_iommu_address_space() returns an AS and it is > > > a PCI address space. And if pci_device_virtsvm_context() is also > > > called in vfio_realize(), it may not return an AS since there may > > > be no 1st level translation page table bound. > > > > > > So as you said, return a new VirtSVMContext, this VirtSVMContext can > > > hook some new notifiers. I think the IOMMUObject introduced in this patch > > > can meet the requirement. But it may be re-named. > > > > Ok. > > > > > So here it addressed the concern you raised before which is hook IOMMUObject > > > via a PCI address space. Regards to VirtSVMContext, it may be a replacement > > > of IOMMUObject. As it is related to PASID, I'm considering to name it as > > > IOMMUPasidContext or IOMMUPasidObject. So it would be an abstraction of all > > > the IOMMU PASID related operations. > > > > I'm ok with calling it a "PASID context". > > > > Thinking about this some more, here are some extra observations: > > > > * I think each device needs both a PASID context and an ordinary > > address space. The PASID context would be used for bus > > transactions which include a process id, the address space for > > those that don't. > > Correct. Also virt-SVM still needs the PCI Address space. And the PCI > Address space == Guest physical Address space. Not necessarily. That's true if you're only making the L1 translation guest visible. But I think we need to at least think about the case where both L1 and L2 translations are guest visible, in which case the PCI address space is not the same as the guest physical address space. 
> For virt-SVM, requires > pt mode to ensure the nested translation. What is pt mode? > > * Theoretically, the PASID context could be modelled as an array/map > > of AddressSpace objects for each process ID. However, creating all > > I also thought about creating AddressSpace objects for each process ID. > But I don't think we need to do it. My reason as below: > > In theory, it is correct to have AddressSpace object for each process > virtual address space in Qemu, and this is what we are doing for PCI > address space so far. However, this is necessary when we want to mirror > the mapping to host. Each time there is mapping changed within the PCI > address space, we need to mirror it to host. > > While for virt-SVM, we don't need to mirror the changes within the guest > process address space. The nested translation capability in HW brings us > a convenience. In nested translation, HW can access a guest PASID table > with a GPA(this is what Intel and AMD does, not sure about ARM, maybe or > maybe not). For VT-d and AMD-IOMMU, even any change in guest PASID table, > it is not necessary to mirror it to host. Based on the statements above, > there is a candidate function to be included in PASIDContext. It could be > bind_guest_pasid_table(). And be used to set the guest PASID Table to host > translation structure when guest finds a device is has SVM > capability. That's true for passthrough devices, but not for qemu emulated devices. None of those support SVM yet, but there's no reason they couldn't in future. Even though that will never be the main production case, I think we'll get a better model if we think about these edge cases carefully. > > those AddressSpace objects in advance might be too expensive. I > > can see a couple of options to avoid this: > > > > 1) Have the PASID context class include a 'translate' method similar > > to the one in IOMMUMemoryRegionClass, but taking a process ID as well > > as an address. This would avoid creating extra AddressSpace objects, > > but might require duplicating a bunch of the translation code that > > already exists for AddressSpace. > > Translation code actually walks the guest CR3 table to get a > GVA->GPA map. Right, but that's clearly x86 specific. > But it is not really required so far due to HW capability. > > > > > 2) "Lazily" create AddressSpace objects. The generic part of the > > PASID aware DMA helper functions would use a cache of AddressSpace's > > for particular process IDs, using the AddressSpace (and MemoryRegion > > within) to translate accesses for a particular process ID. However, > > these AddressSpace and MemoryRegion objects would only be created when > > the device first accesses that address space. In the common case, > > where a single device is just being used by a single process or a > > small number, this should keep the number of AddressSpace objects > > relatively small. Obviously the cache would need to be invalidated, > > cleaning up the AddressSpace objects, when the PASID table is altered. > > This "Lazily" mechanism is not required. But I agree with the last statement. > When PASID Table altered, the cache should be invalidated. Additionally, the > cache for the mappings(GVA) within the guest process address space should also > be invalidated when there is change. This also calls for a candidate function > in PASIDContext. It could be svm_invalidate_tlb(). And be used to invalidate > GVA cache and also PASID entry cache. 
> > > > > * I realize that the expected case here is with KVM, where the guest > > controls the first level translation, but the host controls the > > second level translation. However, we should also be able to model > > Yeah, this is the case. And to be accurate, both the 1st level and 2nd level > translation here happens on HW IOMMU. The 1st level page table is actually > like "linked" from guest. And the 2nd level page table is created/conrolled > by Qemu(with MAP/UNMAP). > > > the case where the guest controls both levels for the sake of full > > system emulation. I think understanding this case will lead to a > > better design even for the simpler case. > > Not sure if I got your point accurately. Let me try to reply based on > what I got. If guest enables virt-IOVA(DMA isolaton in guest scope) and > virt-SVM. I think this is a case which guest controls both 1st and 2nd > level translation. That sounds right, from my limited knowledge of the x86 IOMMU. > For virt-IOVA, we need to mirror all the guest IOVA mapping to host(aka. > shadow). Current MemoryRegion based notifiers(MAP/UNMAP) should be enough. > Peter has already upstreamed it. Right, but we also need to model the translations for emulated devices. > > Do you have a plan for what the virt-SVM aware DMA functions will look > > like? > > I think there are two candidates so far. This should be enough for VT-d and > AMD-IOMMU. For ARM-SMMU, it may need extra function. > * bind_guest_pasid_table() > * svm_invalidate_tlb() That's not really what I meant. What I was getting it if you were writing an emulated device which supported SVM, what would the functions to perform SVM-aware DMA look like?
On Wed, Jan 03, 2018 at 11:28:17AM +1100, David Gibson wrote: > On Thu, Dec 21, 2017 at 04:40:19PM +0800, Liu, Yi L wrote: > > On Wed, Dec 20, 2017 at 10:18:16PM +1100, David Gibson wrote: > > > On Wed, Dec 20, 2017 at 02:47:30PM +0800, Liu, Yi L wrote: > > > > On Mon, Dec 18, 2017 at 10:35:31PM +1100, David Gibson wrote: > > > > > On Wed, Nov 15, 2017 at 03:16:32PM +0800, Peter Xu wrote: > > > > > > On Tue, Nov 14, 2017 at 10:52:54PM +0100, Auger Eric wrote: > > > > > > > > > > > > [...] > > > > > > I'm ok with calling it a "PASID context". > > > > > > Thinking about this some more, here are some extra observations: > > > > > > * I think each device needs both a PASID context and an ordinary > > > address space. The PASID context would be used for bus > > > transactions which include a process id, the address space for > > > those that don't. > > > > Correct. Also virt-SVM still needs the PCI Address space. And the PCI > > Address space == Guest physical Address space. > > Not necessarily. That's true if you're only making the L1 translation > guest visible. But I think we need to at least think about the case > where both L1 and L2 translations are guest visible, in which case the > PCI address space is not the same as the guest physical address space. > > > For virt-SVM, requires > > pt mode to ensure the nested translation. > > What is pt mode? The pt mode here means the kernel parameter "iommu=pt" which means iommu do 1:1 mapping for iova. For t virt-SVM on VT-d, with guest set iommu=pt, the 2nd level page table in host would be a GPA->HPA mapping. If not, the host 2nd level page table would be a GIOVA->HPA mapping which is not expected in nested translation. > > > * Theoretically, the PASID context could be modelled as an array/map > > > of AddressSpace objects for each process ID. However, creating all > > > > I also thought about creating AddressSpace objects for each process ID. > > But I don't think we need to do it. My reason as below: > > > > In theory, it is correct to have AddressSpace object for each process > > virtual address space in Qemu, and this is what we are doing for PCI > > address space so far. However, this is necessary when we want to mirror > > the mapping to host. Each time there is mapping changed within the PCI > > address space, we need to mirror it to host. > > > > While for virt-SVM, we don't need to mirror the changes within the guest > > process address space. The nested translation capability in HW brings us > > a convenience. In nested translation, HW can access a guest PASID table > > with a GPA(this is what Intel and AMD does, not sure about ARM, maybe or > > maybe not). For VT-d and AMD-IOMMU, even any change in guest PASID table, > > it is not necessary to mirror it to host. Based on the statements above, > > there is a candidate function to be included in PASIDContext. It could be > > bind_guest_pasid_table(). And be used to set the guest PASID Table to host > > translation structure when guest finds a device is has SVM > > capability. > > That's true for passthrough devices, but not for qemu emulated > devices. None of those support SVM yet, but there's no reason they > couldn't in future. > > Even though that will never be the main production case, I think we'll > get a better model if we think about these edge cases carefully. Yeah, it's a quite good edge case. If an emulated device want to do DMA to a guest process virtual address space, Qemu needs to know it and do necessary address translation for it just as iova. 
If we want to support emulated SVM capable device, not sure if the current AddressSpace in Qemu can fit well. Honestly, SVM capable devices are major complicated accelerators. So I think we need to balance the efort. But I agree it is helpful to understand more about the edge cases. > > > those AddressSpace objects in advance might be too expensive. I > > > can see a couple of options to avoid this: > > > > > > 1) Have the PASID context class include a 'translate' method similar > > > to the one in IOMMUMemoryRegionClass, but taking a process ID as well > > > as an address. This would avoid creating extra AddressSpace objects, > > > but might require duplicating a bunch of the translation code that > > > already exists for AddressSpace. > > > > Translation code actually walks the guest CR3 table to get a > > GVA->GPA map. > > Right, but that's clearly x86 specific. Yes, CR3 table is x86 specific. While for page table walking, I think it can be vendor-agnostic. > > > But it is not really required so far due to HW capability. > > > > > > > > 2) "Lazily" create AddressSpace objects. The generic part of the > > > PASID aware DMA helper functions would use a cache of AddressSpace's > > > for particular process IDs, using the AddressSpace (and MemoryRegion > > > within) to translate accesses for a particular process ID. However, > > > these AddressSpace and MemoryRegion objects would only be created when > > > the device first accesses that address space. In the common case, > > > where a single device is just being used by a single process or a > > > small number, this should keep the number of AddressSpace objects > > > relatively small. Obviously the cache would need to be invalidated, > > > cleaning up the AddressSpace objects, when the PASID table is altered. > > > > This "Lazily" mechanism is not required. But I agree with the last statement. > > When PASID Table altered, the cache should be invalidated. Additionally, the > > cache for the mappings(GVA) within the guest process address space should also > > be invalidated when there is change. This also calls for a candidate function > > in PASIDContext. It could be svm_invalidate_tlb(). And be used to invalidate > > GVA cache and also PASID entry cache. > > > > > > > > * I realize that the expected case here is with KVM, where the guest > > > controls the first level translation, but the host controls the > > > second level translation. However, we should also be able to model > > > > Yeah, this is the case. And to be accurate, both the 1st level and 2nd level > > translation here happens on HW IOMMU. The 1st level page table is actually > > like "linked" from guest. And the 2nd level page table is created/conrolled > > by Qemu(with MAP/UNMAP). > > > > > the case where the guest controls both levels for the sake of full > > > system emulation. I think understanding this case will lead to a > > > better design even for the simpler case. > > > > Not sure if I got your point accurately. Let me try to reply based on > > what I got. If guest enables virt-IOVA(DMA isolaton in guest scope) and > > virt-SVM. I think this is a case which guest controls both 1st and 2nd > > level translation. > > That sounds right, from my limited knowledge of the x86 IOMMU. > > > For virt-IOVA, we need to mirror all the guest IOVA mapping to host(aka. > > shadow). Current MemoryRegion based notifiers(MAP/UNMAP) should be enough. > > Peter has already upstreamed it. > > Right, but we also need to model the translations for emulated > devices. 
For emulated devices, I think virt-IOVA is workable with the latest QEMU.

>
> > > Do you have a plan for what the virt-SVM aware DMA functions will look
> > > like?
> >
> > I think there are two candidates so far. This should be enough for VT-d and
> > AMD-IOMMU. For ARM-SMMU, it may need extra function.
> > * bind_guest_pasid_table()
> > * svm_invalidate_tlb()
>
> That's not really what I meant. What I was getting it if you were
> writing an emulated device which supported SVM, what would the
> functions to perform SVM-aware DMA look like?

Sorry for the misunderstanding - I'm not writing an emulated device with SVM capability. For an SVM-capable device, in brief, it needs to be able to send a Memory Request with PASID. If an emulated device tries to access a process virtual address space, QEMU needs to know it and do the translation for it. That's what the SVM-aware DMA should look like. However, I haven't got a model for it yet; it probably needs a pasid field added to AddressSpace.

> --
> David Gibson                   | I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
>                                | _way_ _around_!
> http://www.ozlabs.org/~dgibson

Regards,
Yi L
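As a thought experiment for the emulated-device case discussed above, the device-side DMA path might look roughly like this. EmuSVMDevice, its fields, and dma_memory_rw_pasid() are all invented for the example, the latter in the spirit of the PASID-aware dma_memory_rw() variant mentioned later in the thread.

/* Illustrative only: an emulated SVM-capable device whose guest driver
 * has programmed a PASID into a device register. DMA from that
 * context must be translated against the process address space. */
typedef struct EmuSVMDevice {
    PCIDevice parent_obj;
    PASIDContext *pasid_ctx;    /* hypothetical, obtained from the vIOMMU */
    bool pasid_enabled;
    uint32_t pasid;             /* value the guest driver programmed */
} EmuSVMDevice;

static int emu_svm_dev_dma_read(EmuSVMDevice *d, uint64_t addr,
                                void *buf, dma_addr_t len)
{
    if (!d->pasid_enabled) {
        /* No PASID: ordinary DMA into the device's PCI address space. */
        return pci_dma_read(&d->parent_obj, addr, buf, len);
    }
    /* PASID-tagged DMA: let a (hypothetical) helper resolve the
     * per-process mapping and perform the access. */
    return dma_memory_rw_pasid(d->pasid_ctx, d->pasid, addr, buf, len,
                               DMA_DIRECTION_TO_DEVICE);
}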
On Wed, Dec 20, 2017 at 10:18:16PM +1100, David Gibson wrote: > On Wed, Dec 20, 2017 at 02:47:30PM +0800, Liu, Yi L wrote: > > On Mon, Dec 18, 2017 at 10:35:31PM +1100, David Gibson wrote: > > > On Wed, Nov 15, 2017 at 03:16:32PM +0800, Peter Xu wrote: > > > > On Tue, Nov 14, 2017 at 10:52:54PM +0100, Auger Eric wrote: > > > > [...] Sorry for the delayed reply, spent some time on reconsidering your comments. > > I'm ok with calling it a "PASID context". > > Thinking about this some more, here are some extra observations: > > * I think each device needs both a PASID context and an ordinary > address space. The PASID context would be used for bus > transactions which include a process id, the address space for > those that don't. > > * Theoretically, the PASID context could be modelled as an array/map > of AddressSpace objects for each process ID. However, creating all > those AddressSpace objects in advance might be too expensive. I > can see a couple of options to avoid this: > > 1) Have the PASID context class include a 'translate' method similar > to the one in IOMMUMemoryRegionClass, but taking a process ID as well > as an address. This would avoid creating extra AddressSpace objects, > but might require duplicating a bunch of the translation code that > already exists for AddressSpace. > > 2) "Lazily" create AddressSpace objects. The generic part of the > PASID aware DMA helper functions would use a cache of AddressSpace's > for particular process IDs, using the AddressSpace (and MemoryRegion > within) to translate accesses for a particular process ID. However, > these AddressSpace and MemoryRegion objects would only be created when > the device first accesses that address space. In the common case, > where a single device is just being used by a single process or a > small number, this should keep the number of AddressSpace objects > relatively small. Obviously the cache would need to be invalidated, > cleaning up the AddressSpace objects, when the PASID table is altered. Sorry, a double check here. Does "AddressSpace objects" mean the existing AddressSpace definition in Qemu? > > * I realize that the expected case here is with KVM, where the guest > controls the first level translation, but the host controls the > second level translation. However, we should also be able to model > the case where the guest controls both levels for the sake of full > system emulation. I think understanding this case will lead to a > better design even for the simpler case. > > Do you have a plan for what the virt-SVM aware DMA functions will look > like? The behaviour is device specific. For a SVM capable physcial device, it would store the pasid value in a register locates in the deivce. e.g. a GPU context can be set to use SVM, after the pasid is set, any DMA from this context is DMAs target to a process virtual address space. So for a virt-SVM aware DMA device, the device model needs to figure out the target address space. With the correct address space, then consume the translate() callback provided by iommu emulator. And then emulate the DMA operation for the emulated device. I'll try to get a new version with your suggestions. Thanks, Yi L
On Fri, Jan 12, 2018 at 06:25:34PM +0800, Liu, Yi L wrote: > On Wed, Dec 20, 2017 at 10:18:16PM +1100, David Gibson wrote: > > On Wed, Dec 20, 2017 at 02:47:30PM +0800, Liu, Yi L wrote: > > > On Mon, Dec 18, 2017 at 10:35:31PM +1100, David Gibson wrote: > > > > On Wed, Nov 15, 2017 at 03:16:32PM +0800, Peter Xu wrote: > > > > > On Tue, Nov 14, 2017 at 10:52:54PM +0100, Auger Eric wrote: > > > > > > > [...] > > Sorry for the delayed reply, spent some time on reconsidering your comments. > > > > > I'm ok with calling it a "PASID context". > > > > Thinking about this some more, here are some extra observations: > > > > * I think each device needs both a PASID context and an ordinary > > address space. The PASID context would be used for bus > > transactions which include a process id, the address space for > > those that don't. > > > > * Theoretically, the PASID context could be modelled as an array/map > > of AddressSpace objects for each process ID. However, creating all > > those AddressSpace objects in advance might be too expensive. I > > can see a couple of options to avoid this: > > > > 1) Have the PASID context class include a 'translate' method similar > > to the one in IOMMUMemoryRegionClass, but taking a process ID as well > > as an address. This would avoid creating extra AddressSpace objects, > > but might require duplicating a bunch of the translation code that > > already exists for AddressSpace. > > > > 2) "Lazily" create AddressSpace objects. The generic part of the > > PASID aware DMA helper functions would use a cache of AddressSpace's > > for particular process IDs, using the AddressSpace (and MemoryRegion > > within) to translate accesses for a particular process ID. However, > > these AddressSpace and MemoryRegion objects would only be created when > > the device first accesses that address space. In the common case, > > where a single device is just being used by a single process or a > > small number, this should keep the number of AddressSpace objects > > relatively small. Obviously the cache would need to be invalidated, > > cleaning up the AddressSpace objects, when the PASID table is altered. > > Sorry, a double check here. Does "AddressSpace objects" mean the existing > AddressSpace definition in Qemu? Yes. > > * I realize that the expected case here is with KVM, where the guest > > controls the first level translation, but the host controls the > > second level translation. However, we should also be able to model > > the case where the guest controls both levels for the sake of full > > system emulation. I think understanding this case will lead to a > > better design even for the simpler case. > > > > Do you have a plan for what the virt-SVM aware DMA functions will look > > like? > > The behaviour is device specific. > For a SVM capable physcial device, it would store the pasid value in a > register locates in the deivce. e.g. a GPU context can be set to use SVM, > after the pasid is set, any DMA from this context is DMAs target to a > process virtual address space. That doesn't sound any more device specific than any DMA operation, and we have helpers for that. > So for a virt-SVM aware DMA device, the device model needs to figure out > the target address space. With the correct address space, then consume > the translate() callback provided by iommu emulator. And then emulate the > DMA operation for the emulated device. Nearly all of that sounds like something that belongs in a helper function. 
Basically a variant of dma_memory_rw() (and related functions) that takes a PASID as well as an address.

> I'll try to get a new version with your suggestions.
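A possible shape for such a helper, under the same caveats as before: dma_memory_rw_pasid() and pasid_context_address_space() are made-up names, and the per-PASID AddressSpace lookup is just option (2) from earlier in the thread.

/* Hypothetical PASID-aware variant of dma_memory_rw(): resolve an
 * AddressSpace for (context, pasid), then reuse the normal DMA path. */
static inline int dma_memory_rw_pasid(PASIDContext *ctx, uint32_t pasid,
                                      dma_addr_t addr, void *buf,
                                      dma_addr_t len, DMADirection dir)
{
    /* Look up (or lazily create) the AddressSpace backing this PASID. */
    AddressSpace *as = pasid_context_address_space(ctx, pasid);

    if (!as) {
        return -1;  /* no process bound to this PASID */
    }
    return dma_memory_rw(as, addr, buf, len, dir);
}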
diff --git a/hw/core/Makefile.objs b/hw/core/Makefile.objs
index f8d7a4a..d688412 100644
--- a/hw/core/Makefile.objs
+++ b/hw/core/Makefile.objs
@@ -5,6 +5,7 @@ common-obj-y += fw-path-provider.o
 # irq.o needed for qdev GPIO handling:
 common-obj-y += irq.o
 common-obj-y += hotplug.o
+common-obj-y += iommu.o
 common-obj-y += nmi.o
 
 common-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o
diff --git a/hw/core/iommu.c b/hw/core/iommu.c
new file mode 100644
index 0000000..7c4fcfe
--- /dev/null
+++ b/hw/core/iommu.c
@@ -0,0 +1,58 @@
+/*
+ * QEMU emulation of IOMMU logic
+ *
+ * Copyright (C) 2017 Red Hat Inc.
+ *
+ * Authors: Peter Xu <peterx@redhat.com>,
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/core/iommu.h"
+
+void iommu_notifier_register(IOMMUObject *iommu,
+                             IOMMUNotifier *n,
+                             IOMMUNotifyFn fn,
+                             IOMMUEvent event)
+{
+    n->event = event;
+    n->iommu_notify = fn;
+    QLIST_INSERT_HEAD(&iommu->iommu_notifiers, n, node);
+    return;
+}
+
+void iommu_notifier_unregister(IOMMUObject *iommu,
+                               IOMMUNotifier *notifier)
+{
+    IOMMUNotifier *cur, *next;
+
+    QLIST_FOREACH_SAFE(cur, &iommu->iommu_notifiers, node, next) {
+        if (cur == notifier) {
+            QLIST_REMOVE(cur, node);
+            break;
+        }
+    }
+}
+
+void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data)
+{
+    IOMMUNotifier *cur;
+
+    QLIST_FOREACH(cur, &iommu->iommu_notifiers, node) {
+        if ((cur->event == event_data->event) && cur->iommu_notify) {
+            cur->iommu_notify(cur, event_data);
+        }
+    }
+}
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 03595e3..8350973 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -26,6 +26,7 @@
 #include "qom/object.h"
 #include "qemu/rcu.h"
 #include "hw/qdev-core.h"
+#include "hw/core/iommu.h"
 
 #define RAM_ADDR_INVALID (~(ram_addr_t)0)
 
@@ -301,6 +302,19 @@ struct MemoryListener {
 };
 
 /**
+ * AddressSpaceOps: callbacks structure for address space specific operations
+ *
+ * @iommu_get: returns an IOMMU object that backs the address space.
+ *             Normally this should be NULL for generic address
+ *             spaces, and it's only used when there is one
+ *             translation unit behind this address space.
+ */
+struct AddressSpaceOps {
+    IOMMUObject *(*iommu_get)(AddressSpace *as);
+};
+typedef struct AddressSpaceOps AddressSpaceOps;
+
+/**
  * AddressSpace: describes a mapping of addresses to #MemoryRegion objects
  */
 struct AddressSpace {
@@ -316,6 +330,7 @@ struct AddressSpace {
     struct MemoryRegionIoeventfd *ioeventfds;
     QTAILQ_HEAD(memory_listeners_as, MemoryListener) listeners;
     QTAILQ_ENTRY(AddressSpace) address_spaces_link;
+    AddressSpaceOps as_ops;
 };
 
 FlatView *address_space_to_flatview(AddressSpace *as);
@@ -1988,6 +2003,13 @@ address_space_write_cached(MemoryRegionCache *cache, hwaddr addr,
     address_space_write(cache->as, cache->xlat + addr, MEMTXATTRS_UNSPECIFIED,
                         buf, len);
 }
 
+/**
+ * address_space_iommu_get: Get the backend IOMMU for the address space
+ *
+ * @as: the address space to fetch IOMMU from
+ */
+IOMMUObject *address_space_iommu_get(AddressSpace *as);
+
 #endif
 
 #endif
diff --git a/include/hw/core/iommu.h b/include/hw/core/iommu.h
new file mode 100644
index 0000000..34387c0
--- /dev/null
+++ b/include/hw/core/iommu.h
@@ -0,0 +1,73 @@
+/*
+ * QEMU emulation of IOMMU logic
+ *
+ * Copyright (C) 2017 Red Hat Inc.
+ *
+ * Authors: Peter Xu <peterx@redhat.com>,
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HW_CORE_IOMMU_H
+#define HW_CORE_IOMMU_H
+
+#include "qemu/queue.h"
+
+enum IOMMUEvent {
+    IOMMU_EVENT_BIND_PASIDT,
+};
+typedef enum IOMMUEvent IOMMUEvent;
+
+struct IOMMUEventData {
+    IOMMUEvent event;
+    uint64_t length;
+    void *data;
+};
+typedef struct IOMMUEventData IOMMUEventData;
+
+typedef struct IOMMUNotifier IOMMUNotifier;
+
+typedef void (*IOMMUNotifyFn)(IOMMUNotifier *notifier,
+                              IOMMUEventData *event_data);
+
+struct IOMMUNotifier {
+    IOMMUNotifyFn iommu_notify;
+    /*
+     * What events we are listening to. Let's allow multiple event
+     * registrations from beginning.
+     */
+    IOMMUEvent event;
+    QLIST_ENTRY(IOMMUNotifier) node;
+};
+
+typedef struct IOMMUObject IOMMUObject;
+
+/*
+ * This stands for an IOMMU unit. Any translation device should have
+ * this struct inside its own structure to make sure it can leverage
+ * common IOMMU functionalities.
+ */
+struct IOMMUObject {
+    QLIST_HEAD(, IOMMUNotifier) iommu_notifiers;
+};
+
+void iommu_notifier_register(IOMMUObject *iommu,
+                             IOMMUNotifier *n,
+                             IOMMUNotifyFn fn,
+                             IOMMUEvent event);
+void iommu_notifier_unregister(IOMMUObject *iommu,
+                               IOMMUNotifier *notifier);
+void iommu_notify(IOMMUObject *iommu, IOMMUEventData *event_data);
+
+#endif
diff --git a/memory.c b/memory.c
index 77fb3ef..307f665 100644
--- a/memory.c
+++ b/memory.c
@@ -235,8 +235,6 @@ struct FlatView {
     MemoryRegion *root;
 };
 
-typedef struct AddressSpaceOps AddressSpaceOps;
-
 #define FOR_EACH_FLAT_RANGE(var, view)     \
     for (var = (view)->ranges; var < (view)->ranges + (view)->nr; ++var)
 
@@ -2793,6 +2791,14 @@ static void do_address_space_destroy(AddressSpace *as)
     memory_region_unref(as->root);
 }
 
+IOMMUObject *address_space_iommu_get(AddressSpace *as)
+{
+    if (!as->as_ops.iommu_get) {
+        return NULL;
+    }
+    return as->as_ops.iommu_get(as);
+}
+
 void address_space_destroy(AddressSpace *as)
 {
     MemoryRegion *root = as->root;
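For reference, a short usage sketch of the notifier interface the patch adds: a vIOMMU that traps a guest PASID-table bind could publish it through iommu_notify(), and a consumer (for example the VFIO layer) would register for the event. Only iommu_notifier_register(), iommu_notify(), IOMMUEventData and IOMMU_EVENT_BIND_PASIDT come from the patch itself; the PasidTableBindData structure and its fields are illustrative.

/* Illustrative payload for IOMMU_EVENT_BIND_PASIDT; the real layout
 * would be defined by the virt-SVM series, not by this patch. */
typedef struct PasidTableBindData {
    hwaddr pasidt_gpa;      /* guest-physical base of the PASID table */
    uint32_t entries;       /* table size, illustrative field */
} PasidTableBindData;

static void viommu_report_pasidt_bind(IOMMUObject *iommu,
                                      hwaddr gpa, uint32_t entries)
{
    PasidTableBindData bind = { .pasidt_gpa = gpa, .entries = entries };
    IOMMUEventData event_data = {
        .event  = IOMMU_EVENT_BIND_PASIDT,
        .length = sizeof(bind),
        .data   = &bind,
    };

    /* Walks iommu->iommu_notifiers and calls back everyone who
     * registered for IOMMU_EVENT_BIND_PASIDT. */
    iommu_notify(iommu, &event_data);
}

static void consumer_listen_for_bind(IOMMUObject *iommu, IOMMUNotifier *n,
                                     IOMMUNotifyFn cb)
{
    iommu_notifier_register(iommu, n, cb, IOMMU_EVENT_BIND_PASIDT);
}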