
[QEMU,v3,5/7] memory: add MemoryRegion map and unmap callbacks

Message ID 20240227223501.28475-6-vikram.garhwal@amd.com (mailing list archive)
State New, archived
Series Xen: support grant mappings.

Commit Message

Vikram Garhwal Feb. 27, 2024, 10:34 p.m. UTC
From: Juergen Gross <jgross@suse.com>

In order to support mapping and unmapping guest memory dynamically to
and from qemu during address_space_[un]map() operations, add the map()
and unmap() callbacks to MemoryRegionOps.

Those will be used e.g. for Xen grant mappings when performing guest
I/Os.

Signed-off-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Vikram Garhwal <vikram.garhwal@amd.com>
---
 include/exec/memory.h | 21 ++++++++++++++++++
 system/physmem.c      | 50 +++++++++++++++++++++++++++++++++----------
 2 files changed, 60 insertions(+), 11 deletions(-)
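
[Illustration: a backend opting into the new hooks points its MemoryRegionOps at
functions with the signatures added by this patch. The sketch below only shows
the shape; example_grant_map/unmap and backend_map_pages/backend_unmap_pages are
hypothetical names, and the real Xen grant wiring is introduced elsewhere in
this series.]

static void *example_grant_map(MemoryRegion **mr, hwaddr addr, hwaddr *plen,
                               bool is_write, MemTxAttrs attrs)
{
    /* Map the granted guest pages behind @addr and return a host pointer.
     * *plen may be reduced if only part of the range could be mapped. */
    return backend_map_pages(addr, plen, is_write);      /* hypothetical */
}

static void example_grant_unmap(MemoryRegion *mr, void *buffer,
                                ram_addr_t addr, hwaddr len,
                                bool is_write, hwaddr access_len)
{
    /* Tear down a mapping previously created by example_grant_map(). */
    backend_unmap_pages(buffer, len);                    /* hypothetical */
}

static const MemoryRegionOps example_grant_ops = {
    .map        = example_grant_map,
    .unmap      = example_grant_unmap,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

With such ops in place, address_space_map()/address_space_unmap() below prefer
the callbacks over qemu_ram_ptr_length() and the Xen map-cache path whenever
they are set.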

Comments

Stefano Stabellini Feb. 29, 2024, 11:10 p.m. UTC | #1
On Tue, 27 Feb 2024, Vikram Garhwal wrote:
> From: Juergen Gross <jgross@suse.com>
> 
> In order to support mapping and unmapping guest memory dynamically to
> and from qemu during address_space_[un]map() operations add the map()
> and unmap() callbacks to MemoryRegionOps.
> 
> Those will be used e.g. for Xen grant mappings when performing guest
> I/Os.
> 
> Signed-off-by: Juergen Gross <jgross@suse.com>
> Signed-off-by: Vikram Garhwal <vikram.garhwal@amd.com>

Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>


> ---
>  include/exec/memory.h | 21 ++++++++++++++++++
>  system/physmem.c      | 50 +++++++++++++++++++++++++++++++++----------
>  2 files changed, 60 insertions(+), 11 deletions(-)
> 
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index 8626a355b3..9f7dfe59c7 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -282,6 +282,27 @@ struct MemoryRegionOps {
>                                      unsigned size,
>                                      MemTxAttrs attrs);
>  
> +    /*
> +     * Dynamically create mapping. @addr is the guest address to map; @plen
> +     * is the pointer to the usable length of the buffer.
> +     * @mr contents can be changed in case a new memory region is created for
> +     * the mapping.
> +     * Returns the buffer address for accessing the data.
> +     */
> +    void *(*map)(MemoryRegion **mr,
> +                 hwaddr addr,
> +                 hwaddr *plen,
> +                 bool is_write,
> +                 MemTxAttrs attrs);
> +
> +    /* Unmap an area obtained via map() before. */
> +    void (*unmap)(MemoryRegion *mr,
> +                  void *buffer,
> +                  ram_addr_t addr,
> +                  hwaddr len,
> +                  bool is_write,
> +                  hwaddr access_len);
> +
>      enum device_endian endianness;
>      /* Guest-visible constraints: */
>      struct {
> diff --git a/system/physmem.c b/system/physmem.c
> index 949dcb20ba..d989e9fc1f 100644
> --- a/system/physmem.c
> +++ b/system/physmem.c
> @@ -3141,6 +3141,7 @@ void *address_space_map(AddressSpace *as,
>      hwaddr len = *plen;
>      hwaddr l, xlat;
>      MemoryRegion *mr;
> +    void *ptr = NULL;
>      FlatView *fv;
>  
>      if (len == 0) {
> @@ -3174,12 +3175,20 @@ void *address_space_map(AddressSpace *as,
>          return bounce.buffer;
>      }
>  
> -
>      memory_region_ref(mr);
> +
> +    if (mr->ops && mr->ops->map) {
> +        ptr = mr->ops->map(&mr, addr, plen, is_write, attrs);
> +    }
> +
>      *plen = flatview_extend_translation(fv, addr, len, mr, xlat,
>                                          l, is_write, attrs);
>      fuzz_dma_read_cb(addr, *plen, mr);
> -    return qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);
> +    if (ptr == NULL) {
> +        ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);
> +    }
> +
> +    return ptr;
>  }
>  
>  /* Unmaps a memory region previously mapped by address_space_map().
> @@ -3195,11 +3204,16 @@ void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
>  
>          mr = memory_region_from_host(buffer, &addr1);
>          assert(mr != NULL);
> -        if (is_write) {
> -            invalidate_and_set_dirty(mr, addr1, access_len);
> -        }
> -        if (xen_enabled()) {
> -            xen_invalidate_map_cache_entry(buffer);
> +
> +        if (mr->ops && mr->ops->unmap) {
> +            mr->ops->unmap(mr, buffer, addr1, len, is_write, access_len);
> +        } else {
> +            if (is_write) {
> +                invalidate_and_set_dirty(mr, addr1, access_len);
> +            }
> +            if (xen_enabled()) {
> +                xen_invalidate_map_cache_entry(buffer);
> +            }
>          }
>          memory_region_unref(mr);
>          return;
> @@ -3272,10 +3286,18 @@ int64_t address_space_cache_init(MemoryRegionCache *cache,
>           * doing this if we found actual RAM, which behaves the same
>           * regardless of attributes; so UNSPECIFIED is fine.
>           */
> +        if (mr->ops && mr->ops->map) {
> +            cache->ptr = mr->ops->map(&mr, addr, &l, is_write,
> +                                      MEMTXATTRS_UNSPECIFIED);
> +        }
> +
>          l = flatview_extend_translation(cache->fv, addr, len, mr,
>                                          cache->xlat, l, is_write,
>                                          MEMTXATTRS_UNSPECIFIED);
> -        cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat, &l, true);
> +        if (!cache->ptr) {
> +            cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat, &l,
> +                                             true);
> +        }
>      } else {
>          cache->ptr = NULL;
>      }
> @@ -3297,14 +3319,20 @@ void address_space_cache_invalidate(MemoryRegionCache *cache,
>  
>  void address_space_cache_destroy(MemoryRegionCache *cache)
>  {
> -    if (!cache->mrs.mr) {
> +    MemoryRegion *mr = cache->mrs.mr;
> +
> +    if (!mr) {
>          return;
>      }
>  
> -    if (xen_enabled()) {
> +    if (mr->ops && mr->ops->unmap) {
> +            mr->ops->unmap(mr, cache->ptr, cache->xlat, cache->len,
> +                           cache->is_write, cache->len);
> +    } else if (xen_enabled()) {
>          xen_invalidate_map_cache_entry(cache->ptr);
>      }
> -    memory_region_unref(cache->mrs.mr);
> +
> +    memory_region_unref(mr);
>      flatview_unref(cache->fv);
>      cache->mrs.mr = NULL;
>      cache->fv = NULL;
> -- 
> 2.17.1
>
Edgar E. Iglesias April 10, 2024, 11:16 a.m. UTC | #2
On Fri, Mar 1, 2024 at 12:11 AM Stefano Stabellini <sstabellini@kernel.org>
wrote:

> On Tue, 27 Feb 2024, Vikram Garhwal wrote:
> > From: Juergen Gross <jgross@suse.com>
> >
> > In order to support mapping and unmapping guest memory dynamically to
> > and from qemu during address_space_[un]map() operations add the map()
> > and unmap() callbacks to MemoryRegionOps.
> >
> > Those will be used e.g. for Xen grant mappings when performing guest
> > I/Os.
> >
> > Signed-off-by: Juergen Gross <jgross@suse.com>
> > Signed-off-by: Vikram Garhwal <vikram.garhwal@amd.com>
>
> Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
>

Reviewed-by: Edgar E. Iglesias <edgar.iglesias@amd.com>



> ---
> >  include/exec/memory.h | 21 ++++++++++++++++++
> >  system/physmem.c      | 50 +++++++++++++++++++++++++++++++++----------
> >  2 files changed, 60 insertions(+), 11 deletions(-)
> >
> > diff --git a/include/exec/memory.h b/include/exec/memory.h
> > index 8626a355b3..9f7dfe59c7 100644
> > --- a/include/exec/memory.h
> > +++ b/include/exec/memory.h
> > @@ -282,6 +282,27 @@ struct MemoryRegionOps {
> >                                      unsigned size,
> >                                      MemTxAttrs attrs);
> >
> > +    /*
> > +     * Dynamically create mapping. @addr is the guest address to map;
> @plen
> > +     * is the pointer to the usable length of the buffer.
> > +     * @mr contents can be changed in case a new memory region is
> created for
> > +     * the mapping.
> > +     * Returns the buffer address for accessing the data.
> > +     */
> > +    void *(*map)(MemoryRegion **mr,
> > +                 hwaddr addr,
> > +                 hwaddr *plen,
> > +                 bool is_write,
> > +                 MemTxAttrs attrs);
> > +
> > +    /* Unmap an area obtained via map() before. */
> > +    void (*unmap)(MemoryRegion *mr,
> > +                  void *buffer,
> > +                  ram_addr_t addr,
> > +                  hwaddr len,
> > +                  bool is_write,
> > +                  hwaddr access_len);
> > +
> >      enum device_endian endianness;
> >      /* Guest-visible constraints: */
> >      struct {
> > diff --git a/system/physmem.c b/system/physmem.c
> > index 949dcb20ba..d989e9fc1f 100644
> > --- a/system/physmem.c
> > +++ b/system/physmem.c
> > @@ -3141,6 +3141,7 @@ void *address_space_map(AddressSpace *as,
> >      hwaddr len = *plen;
> >      hwaddr l, xlat;
> >      MemoryRegion *mr;
> > +    void *ptr = NULL;
> >      FlatView *fv;
> >
> >      if (len == 0) {
> > @@ -3174,12 +3175,20 @@ void *address_space_map(AddressSpace *as,
> >          return bounce.buffer;
> >      }
> >
> > -
> >      memory_region_ref(mr);
> > +
> > +    if (mr->ops && mr->ops->map) {
> > +        ptr = mr->ops->map(&mr, addr, plen, is_write, attrs);
> > +    }
> > +
> >      *plen = flatview_extend_translation(fv, addr, len, mr, xlat,
> >                                          l, is_write, attrs);
> >      fuzz_dma_read_cb(addr, *plen, mr);
> > -    return qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);
> > +    if (ptr == NULL) {
> > +        ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);
> > +    }
> > +
> > +    return ptr;
> >  }
> >
> >  /* Unmaps a memory region previously mapped by address_space_map().
> > @@ -3195,11 +3204,16 @@ void address_space_unmap(AddressSpace *as, void
> *buffer, hwaddr len,
> >
> >          mr = memory_region_from_host(buffer, &addr1);
> >          assert(mr != NULL);
> > -        if (is_write) {
> > -            invalidate_and_set_dirty(mr, addr1, access_len);
> > -        }
> > -        if (xen_enabled()) {
> > -            xen_invalidate_map_cache_entry(buffer);
> > +
> > +        if (mr->ops && mr->ops->unmap) {
> > +            mr->ops->unmap(mr, buffer, addr1, len, is_write,
> access_len);
> > +        } else {
> > +            if (is_write) {
> > +                invalidate_and_set_dirty(mr, addr1, access_len);
> > +            }
> > +            if (xen_enabled()) {
> > +                xen_invalidate_map_cache_entry(buffer);
> > +            }
> >          }
> >          memory_region_unref(mr);
> >          return;
> > @@ -3272,10 +3286,18 @@ int64_t
> address_space_cache_init(MemoryRegionCache *cache,
> >           * doing this if we found actual RAM, which behaves the same
> >           * regardless of attributes; so UNSPECIFIED is fine.
> >           */
> > +        if (mr->ops && mr->ops->map) {
> > +            cache->ptr = mr->ops->map(&mr, addr, &l, is_write,
> > +                                      MEMTXATTRS_UNSPECIFIED);
> > +        }
> > +
> >          l = flatview_extend_translation(cache->fv, addr, len, mr,
> >                                          cache->xlat, l, is_write,
> >                                          MEMTXATTRS_UNSPECIFIED);
> > -        cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat,
> &l, true);
> > +        if (!cache->ptr) {
> > +            cache->ptr = qemu_ram_ptr_length(mr->ram_block,
> cache->xlat, &l,
> > +                                             true);
> > +        }
> >      } else {
> >          cache->ptr = NULL;
> >      }
> > @@ -3297,14 +3319,20 @@ void
> address_space_cache_invalidate(MemoryRegionCache *cache,
> >
> >  void address_space_cache_destroy(MemoryRegionCache *cache)
> >  {
> > -    if (!cache->mrs.mr) {
> > +    MemoryRegion *mr = cache->mrs.mr;
> > +
> > +    if (!mr) {
> >          return;
> >      }
> >
> > -    if (xen_enabled()) {
> > +    if (mr->ops && mr->ops->unmap) {
> > +            mr->ops->unmap(mr, cache->ptr, cache->xlat, cache->len,
> > +                           cache->is_write, cache->len);
> > +    } else if (xen_enabled()) {
> >          xen_invalidate_map_cache_entry(cache->ptr);
> >      }
> > -    memory_region_unref(cache->mrs.mr);
> > +
> > +    memory_region_unref(mr);
> >      flatview_unref(cache->fv);
> >      cache->mrs.mr = NULL;
> >      cache->fv = NULL;
> > --
> > 2.17.1
> >
>
>
Edgar E. Iglesias April 10, 2024, 4:44 p.m. UTC | #3
On Tue, Feb 27, 2024 at 11:37 PM Vikram Garhwal <vikram.garhwal@amd.com>
wrote:

> From: Juergen Gross <jgross@suse.com>
>
> In order to support mapping and unmapping guest memory dynamically to
> and from qemu during address_space_[un]map() operations add the map()
> and unmap() callbacks to MemoryRegionOps.
>
> Those will be used e.g. for Xen grant mappings when performing guest
> I/Os.
>
> Signed-off-by: Juergen Gross <jgross@suse.com>
> Signed-off-by: Vikram Garhwal <vikram.garhwal@amd.com>
>


Paolo, Peter, David, Philippe, do you guys have any concerns with this patch?

Cheers,
Edgar



> ---
>  include/exec/memory.h | 21 ++++++++++++++++++
>  system/physmem.c      | 50 +++++++++++++++++++++++++++++++++----------
>  2 files changed, 60 insertions(+), 11 deletions(-)
>
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index 8626a355b3..9f7dfe59c7 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -282,6 +282,27 @@ struct MemoryRegionOps {
>                                      unsigned size,
>                                      MemTxAttrs attrs);
>
> +    /*
> +     * Dynamically create mapping. @addr is the guest address to map;
> @plen
> +     * is the pointer to the usable length of the buffer.
> +     * @mr contents can be changed in case a new memory region is created
> for
> +     * the mapping.
> +     * Returns the buffer address for accessing the data.
> +     */
> +    void *(*map)(MemoryRegion **mr,
> +                 hwaddr addr,
> +                 hwaddr *plen,
> +                 bool is_write,
> +                 MemTxAttrs attrs);
> +
> +    /* Unmap an area obtained via map() before. */
> +    void (*unmap)(MemoryRegion *mr,
> +                  void *buffer,
> +                  ram_addr_t addr,
> +                  hwaddr len,
> +                  bool is_write,
> +                  hwaddr access_len);
> +
>      enum device_endian endianness;
>      /* Guest-visible constraints: */
>      struct {
> diff --git a/system/physmem.c b/system/physmem.c
> index 949dcb20ba..d989e9fc1f 100644
> --- a/system/physmem.c
> +++ b/system/physmem.c
> @@ -3141,6 +3141,7 @@ void *address_space_map(AddressSpace *as,
>      hwaddr len = *plen;
>      hwaddr l, xlat;
>      MemoryRegion *mr;
> +    void *ptr = NULL;
>      FlatView *fv;
>
>      if (len == 0) {
> @@ -3174,12 +3175,20 @@ void *address_space_map(AddressSpace *as,
>          return bounce.buffer;
>      }
>
> -
>      memory_region_ref(mr);
> +
> +    if (mr->ops && mr->ops->map) {
> +        ptr = mr->ops->map(&mr, addr, plen, is_write, attrs);
> +    }
> +
>      *plen = flatview_extend_translation(fv, addr, len, mr, xlat,
>                                          l, is_write, attrs);
>      fuzz_dma_read_cb(addr, *plen, mr);
> -    return qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);
> +    if (ptr == NULL) {
> +        ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);
> +    }
> +
> +    return ptr;
>  }
>
>  /* Unmaps a memory region previously mapped by address_space_map().
> @@ -3195,11 +3204,16 @@ void address_space_unmap(AddressSpace *as, void
> *buffer, hwaddr len,
>
>          mr = memory_region_from_host(buffer, &addr1);
>          assert(mr != NULL);
> -        if (is_write) {
> -            invalidate_and_set_dirty(mr, addr1, access_len);
> -        }
> -        if (xen_enabled()) {
> -            xen_invalidate_map_cache_entry(buffer);
> +
> +        if (mr->ops && mr->ops->unmap) {
> +            mr->ops->unmap(mr, buffer, addr1, len, is_write, access_len);
> +        } else {
> +            if (is_write) {
> +                invalidate_and_set_dirty(mr, addr1, access_len);
> +            }
> +            if (xen_enabled()) {
> +                xen_invalidate_map_cache_entry(buffer);
> +            }
>          }
>          memory_region_unref(mr);
>          return;
> @@ -3272,10 +3286,18 @@ int64_t address_space_cache_init(MemoryRegionCache
> *cache,
>           * doing this if we found actual RAM, which behaves the same
>           * regardless of attributes; so UNSPECIFIED is fine.
>           */
> +        if (mr->ops && mr->ops->map) {
> +            cache->ptr = mr->ops->map(&mr, addr, &l, is_write,
> +                                      MEMTXATTRS_UNSPECIFIED);
> +        }
> +
>          l = flatview_extend_translation(cache->fv, addr, len, mr,
>                                          cache->xlat, l, is_write,
>                                          MEMTXATTRS_UNSPECIFIED);
> -        cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat, &l,
> true);
> +        if (!cache->ptr) {
> +            cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat,
> &l,
> +                                             true);
> +        }
>      } else {
>          cache->ptr = NULL;
>      }
> @@ -3297,14 +3319,20 @@ void
> address_space_cache_invalidate(MemoryRegionCache *cache,
>
>  void address_space_cache_destroy(MemoryRegionCache *cache)
>  {
> -    if (!cache->mrs.mr) {
> +    MemoryRegion *mr = cache->mrs.mr;
> +
> +    if (!mr) {
>          return;
>      }
>
> -    if (xen_enabled()) {
> +    if (mr->ops && mr->ops->unmap) {
> +            mr->ops->unmap(mr, cache->ptr, cache->xlat, cache->len,
> +                           cache->is_write, cache->len);
> +    } else if (xen_enabled()) {
>          xen_invalidate_map_cache_entry(cache->ptr);
>      }
> -    memory_region_unref(cache->mrs.mr);
> +
> +    memory_region_unref(mr);
>      flatview_unref(cache->fv);
>      cache->mrs.mr = NULL;
>      cache->fv = NULL;
> --
> 2.17.1
>
>
>
Peter Xu April 10, 2024, 6:56 p.m. UTC | #4
On Wed, Apr 10, 2024 at 06:44:38PM +0200, Edgar E. Iglesias wrote:
> On Tue, Feb 27, 2024 at 11:37 PM Vikram Garhwal <vikram.garhwal@amd.com>
> wrote:
> 
> > From: Juergen Gross <jgross@suse.com>
> >
> > In order to support mapping and unmapping guest memory dynamically to
> > and from qemu during address_space_[un]map() operations add the map()
> > and unmap() callbacks to MemoryRegionOps.
> >
> > Those will be used e.g. for Xen grant mappings when performing guest
> > I/Os.
> >
> > Signed-off-by: Juergen Gross <jgross@suse.com>
> > Signed-off-by: Vikram Garhwal <vikram.garhwal@amd.com>
> >
> 
> 
> Paolo, Peter, David, Phiippe, do you guys have any concerns with this patch?

This introduces a 3rd memory type afaict, neither direct nor !direct.

What happens if someone does address_space_write() to it?  I didn't see it
covered here.

OTOH, the cover letter didn't mention too much either on the big picture:

https://lore.kernel.org/all/20240227223501.28475-1-vikram.garhwal@amd.com/

I want to get a quick grasp on whether it's justified and worthwhile to
introduce this complexity into the QEMU memory core.

Could I request a better cover letter when reposting?  It'll be great to
mention things like:

  - what is grant mapping, why it needs to be used, when it can be used (is
    it only relevant to vIOMMU=on)?  Some more information on the high
    level design using this type of MR would be great.

  - why a 3rd memory type is required?  Do we have other alternatives?

    So it's all based on my very limited understanding of reading this:
    https://xenbits.xenproject.org/docs/4.3-testing/misc/grant-tables.txt

    If it's about cross-VM memory sharing, does it mean that in reality
    there is RAM allocated, but it's only about permission management?
    In that case, is there any option to implement it with direct access
    mode (however with some extra dynamic permissions applied on top using
    some mechanism)?

  - perhaps some old links would be great too so people can read about the
    context when it's not obvious, without a need to copy-paste.

Thanks,
Edgar E. Iglesias April 16, 2024, 11:32 a.m. UTC | #5
On Wed, Apr 10, 2024 at 8:56 PM Peter Xu <peterx@redhat.com> wrote:
>
> On Wed, Apr 10, 2024 at 06:44:38PM +0200, Edgar E. Iglesias wrote:
> > On Tue, Feb 27, 2024 at 11:37 PM Vikram Garhwal <vikram.garhwal@amd.com>
> > wrote:
> >
> > > From: Juergen Gross <jgross@suse.com>
> > >
> > > In order to support mapping and unmapping guest memory dynamically to
> > > and from qemu during address_space_[un]map() operations add the map()
> > > and unmap() callbacks to MemoryRegionOps.
> > >
> > > Those will be used e.g. for Xen grant mappings when performing guest
> > > I/Os.
> > >
> > > Signed-off-by: Juergen Gross <jgross@suse.com>
> > > Signed-off-by: Vikram Garhwal <vikram.garhwal@amd.com>
> > >
> >
> >
> > Paolo, Peter, David, Phiippe, do you guys have any concerns with this patch?
>

Thanks for your comments Peter,


> This introduces a 3rd memory type afaict, neither direct nor !direct.
>
> What happens if someone does address_space_write() to it?  I didn't see it
> covered here..

You're right, that won't work; the memory needs to be mapped before it
can be used.
At minimum there should be some graceful failure; right now this will crash.

>
> OTOH, the cover letter didn't mention too much either on the big picture:
>
> https://lore.kernel.org/all/20240227223501.28475-1-vikram.garhwal@amd.com/
>
> I want to have a quick grasp on whether it's justified worthwhile we should
> introduce this complexity to qemu memory core.
>
> Could I request a better cover letter when repost?  It'll be great to
> mention things like:

I'll do that, but also answer inline in the meantime since we should
perhaps change the approach.

>
>   - what is grant mapping, why it needs to be used, when it can be used (is
>     it only relevant to vIOMMU=on)?  Some more information on the high
>     level design using this type or MR would be great.

https://github.com/xen-project/xen/blob/master/docs/misc/grant-tables.txt

Xen VMs that use QEMU's VirtIO have a QEMU instance running in a separate VM.

There are basically two mechanisms for QEMU's VirtIO backends to access
the guest's RAM.
1. Foreign mappings. This gives the VM running QEMU access to the
entire RAM of the guest VM.
2. Grant mappings. This allows the guest to dynamically grant and
remove access to pages as needed.
So the VM running QEMU cannot map guest RAM unless it's been
instructed to do so by the guest.

#2 is desirable because if QEMU gets compromised it has a smaller
attack surface onto the guest.

>
>   - why a 3rd memory type is required?  Do we have other alternatives?

Yes, there are alternatives.

1. It was suggested by Stefano to try to handle this in existing qemu/hw/xen/*.
This would be less intrusive but perhaps also less explicit.
Concerns about touching the Memory API have been raised before, so
perhaps we should try this.
I'm a little unsure how we would deal with unmapping when the guest
removes our grants and we're using models that don't map but use
address_space_read/write().

2. Another approach could be to change the Xen grant-iommu in the
Linux kernel to work with a grant vIOMMU in QEMU.
Linux could explicitly ask QEMU's grant vIOMMU to map/unmap granted regions.
This would have the benefit that we wouldn't need to allocate
address-bit 63 for grants.
A drawback is that it may be slower since we're going to need to
bounce between guest/qemu a bit more.


>     So it's all based on my very limited understanding of reading this:
>     https://xenbits.xenproject.org/docs/4.3-testing/misc/grant-tables.txt
>
>     If it's about cross-vm sharing memory, does it mean that in reality
>     there are RAMs allocated, but it's only about permission management?
>     In that case, is there any option we implement it with direct access
>     mode (however with some extra dynamic permissions applied on top using
>     some mechanism)?

Yes, I think the grant vIOMMU approach would fall into this category.

>
>   - perhaps sold old links would be great too so people can read about the
>     context when it's not obvious, without a need to copy-paste.

https://wiki.xenproject.org/wiki/Virtio_On_Xen
https://lwn.net/Articles/896938/

Cheers,
Edgar
Jürgen Groß April 16, 2024, 1:28 p.m. UTC | #6
On 16.04.24 13:32, Edgar E. Iglesias wrote:
> On Wed, Apr 10, 2024 at 8:56 PM Peter Xu <peterx@redhat.com> wrote:
>>
>> On Wed, Apr 10, 2024 at 06:44:38PM +0200, Edgar E. Iglesias wrote:
>>> On Tue, Feb 27, 2024 at 11:37 PM Vikram Garhwal <vikram.garhwal@amd.com>
>>> wrote:
>>>
>>>> From: Juergen Gross <jgross@suse.com>
>>>>
>>>> In order to support mapping and unmapping guest memory dynamically to
>>>> and from qemu during address_space_[un]map() operations add the map()
>>>> and unmap() callbacks to MemoryRegionOps.
>>>>
>>>> Those will be used e.g. for Xen grant mappings when performing guest
>>>> I/Os.
>>>>
>>>> Signed-off-by: Juergen Gross <jgross@suse.com>
>>>> Signed-off-by: Vikram Garhwal <vikram.garhwal@amd.com>
>>>>
>>>
>>>
>>> Paolo, Peter, David, Phiippe, do you guys have any concerns with this patch?
>>
> 
> Thanks for your comments Peter,
> 
> 
>> This introduces a 3rd memory type afaict, neither direct nor !direct.
>>
>> What happens if someone does address_space_write() to it?  I didn't see it
>> covered here..
> 
> You're right, that won't work, the memory needs to be mapped before it
> can be used.
> At minimum there should be some graceful failure, right now this will crash.
> 
>>
>> OTOH, the cover letter didn't mention too much either on the big picture:
>>
>> https://lore.kernel.org/all/20240227223501.28475-1-vikram.garhwal@amd.com/
>>
>> I want to have a quick grasp on whether it's justified worthwhile we should
>> introduce this complexity to qemu memory core.
>>
>> Could I request a better cover letter when repost?  It'll be great to
>> mention things like:
> 
> I'll do that, but also answer inline in the meantime since we should
> perhaps change the approach.
> 
>>
>>    - what is grant mapping, why it needs to be used, when it can be used (is
>>      it only relevant to vIOMMU=on)?  Some more information on the high
>>      level design using this type or MR would be great.
> 
> https://github.com/xen-project/xen/blob/master/docs/misc/grant-tables.txt
> 
> Xen VM's that use QEMU's VirtIO have a QEMU instance running in a separate VM.
> 
> There's basically two mechanisms for QEMU's Virtio backends to access
> the guest's RAM.
> 1. Foreign mappings. This gives the VM running QEMU access to the
> entire RAM of the guest VM.

Additionally, it requires qemu to run in dom0, while in general Xen allows
backends to run in less privileged "driver domains", which are usually not
allowed to perform foreign mappings.

> 2. Grant mappings. This allows the guest to dynamically grant and
> remove access to pages as needed.
> So the VM running QEMU, cannot map guest RAM unless it's been
> instructed to do so by the guest.
> 
> #2 is desirable because if QEMU gets compromised it has a smaller
> attack surface onto the guest.

And it allows running the virtio backend in a less privileged VM.

> 
>>
>>    - why a 3rd memory type is required?  Do we have other alternatives?
> 
> Yes, there are alternatives.
> 
> 1. It was suggested by Stefano to try to handle this in existing qemu/hw/xen/*.
> This would be less intrusive but perhaps also less explicit.
> Concerns about touching the Memory API have been raised before, so
> perhaps we should try this.
> I'm a little unsure how we would deal with unmapping when the guest
> removes our grants and we're using models that don't map but use
> address_space_read/write().

Those would either need to use grant-copy hypercalls, or they'd need to map,
read/write, unmap.
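
[Illustration: that map, read/write, unmap round trip with the existing API
would look roughly like the sketch below, assuming an AddressSpace *as, a guest
address gpa, a local buffer buf of len bytes and a direction flag is_write.]

    hwaddr plen = len;
    void *p = address_space_map(as, gpa, &plen, is_write,
                                MEMTXATTRS_UNSPECIFIED);
    if (p) {
        if (is_write) {
            memcpy(p, buf, plen);    /* data flows into guest memory */
        } else {
            memcpy(buf, p, plen);    /* data flows out of guest memory */
        }
        address_space_unmap(as, p, plen, is_write, plen);
    }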

> 
> 2. Another approach could be to change the Xen grant-iommu in the
> Linux kernel to work with a grant vIOMMU in QEMU.
> Linux could explicitly ask QEMU's grant vIOMMU to map/unmap granted regions.
> This would have the benefit that we wouldn't need to allocate
> address-bit 63 for grants.
> A drawback is that it may be slower since we're going to need to
> bounce between guest/qemu a bit more.

It would be a _lot_ slower, unless virtio-iommu and grants are both modified
to match. I have looked into that, but the needed effort is rather large. At
the last Xen summit I suggested introducing a new grant format which
would work more like a normal page table structure. Using the same format
for virtio-iommu would make it possible to avoid the additional bounces between
qemu and the guest (and in fact that was one of the motivations to suggest the
new grant format).


Juergen
Peter Xu April 16, 2024, 3:55 p.m. UTC | #7
On Tue, Apr 16, 2024 at 03:28:41PM +0200, Jürgen Groß wrote:
> On 16.04.24 13:32, Edgar E. Iglesias wrote:
> > On Wed, Apr 10, 2024 at 8:56 PM Peter Xu <peterx@redhat.com> wrote:
> > > 
> > > On Wed, Apr 10, 2024 at 06:44:38PM +0200, Edgar E. Iglesias wrote:
> > > > On Tue, Feb 27, 2024 at 11:37 PM Vikram Garhwal <vikram.garhwal@amd.com>
> > > > wrote:
> > > > 
> > > > > From: Juergen Gross <jgross@suse.com>
> > > > > 
> > > > > In order to support mapping and unmapping guest memory dynamically to
> > > > > and from qemu during address_space_[un]map() operations add the map()
> > > > > and unmap() callbacks to MemoryRegionOps.
> > > > > 
> > > > > Those will be used e.g. for Xen grant mappings when performing guest
> > > > > I/Os.
> > > > > 
> > > > > Signed-off-by: Juergen Gross <jgross@suse.com>
> > > > > Signed-off-by: Vikram Garhwal <vikram.garhwal@amd.com>
> > > > > 
> > > > 
> > > > 
> > > > Paolo, Peter, David, Phiippe, do you guys have any concerns with this patch?
> > > 
> > 
> > Thanks for your comments Peter,
> > 
> > 
> > > This introduces a 3rd memory type afaict, neither direct nor !direct.
> > > 
> > > What happens if someone does address_space_write() to it?  I didn't see it
> > > covered here..
> > 
> > You're right, that won't work, the memory needs to be mapped before it
> > can be used.
> > At minimum there should be some graceful failure, right now this will crash.
> > 
> > > 
> > > OTOH, the cover letter didn't mention too much either on the big picture:
> > > 
> > > https://lore.kernel.org/all/20240227223501.28475-1-vikram.garhwal@amd.com/
> > > 
> > > I want to have a quick grasp on whether it's justified worthwhile we should
> > > introduce this complexity to qemu memory core.
> > > 
> > > Could I request a better cover letter when repost?  It'll be great to
> > > mention things like:
> > 
> > I'll do that, but also answer inline in the meantime since we should
> > perhaps change the approach.
> > 
> > > 
> > >    - what is grant mapping, why it needs to be used, when it can be used (is
> > >      it only relevant to vIOMMU=on)?  Some more information on the high
> > >      level design using this type or MR would be great.
> > 
> > https://github.com/xen-project/xen/blob/master/docs/misc/grant-tables.txt
> > 
> > Xen VM's that use QEMU's VirtIO have a QEMU instance running in a separate VM.
> > 
> > There's basically two mechanisms for QEMU's Virtio backends to access
> > the guest's RAM.
> > 1. Foreign mappings. This gives the VM running QEMU access to the
> > entire RAM of the guest VM.
> 
> Additionally it requires qemu to run in dom0, while in general Xen allows
> to run backends in less privileged "driver domains", which are usually not
> allowed to perform foreign mappings.
> 
> > 2. Grant mappings. This allows the guest to dynamically grant and
> > remove access to pages as needed.
> > So the VM running QEMU, cannot map guest RAM unless it's been
> > instructed to do so by the guest.
> > 
> > #2 is desirable because if QEMU gets compromised it has a smaller
> > attack surface onto the guest.
> 
> And it allows to run the virtio backend in a less privileged VM.
> 
> > 
> > > 
> > >    - why a 3rd memory type is required?  Do we have other alternatives?
> > 
> > Yes, there are alternatives.
> > 
> > 1. It was suggested by Stefano to try to handle this in existing qemu/hw/xen/*.
> > This would be less intrusive but perhaps also less explicit.
> > Concerns about touching the Memory API have been raised before, so
> > perhaps we should try this.
> > I'm a little unsure how we would deal with unmapping when the guest
> > removes our grants and we're using models that don't map but use
> > address_space_read/write().
> 
> Those would either need to use grant-copy hypercalls, or they'd need to map,
> read/write, unmap.
> 
> > 
> > 2. Another approach could be to change the Xen grant-iommu in the
> > Linux kernel to work with a grant vIOMMU in QEMU.
> > Linux could explicitly ask QEMU's grant vIOMMU to map/unmap granted regions.
> > This would have the benefit that we wouldn't need to allocate
> > address-bit 63 for grants.
> > A drawback is that it may be slower since we're going to need to
> > bounce between guest/qemu a bit more.
> 
> It would be a _lot_ slower, unless virtio-iommu and grants are both modified
> to match. I have looked into that, but the needed effort is rather large. At
> the last Xen summit I have suggested to introduce a new grant format which
> would work more like a normal page table structure. Using the same format
> for virtio-iommu would allow to avoid the additional bounces between qemu and
> the guest (and in fact that was one of the motivations to suggest the new
> grant format).

I have a better picture now, thanks both.

It really looks like a vIOMMU to me already, perhaps with a special refID
mapping playing a similar role to IOVAs in the rest of the IOMMU world.

I can't yet tell what the best way is for Xen - as of now QEMU's memory API
does provide such translations, but only via IOMMUMemoryRegionClass.translate().
So far that works for all vIOMMU emulations in QEMU, and I'd hope we don't
need to hack in another memory type for this if we can avoid it, especially
if it's for performance's sake; more on that below.
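
[Illustration: the translate() hook mentioned above has the prototype shown
below; a grant-reference lookup expressed through it might look roughly like
this sketch, where grant_ref_to_gpa() is a hypothetical helper resolving the
grant reference encoded in the IOVA.]

    static IOMMUTLBEntry example_grant_translate(IOMMUMemoryRegion *iommu,
                                                 hwaddr addr,
                                                 IOMMUAccessFlags flag,
                                                 int iommu_idx)
    {
        IOMMUTLBEntry entry = {
            .target_as = &address_space_memory,
            .iova = addr & TARGET_PAGE_MASK,
            /* hypothetical: look up the granted page backing this IOVA */
            .translated_addr = grant_ref_to_gpa(addr) & TARGET_PAGE_MASK,
            .addr_mask = TARGET_PAGE_SIZE - 1,   /* one page at a time */
            .perm = IOMMU_RW,                    /* or what the grant allows */
        };
        return entry;
    }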

QEMU also suffers from similar issues with other kinds of DMA protection,
at least as far as I'm aware with either VT-d, SMMU, etc., where
dynamic DMA mappings will slow the IOs down to a degree that it may not be
usable in real production.  We kept it like that, and so far AFAIK we don't
yet have a solution for that, simply because of the nature of how DMA
buffers are mapped and managed within a guest OS, no matter whether Linux or not.

For Linux as a guest we basically suggest enabling iommu=pt so that kernel
drivers are trusted, and kernel-driven devices can have full access to
guest RAM by using the vIOMMU's passthrough mode. Perhaps similar to
foreign mappings for Xen, but maybe still different, as Xen's topology is
definitely special as a hypervisor here.

Userspace drivers within the guest OS, on the other hand, always go through
vfio-pci now, which enforces effective DMA mappings rather than passthrough
mode. There it's suggested to map as little as possible; e.g. DPDK only
maps at the start of the user driver, so it's mostly not affected by the
slowness of frequently changing DMA mappings.

I'm not sure whether the above ideas are even applicable to Xen, but I
just wanted to share the status quo regarding how we manage protected DMA
without Xen, in case there's anything useful to help chart the path
forward.

Thanks,
Edgar E. Iglesias April 17, 2024, 10:34 a.m. UTC | #8
On Tue, Apr 16, 2024 at 5:55 PM Peter Xu <peterx@redhat.com> wrote:
>
> On Tue, Apr 16, 2024 at 03:28:41PM +0200, Jürgen Groß wrote:
> > On 16.04.24 13:32, Edgar E. Iglesias wrote:
> > > On Wed, Apr 10, 2024 at 8:56 PM Peter Xu <peterx@redhat.com> wrote:
> > > >
> > > > On Wed, Apr 10, 2024 at 06:44:38PM +0200, Edgar E. Iglesias wrote:
> > > > > On Tue, Feb 27, 2024 at 11:37 PM Vikram Garhwal <vikram.garhwal@amd.com>
> > > > > wrote:
> > > > >
> > > > > > From: Juergen Gross <jgross@suse.com>
> > > > > >
> > > > > > In order to support mapping and unmapping guest memory dynamically to
> > > > > > and from qemu during address_space_[un]map() operations add the map()
> > > > > > and unmap() callbacks to MemoryRegionOps.
> > > > > >
> > > > > > Those will be used e.g. for Xen grant mappings when performing guest
> > > > > > I/Os.
> > > > > >
> > > > > > Signed-off-by: Juergen Gross <jgross@suse.com>
> > > > > > Signed-off-by: Vikram Garhwal <vikram.garhwal@amd.com>
> > > > > >
> > > > >
> > > > >
> > > > > Paolo, Peter, David, Phiippe, do you guys have any concerns with this patch?
> > > >
> > >
> > > Thanks for your comments Peter,
> > >
> > >
> > > > This introduces a 3rd memory type afaict, neither direct nor !direct.
> > > >
> > > > What happens if someone does address_space_write() to it?  I didn't see it
> > > > covered here..
> > >
> > > You're right, that won't work, the memory needs to be mapped before it
> > > can be used.
> > > At minimum there should be some graceful failure, right now this will crash.
> > >
> > > >
> > > > OTOH, the cover letter didn't mention too much either on the big picture:
> > > >
> > > > https://lore.kernel.org/all/20240227223501.28475-1-vikram.garhwal@amd.com/
> > > >
> > > > I want to have a quick grasp on whether it's justified worthwhile we should
> > > > introduce this complexity to qemu memory core.
> > > >
> > > > Could I request a better cover letter when repost?  It'll be great to
> > > > mention things like:
> > >
> > > I'll do that, but also answer inline in the meantime since we should
> > > perhaps change the approach.
> > >
> > > >
> > > >    - what is grant mapping, why it needs to be used, when it can be used (is
> > > >      it only relevant to vIOMMU=on)?  Some more information on the high
> > > >      level design using this type or MR would be great.
> > >
> > > https://github.com/xen-project/xen/blob/master/docs/misc/grant-tables.txt
> > >
> > > Xen VM's that use QEMU's VirtIO have a QEMU instance running in a separate VM.
> > >
> > > There's basically two mechanisms for QEMU's Virtio backends to access
> > > the guest's RAM.
> > > 1. Foreign mappings. This gives the VM running QEMU access to the
> > > entire RAM of the guest VM.
> >
> > Additionally it requires qemu to run in dom0, while in general Xen allows
> > to run backends in less privileged "driver domains", which are usually not
> > allowed to perform foreign mappings.
> >
> > > 2. Grant mappings. This allows the guest to dynamically grant and
> > > remove access to pages as needed.
> > > So the VM running QEMU, cannot map guest RAM unless it's been
> > > instructed to do so by the guest.
> > >
> > > #2 is desirable because if QEMU gets compromised it has a smaller
> > > attack surface onto the guest.
> >
> > And it allows to run the virtio backend in a less privileged VM.
> >
> > >
> > > >
> > > >    - why a 3rd memory type is required?  Do we have other alternatives?
> > >
> > > Yes, there are alternatives.
> > >
> > > 1. It was suggested by Stefano to try to handle this in existing qemu/hw/xen/*.
> > > This would be less intrusive but perhaps also less explicit.
> > > Concerns about touching the Memory API have been raised before, so
> > > perhaps we should try this.
> > > I'm a little unsure how we would deal with unmapping when the guest
> > > removes our grants and we're using models that don't map but use
> > > address_space_read/write().
> >
> > Those would either need to use grant-copy hypercalls, or they'd need to map,
> > read/write, unmap.
> >
> > >
> > > 2. Another approach could be to change the Xen grant-iommu in the
> > > Linux kernel to work with a grant vIOMMU in QEMU.
> > > Linux could explicitly ask QEMU's grant vIOMMU to map/unmap granted regions.
> > > This would have the benefit that we wouldn't need to allocate
> > > address-bit 63 for grants.
> > > A drawback is that it may be slower since we're going to need to
> > > bounce between guest/qemu a bit more.
> >
> > It would be a _lot_ slower, unless virtio-iommu and grants are both modified
> > to match. I have looked into that, but the needed effort is rather large. At
> > the last Xen summit I have suggested to introduce a new grant format which
> > would work more like a normal page table structure. Using the same format
> > for virtio-iommu would allow to avoid the additional bounces between qemu and
> > the guest (and in fact that was one of the motivations to suggest the new
> > grant format).
>
> I have a better picture now, thanks both.
>
> It really looks like an vIOMMU already to me, perhaps with a special refID
> mapping playing similar roles as IOVAs in the rest IOMMU worlds.
>
> I can't yet tell what's the best way for Xen - as of now QEMU's memory API
> does provide such translations via IOMMUMemoryRegionClass.translate(), but
> only from that.  So far it works for all vIOMMU emulations in QEMU, and I'd
> hope we don't need to hack another memory type if possible for this,
> especially if for performance's sake; more on below.
>
> QEMU also suffers from similar issues with other kind of DMA protections,
> at least that's what I'm aware with using either VT-d, SMMU, etc.. where
> dynamic DMA mappings will slow the IOs down to a degree that it may not be
> usable in real production.  We kept it like that and so far AFAIK we don't
> yet have a solution for that simply because of the nature on how DMA
> buffers are mapped and managed within a guest OS no matter Linux or not.
>
> For Linux as a guest we basically suggest enabling iommu=pt so that kernel
> drivers are trusted, and kernel driven devices can have full access to
> guest RAMs by using the vIOMMU's passthrough mode. Perhaps similar to
> foreign mappings for Xen, but maybe still different, as Xen's topology is
> definitely special as a hypervisor here.
>
> While for userspace drivers within the guest OS it'll always go through
> vfio-pci now, which will enforce effective DMA mappings not the passthrough
> mode. Then it's suggested to only map as less as possible, e.g. DPDK only
> maps at the start of the user driver so it's mostly not affected by the
> slowness of frequently changing DMA mappings.
>
> I'm not sure whether above ideas would even be applicable for Xen, but I
> just to share the status quo regarding to how we manage protected DMAs when
> without Xen, just in case there's anything useful to help route the path
> forward.
>
> Thanks,
>
> --
> Peter Xu
>

Thanks for the suggestions, Peter, and for your comments, Juergen.
We'll have to evaluate the different approaches a little more and see
where we go from here.

Best regards,
Edgar

Patch

diff --git a/include/exec/memory.h b/include/exec/memory.h
index 8626a355b3..9f7dfe59c7 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -282,6 +282,27 @@  struct MemoryRegionOps {
                                     unsigned size,
                                     MemTxAttrs attrs);
 
+    /*
+     * Dynamically create mapping. @addr is the guest address to map; @plen
+     * is the pointer to the usable length of the buffer.
+     * @mr contents can be changed in case a new memory region is created for
+     * the mapping.
+     * Returns the buffer address for accessing the data.
+     */
+    void *(*map)(MemoryRegion **mr,
+                 hwaddr addr,
+                 hwaddr *plen,
+                 bool is_write,
+                 MemTxAttrs attrs);
+
+    /* Unmap an area obtained via map() before. */
+    void (*unmap)(MemoryRegion *mr,
+                  void *buffer,
+                  ram_addr_t addr,
+                  hwaddr len,
+                  bool is_write,
+                  hwaddr access_len);
+
     enum device_endian endianness;
     /* Guest-visible constraints: */
     struct {
diff --git a/system/physmem.c b/system/physmem.c
index 949dcb20ba..d989e9fc1f 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -3141,6 +3141,7 @@  void *address_space_map(AddressSpace *as,
     hwaddr len = *plen;
     hwaddr l, xlat;
     MemoryRegion *mr;
+    void *ptr = NULL;
     FlatView *fv;
 
     if (len == 0) {
@@ -3174,12 +3175,20 @@  void *address_space_map(AddressSpace *as,
         return bounce.buffer;
     }
 
-
     memory_region_ref(mr);
+
+    if (mr->ops && mr->ops->map) {
+        ptr = mr->ops->map(&mr, addr, plen, is_write, attrs);
+    }
+
     *plen = flatview_extend_translation(fv, addr, len, mr, xlat,
                                         l, is_write, attrs);
     fuzz_dma_read_cb(addr, *plen, mr);
-    return qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);
+    if (ptr == NULL) {
+        ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);
+    }
+
+    return ptr;
 }
 
 /* Unmaps a memory region previously mapped by address_space_map().
@@ -3195,11 +3204,16 @@  void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
 
         mr = memory_region_from_host(buffer, &addr1);
         assert(mr != NULL);
-        if (is_write) {
-            invalidate_and_set_dirty(mr, addr1, access_len);
-        }
-        if (xen_enabled()) {
-            xen_invalidate_map_cache_entry(buffer);
+
+        if (mr->ops && mr->ops->unmap) {
+            mr->ops->unmap(mr, buffer, addr1, len, is_write, access_len);
+        } else {
+            if (is_write) {
+                invalidate_and_set_dirty(mr, addr1, access_len);
+            }
+            if (xen_enabled()) {
+                xen_invalidate_map_cache_entry(buffer);
+            }
         }
         memory_region_unref(mr);
         return;
@@ -3272,10 +3286,18 @@  int64_t address_space_cache_init(MemoryRegionCache *cache,
          * doing this if we found actual RAM, which behaves the same
          * regardless of attributes; so UNSPECIFIED is fine.
          */
+        if (mr->ops && mr->ops->map) {
+            cache->ptr = mr->ops->map(&mr, addr, &l, is_write,
+                                      MEMTXATTRS_UNSPECIFIED);
+        }
+
         l = flatview_extend_translation(cache->fv, addr, len, mr,
                                         cache->xlat, l, is_write,
                                         MEMTXATTRS_UNSPECIFIED);
-        cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat, &l, true);
+        if (!cache->ptr) {
+            cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat, &l,
+                                             true);
+        }
     } else {
         cache->ptr = NULL;
     }
@@ -3297,14 +3319,20 @@  void address_space_cache_invalidate(MemoryRegionCache *cache,
 
 void address_space_cache_destroy(MemoryRegionCache *cache)
 {
-    if (!cache->mrs.mr) {
+    MemoryRegion *mr = cache->mrs.mr;
+
+    if (!mr) {
         return;
     }
 
-    if (xen_enabled()) {
+    if (mr->ops && mr->ops->unmap) {
+            mr->ops->unmap(mr, cache->ptr, cache->xlat, cache->len,
+                           cache->is_write, cache->len);
+    } else if (xen_enabled()) {
         xen_invalidate_map_cache_entry(cache->ptr);
     }
-    memory_region_unref(cache->mrs.mr);
+
+    memory_region_unref(mr);
     flatview_unref(cache->fv);
     cache->mrs.mr = NULL;
     cache->fv = NULL;