diff mbox series

[V2,01/11] machine: alloc-anon option

Message ID 1719776434-435013-2-git-send-email-steven.sistare@oracle.com (mailing list archive)
State New
Headers show
Series Live update: cpr-exec | expand

Commit Message

Steve Sistare June 30, 2024, 7:40 p.m. UTC
Allocate anonymous memory using mmap MAP_ANON or memfd_create depending
on the value of the anon-alloc machine property.  This affects
memory-backend-ram objects, guest RAM created with the global -m option
but without an associated memory-backend object and without the -mem-path
option, and various memory regions such as ROMs that are allocated when
devices are created.  This option does not affect memory-backend-file,
memory-backend-memfd, or memory-backend-epc objects.

The memfd option is intended to support new migration modes, in which the
memory region can be transferred in place to a new QEMU process, by sending
the memfd file descriptor to the process.  Memory contents are preserved,
and if the mode also transfers device descriptors, then pages that are
locked in memory for DMA remain locked.  This behavior is a pre-requisite
for supporting vfio, vdpa, and iommufd devices with the new modes.

To access the same memory in the old and new QEMU processes, the memory
must be mapped shared.  Therefore, the implementation always sets
RAM_SHARED if alloc-anon=memfd, except for memory-backend-ram, where the
user must explicitly specify the share option.  In lieu of defining a new
RAM flag, at the lowest level the implementation uses RAM_SHARED with fd=-1
as the condition for calling memfd_create.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
 hw/core/machine.c   | 24 ++++++++++++++++++++++++
 include/hw/boards.h |  1 +
 qapi/machine.json   | 14 ++++++++++++++
 qemu-options.hx     | 13 +++++++++++++
 system/memory.c     | 12 +++++++++---
 system/physmem.c    | 38 +++++++++++++++++++++++++++++++++++++-
 system/trace-events |  3 +++
 7 files changed, 101 insertions(+), 4 deletions(-)
diff mbox series

Patch

diff --git a/hw/core/machine.c b/hw/core/machine.c
index 655d75c..7ca2ad0 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -454,6 +454,20 @@  static void machine_set_mem_merge(Object *obj, bool value, Error **errp)
     ms->mem_merge = value;
 }
 
+static int machine_get_anon_alloc(Object *obj, Error **errp)
+{
+    MachineState *ms = MACHINE(obj);
+
+    return ms->anon_alloc;
+}
+
+static void machine_set_anon_alloc(Object *obj, int value, Error **errp)
+{
+    MachineState *ms = MACHINE(obj);
+
+    ms->anon_alloc = value;
+}
+
 static bool machine_get_usb(Object *obj, Error **errp)
 {
     MachineState *ms = MACHINE(obj);
@@ -1066,6 +1080,11 @@  static void machine_class_init(ObjectClass *oc, void *data)
     object_class_property_set_description(oc, "mem-merge",
         "Enable/disable memory merge support");
 
+    object_class_property_add_enum(oc, "anon-alloc", "AnonAllocOption",
+                                   &AnonAllocOption_lookup,
+                                   machine_get_anon_alloc,
+                                   machine_set_anon_alloc);
+
     object_class_property_add_bool(oc, "usb",
         machine_get_usb, machine_set_usb);
     object_class_property_set_description(oc, "usb",
@@ -1416,6 +1435,11 @@  static bool create_default_memdev(MachineState *ms, const char *path, Error **er
     if (!object_property_set_int(obj, "size", ms->ram_size, errp)) {
         goto out;
     }
+    if (!object_property_set_bool(obj, "share",
+                                  ms->anon_alloc == ANON_ALLOC_OPTION_MEMFD,
+                                  errp)) {
+        goto out;
+    }
     object_property_add_child(object_get_objects_root(), mc->default_ram_id,
                               obj);
     /* Ensure backend's memory region name is equal to mc->default_ram_id */
diff --git a/include/hw/boards.h b/include/hw/boards.h
index 73ad319..77f16ad 100644
--- a/include/hw/boards.h
+++ b/include/hw/boards.h
@@ -383,6 +383,7 @@  struct MachineState {
     bool enable_graphics;
     ConfidentialGuestSupport *cgs;
     HostMemoryBackend *memdev;
+    AnonAllocOption anon_alloc;
     /*
      * convenience alias to ram_memdev_id backend memory region
      * or to numa container memory region
diff --git a/qapi/machine.json b/qapi/machine.json
index 2fd3e9c..9173953 100644
--- a/qapi/machine.json
+++ b/qapi/machine.json
@@ -1881,3 +1881,17 @@ 
 { 'command': 'x-query-interrupt-controllers',
   'returns': 'HumanReadableText',
   'features': [ 'unstable' ]}
+
+##
+# @AnonAllocOption:
+#
+# An enumeration of the options for allocating anonymous guest memory.
+#
+# @mmap: allocate using mmap MAP_ANON
+#
+# @memfd: allocate using memfd_create
+#
+# Since: 9.1
+##
+{ 'enum': 'AnonAllocOption',
+  'data': [ 'mmap', 'memfd' ] }
diff --git a/qemu-options.hx b/qemu-options.hx
index 8ca7f34..595b693 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -38,6 +38,7 @@  DEF("machine", HAS_ARG, QEMU_OPTION_machine, \
     "                nvdimm=on|off controls NVDIMM support (default=off)\n"
     "                memory-encryption=@var{} memory encryption object to use (default=none)\n"
     "                hmat=on|off controls ACPI HMAT support (default=off)\n"
+    "                anon-alloc=mmap|memfd allocate anonymous guest RAM using mmap MAP_ANON or memfd_create (default: mmap)\n"
     "                memory-backend='backend-id' specifies explicitly provided backend for main RAM (default=none)\n"
     "                cxl-fmw.0.targets.0=firsttarget,cxl-fmw.0.targets.1=secondtarget,cxl-fmw.0.size=size[,cxl-fmw.0.interleave-granularity=granularity]\n",
     QEMU_ARCH_ALL)
@@ -101,6 +102,18 @@  SRST
         Enables or disables ACPI Heterogeneous Memory Attribute Table
         (HMAT) support. The default is off.
 
+    ``anon-alloc=mmap|memfd``
+        Allocate anonymous guest RAM using mmap MAP_ANON (the default)
+        or memfd_create.  This affects memory-backend-ram objects,
+        RAM created with the global -m option but without an
+        associated memory-backend object and without the -mem-path
+        option, and various memory regions such as ROMs that are
+        allocated when devices are created.  This option does not
+        affect memory-backend-file, memory-backend-memfd, or
+        memory-backend-epc objects.
+
+        Some migration modes require anon-alloc=memfd.
+
     ``memory-backend='id'``
         An alternative to legacy ``-mem-path`` and ``mem-prealloc`` options.
         Allows to use a memory backend as main RAM.
diff --git a/system/memory.c b/system/memory.c
index 2d69521..28a837d 100644
--- a/system/memory.c
+++ b/system/memory.c
@@ -1552,8 +1552,10 @@  bool memory_region_init_ram_nomigrate(MemoryRegion *mr,
                                       uint64_t size,
                                       Error **errp)
 {
+    uint32_t flags = (current_machine->anon_alloc == ANON_ALLOC_OPTION_MEMFD) ?
+                     RAM_SHARED : 0;
     return memory_region_init_ram_flags_nomigrate(mr, owner, name,
-                                                  size, 0, errp);
+                                                  size, flags, errp);
 }
 
 bool memory_region_init_ram_flags_nomigrate(MemoryRegion *mr,
@@ -1713,8 +1715,10 @@  bool memory_region_init_rom_nomigrate(MemoryRegion *mr,
                                       uint64_t size,
                                       Error **errp)
 {
+    uint32_t flags = (current_machine->anon_alloc == ANON_ALLOC_OPTION_MEMFD) ?
+                     RAM_SHARED : 0;
     if (!memory_region_init_ram_flags_nomigrate(mr, owner, name,
-                                                size, 0, errp)) {
+                                                size, flags, errp)) {
          return false;
     }
     mr->readonly = true;
@@ -1731,6 +1735,8 @@  bool memory_region_init_rom_device_nomigrate(MemoryRegion *mr,
                                              Error **errp)
 {
     Error *err = NULL;
+    uint32_t flags = (current_machine->anon_alloc == ANON_ALLOC_OPTION_MEMFD) ?
+                     RAM_SHARED : 0;
     assert(ops);
     memory_region_init(mr, owner, name, size);
     mr->ops = ops;
@@ -1738,7 +1744,7 @@  bool memory_region_init_rom_device_nomigrate(MemoryRegion *mr,
     mr->terminates = true;
     mr->rom_device = true;
     mr->destructor = memory_region_destructor_ram;
-    mr->ram_block = qemu_ram_alloc(size, 0, mr, &err);
+    mr->ram_block = qemu_ram_alloc(size, flags, mr, &err);
     if (err) {
         mr->size = int128_zero();
         object_unparent(OBJECT(mr));
diff --git a/system/physmem.c b/system/physmem.c
index 33d09f7..efe95ff 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -47,6 +47,7 @@ 
 #include "qemu/qemu-print.h"
 #include "qemu/log.h"
 #include "qemu/memalign.h"
+#include "qemu/memfd.h"
 #include "exec/memory.h"
 #include "exec/ioport.h"
 #include "sysemu/dma.h"
@@ -54,6 +55,7 @@ 
 #include "sysemu/hw_accel.h"
 #include "sysemu/xen-mapcache.h"
 #include "trace/trace-root.h"
+#include "trace.h"
 
 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
 #include <linux/falloc.h>
@@ -69,6 +71,8 @@ 
 
 #include "qemu/pmem.h"
 
+#include "qapi/qapi-types-migration.h"
+#include "migration/options.h"
 #include "migration/vmstate.h"
 
 #include "qemu/range.h"
@@ -1828,6 +1832,32 @@  static void ram_block_add(RAMBlock *new_block, Error **errp)
                 qemu_mutex_unlock_ramlist();
                 return;
             }
+
+        } else if (new_block->flags & RAM_SHARED) {
+            size_t max_length = new_block->max_length;
+            MemoryRegion *mr = new_block->mr;
+            const char *name = memory_region_name(mr);
+
+            new_block->mr->align = QEMU_VMALLOC_ALIGN;
+
+            if (new_block->fd == -1) {
+                new_block->fd = qemu_memfd_create(name, max_length + mr->align,
+                                                  0, 0, 0, errp);
+            }
+
+            if (new_block->fd >= 0) {
+                int mfd = new_block->fd;
+                qemu_set_cloexec(mfd);
+                new_block->host = file_ram_alloc(new_block, max_length, mfd,
+                                                 false, 0, errp);
+            }
+            if (!new_block->host) {
+                qemu_mutex_unlock_ramlist();
+                return;
+            }
+            memory_try_enable_merging(new_block->host, new_block->max_length);
+            free_on_error = true;
+
         } else {
             new_block->host = qemu_anon_ram_alloc(new_block->max_length,
                                                   &new_block->mr->align,
@@ -1911,6 +1941,9 @@  static void ram_block_add(RAMBlock *new_block, Error **errp)
         ram_block_notify_add(new_block->host, new_block->used_length,
                              new_block->max_length);
     }
+    trace_ram_block_add(memory_region_name(new_block->mr), new_block->flags,
+                        new_block->fd, new_block->used_length,
+                        new_block->max_length);
     return;
 
 out_free:
@@ -2097,8 +2130,11 @@  RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
                                                      void *host),
                                      MemoryRegion *mr, Error **errp)
 {
+    uint32_t flags = (current_machine->anon_alloc == ANON_ALLOC_OPTION_MEMFD) ?
+                     RAM_SHARED : 0;
+    flags |= RAM_RESIZEABLE;
     return qemu_ram_alloc_internal(size, maxsz, resized, NULL,
-                                   RAM_RESIZEABLE, mr, errp);
+                                   flags, mr, errp);
 }
 
 static void reclaim_ramblock(RAMBlock *block)
diff --git a/system/trace-events b/system/trace-events
index 69c9044..f8ebf42 100644
--- a/system/trace-events
+++ b/system/trace-events
@@ -38,3 +38,6 @@  dirtylimit_state_finalize(void)
 dirtylimit_throttle_pct(int cpu_index, uint64_t pct, int64_t time_us) "CPU[%d] throttle percent: %" PRIu64 ", throttle adjust time %"PRIi64 " us"
 dirtylimit_set_vcpu(int cpu_index, uint64_t quota) "CPU[%d] set dirty page rate limit %"PRIu64
 dirtylimit_vcpu_execute(int cpu_index, int64_t sleep_time_us) "CPU[%d] sleep %"PRIi64 " us"
+
+#physmem.c
+ram_block_add(const char *name, uint32_t flags, int fd, size_t used_length, size_t max_length) "%s, flags %u, fd %d, len %lu, maxlen %lu"