Message ID | 881871e8394fa18a656dfb105d42e6099335c721.1615972140.git.haibo.xu@linaro.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | target/arm: Add MTE support to KVM guest | expand |
* Haibo Xu (haibo.xu@linaro.org) wrote: > To make it easier to keep the page tags sync with > the page data, tags for one page are appended to > the data during ram save iteration. > > This patch only add the pre-copy migration support. > Post-copy and compress as well as zero page saving > are not supported yet. > > Signed-off-by: Haibo Xu <haibo.xu@linaro.org> My guess is that this doesn't work with a lot of other options; e.g. postcopy and probably compression and a bunch of other things. Postcopy I can see you'll need some interesting kernel changes for - you'd need to be able to atomically place a page with it's tag data. You probably need to add stuff to migrate_caps_check to disable features that you don't support. > --- > include/hw/arm/virt.h | 2 + > include/migration/misc.h | 1 + > migration/ram.c | 86 +++++++++++++++++++++++++++++++++++++++- > 3 files changed, 88 insertions(+), 1 deletion(-) > > diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h > index 921416f918..8b28cde8bf 100644 > --- a/include/hw/arm/virt.h > +++ b/include/hw/arm/virt.h > @@ -166,6 +166,8 @@ struct VirtMachineState { > PCIBus *bus; > char *oem_id; > char *oem_table_id; > + /* migrate memory tags */ > + NotifierWithReturn precopy_notifier; > }; > > #define VIRT_ECAM_ID(high) (high ? 
VIRT_HIGH_PCIE_ECAM : VIRT_PCIE_ECAM) > diff --git a/include/migration/misc.h b/include/migration/misc.h > index bccc1b6b44..005133f471 100644 > --- a/include/migration/misc.h > +++ b/include/migration/misc.h > @@ -38,6 +38,7 @@ void precopy_add_notifier(NotifierWithReturn *n); > void precopy_remove_notifier(NotifierWithReturn *n); > int precopy_notify(PrecopyNotifyReason reason, Error **errp); > void precopy_enable_free_page_optimization(void); > +void precopy_enable_metadata_migration(void); > > void ram_mig_init(void); > void qemu_guest_free_page_hint(void *addr, size_t len); > diff --git a/migration/ram.c b/migration/ram.c > index 72143da0ac..e67b798c3b 100644 > --- a/migration/ram.c > +++ b/migration/ram.c > @@ -53,10 +53,12 @@ > #include "block.h" > #include "sysemu/sysemu.h" > #include "sysemu/cpu-throttle.h" > +#include "sysemu/kvm.h" > #include "savevm.h" > #include "qemu/iov.h" > #include "multifd.h" > #include "sysemu/runstate.h" > +#include "kvm_arm.h" > > #if defined(__linux__) > #include "qemu/userfaultfd.h" > @@ -80,6 +82,9 @@ > #define RAM_SAVE_FLAG_XBZRLE 0x40 > /* 0x80 is reserved in migration.h start with 0x100 next */ > #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100 > +#define RAM_SAVE_FLAG_MTE 0x200 I think that's using the last free bit in the flags field, which is a bit painful. 
> +#define MTE_GRANULE_SIZE (16) > > static inline bool is_zero_range(uint8_t *p, uint64_t size) > { > @@ -317,6 +322,8 @@ struct RAMState { > bool ram_bulk_stage; > /* The free page optimization is enabled */ > bool fpo_enabled; > + /* The RAM meta data(e.t memory tag) is enabled */ > + bool metadata_enabled; > /* How many times we have dirty too many pages */ > int dirty_rate_high_cnt; > /* these variables are used for bitmap sync */ > @@ -394,6 +401,15 @@ void precopy_enable_free_page_optimization(void) > ram_state->fpo_enabled = true; > } > > +void precopy_enable_metadata_migration(void) > +{ > + if (!ram_state) { > + return; > + } > + > + ram_state->metadata_enabled = true; > +} > + > uint64_t ram_bytes_remaining(void) > { > return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : > @@ -1134,6 +1150,61 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, > return true; > } > > +static int save_normal_page_mte_tags(QEMUFile *f, uint8_t *addr) > +{ > + uint8_t *tag_buf = NULL; > + uint64_t ipa; > + int size = TARGET_PAGE_SIZE / MTE_GRANULE_SIZE; This function needs to be mostly somewhere aarch specific; or somehow made more generic. We shouldn't have RAM_SAVE_FLAG_MTE as well - we should be something like RAM_SAVE_FLAG_ARCH_METADATA and that way other architectures with something else glued onto pages can do something similar. Try and keep migration/ architecture independent. > + if (kvm_physical_memory_addr_from_host(kvm_state, addr, &ipa)) { > + /* Buffer for the page tags(one byte per tag) */ > + tag_buf = g_try_malloc0(size); It feels like you want to allocate tag_buf in setup and free it at the end rather than doing this in every page. 
> + if (!tag_buf) { > + error_report("%s: Error allocating MTE tag_buf", __func__); > + return 0; > + } > + > + if (kvm_arm_mte_get_tags(ipa, TARGET_PAGE_SIZE, tag_buf) < 0) { > + error_report("%s: Can't get MTE tags from guest", __func__); For any error like this you probably want to say the addresses to make it easier to debug when it fails. > + g_free(tag_buf); > + return 0; > + } > + > + qemu_put_buffer(f, tag_buf, size); > + > + g_free(tag_buf); > + > + return size; > + } > + > + return 0; > +} > + > +static void load_normal_page_mte_tags(QEMUFile *f, uint8_t *addr) > +{ > + uint8_t *tag_buf = NULL; > + uint64_t ipa; > + int size = TARGET_PAGE_SIZE / MTE_GRANULE_SIZE; > + > + if (kvm_physical_memory_addr_from_host(kvm_state, addr, &ipa)) { > + /* Buffer for the page tags(one byte per tag) */ > + tag_buf = g_try_malloc0(size); > + if (!tag_buf) { > + error_report("%s: Error allocating MTE tag_buf", __func__); > + return; > + } > + > + qemu_get_buffer(f, tag_buf, size); > + if (kvm_arm_mte_set_tags(ipa, TARGET_PAGE_SIZE, tag_buf) < 0) { what protections are there here to stop you setting the mte on something useful, like part of the host kernel or qemu? 
> + error_report("%s: Can't set MTE tags to guest", __func__); > + } > + > + g_free(tag_buf); > + } > + > + return; > +} > + > /* > * directly send the page to the stream > * > @@ -1148,6 +1219,10 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, > static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, > uint8_t *buf, bool async) > { > + if (rs->metadata_enabled) { > + offset |= RAM_SAVE_FLAG_MTE; > + } > + > ram_counters.transferred += save_page_header(rs, rs->f, block, > offset | RAM_SAVE_FLAG_PAGE); > if (async) { > @@ -1159,6 +1234,11 @@ static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, > } > ram_counters.transferred += TARGET_PAGE_SIZE; > ram_counters.normal++; > + > + if (rs->metadata_enabled) { > + ram_counters.transferred += save_normal_page_mte_tags(rs->f, buf); > + } > + > return 1; > } > > @@ -2189,6 +2269,7 @@ static void ram_state_reset(RAMState *rs) > rs->last_version = ram_list.version; > rs->ram_bulk_stage = true; > rs->fpo_enabled = false; > + rs->metadata_enabled = false; > } > > #define MAX_WAIT 50 /* ms, half buffered_file limit */ > @@ -3779,7 +3860,7 @@ static int ram_load_precopy(QEMUFile *f) > trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host); > } > > - switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { > + switch (flags & ~(RAM_SAVE_FLAG_CONTINUE | RAM_SAVE_FLAG_MTE)) { > case RAM_SAVE_FLAG_MEM_SIZE: > /* Synchronize RAM block list */ > total_ram_bytes = addr; > @@ -3849,6 +3930,9 @@ static int ram_load_precopy(QEMUFile *f) > > case RAM_SAVE_FLAG_PAGE: > qemu_get_buffer(f, host, TARGET_PAGE_SIZE); > + if (flags & RAM_SAVE_FLAG_MTE) { > + load_normal_page_mte_tags(f, host); > + } > break; > > case RAM_SAVE_FLAG_COMPRESS_PAGE: > -- > 2.17.1 >
On Thu, 18 Mar 2021 at 04:11, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: > > * Haibo Xu (haibo.xu@linaro.org) wrote: > > To make it easier to keep the page tags sync with > > the page data, tags for one page are appended to > > the data during ram save iteration. > > > > This patch only add the pre-copy migration support. > > Post-copy and compress as well as zero page saving > > are not supported yet. > > > > Signed-off-by: Haibo Xu <haibo.xu@linaro.org> > > My guess is that this doesn't work with a lot of other options; e.g. > postcopy and probably compression and a bunch of other things. > Postcopy I can see you'll need some interesting kernel changes for - > you'd need to be able to atomically place a page with it's tag data. > > You probably need to add stuff to migrate_caps_check to disable > features that you don't support. > Hi David, Thanks so much for the comments! You are right, this RFC patch only supports pre-copy mode, no postcopy, no compression. As a RFC, here just want to finalize the tag migration process, that is: 1. let the tag go with the page data(the current choice) which may be a little complex to put them into the current migration process. 2. migrate them separately which is easy to implement with the current migration(treat the tags as device status), but it would be hard to keep the page data and tag to sync with each other. 3. Any other ways? Once the tag migration process is finalized, a new formal patch series with postcopy as well as compression should be reworked. What's more, you mentioned that "some interesting kernel changes are needed to atomically place a page with it's tag data". You mean a single kernel API to store page data and tag in the migration load process? 
Regards, Haibo > > --- > > include/hw/arm/virt.h | 2 + > > include/migration/misc.h | 1 + > > migration/ram.c | 86 +++++++++++++++++++++++++++++++++++++++- > > 3 files changed, 88 insertions(+), 1 deletion(-) > > > > diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h > > index 921416f918..8b28cde8bf 100644 > > --- a/include/hw/arm/virt.h > > +++ b/include/hw/arm/virt.h > > @@ -166,6 +166,8 @@ struct VirtMachineState { > > PCIBus *bus; > > char *oem_id; > > char *oem_table_id; > > + /* migrate memory tags */ > > + NotifierWithReturn precopy_notifier; > > }; > > > > #define VIRT_ECAM_ID(high) (high ? VIRT_HIGH_PCIE_ECAM : VIRT_PCIE_ECAM) > > diff --git a/include/migration/misc.h b/include/migration/misc.h > > index bccc1b6b44..005133f471 100644 > > --- a/include/migration/misc.h > > +++ b/include/migration/misc.h > > @@ -38,6 +38,7 @@ void precopy_add_notifier(NotifierWithReturn *n); > > void precopy_remove_notifier(NotifierWithReturn *n); > > int precopy_notify(PrecopyNotifyReason reason, Error **errp); > > void precopy_enable_free_page_optimization(void); > > +void precopy_enable_metadata_migration(void); > > > > void ram_mig_init(void); > > void qemu_guest_free_page_hint(void *addr, size_t len); > > diff --git a/migration/ram.c b/migration/ram.c > > index 72143da0ac..e67b798c3b 100644 > > --- a/migration/ram.c > > +++ b/migration/ram.c > > @@ -53,10 +53,12 @@ > > #include "block.h" > > #include "sysemu/sysemu.h" > > #include "sysemu/cpu-throttle.h" > > +#include "sysemu/kvm.h" > > #include "savevm.h" > > #include "qemu/iov.h" > > #include "multifd.h" > > #include "sysemu/runstate.h" > > +#include "kvm_arm.h" > > > > #if defined(__linux__) > > #include "qemu/userfaultfd.h" > > @@ -80,6 +82,9 @@ > > #define RAM_SAVE_FLAG_XBZRLE 0x40 > > /* 0x80 is reserved in migration.h start with 0x100 next */ > > #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100 > > +#define RAM_SAVE_FLAG_MTE 0x200 > > I think that's using the last free bit in the flags field, which is a > bit 
painful. > Yes, only 0x80(reserved) and 0x200 are available. > > +#define MTE_GRANULE_SIZE (16) > > > > static inline bool is_zero_range(uint8_t *p, uint64_t size) > > { > > @@ -317,6 +322,8 @@ struct RAMState { > > bool ram_bulk_stage; > > /* The free page optimization is enabled */ > > bool fpo_enabled; > > + /* The RAM meta data(e.t memory tag) is enabled */ > > + bool metadata_enabled; > > /* How many times we have dirty too many pages */ > > int dirty_rate_high_cnt; > > /* these variables are used for bitmap sync */ > > @@ -394,6 +401,15 @@ void precopy_enable_free_page_optimization(void) > > ram_state->fpo_enabled = true; > > } > > > > +void precopy_enable_metadata_migration(void) > > +{ > > + if (!ram_state) { > > + return; > > + } > > + > > + ram_state->metadata_enabled = true; > > +} > > + > > uint64_t ram_bytes_remaining(void) > > { > > return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : > > @@ -1134,6 +1150,61 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, > > return true; > > } > > > > +static int save_normal_page_mte_tags(QEMUFile *f, uint8_t *addr) > > +{ > > + uint8_t *tag_buf = NULL; > > + uint64_t ipa; > > + int size = TARGET_PAGE_SIZE / MTE_GRANULE_SIZE; > > This function needs to be mostly somewhere aarch specific; or somehow > made more generic. > We shouldn't have RAM_SAVE_FLAG_MTE as well - we should be something > like RAM_SAVE_FLAG_ARCH_METADATA and that way other architectures with > something else glued onto pages can do something similar. > Try and keep migration/ architecture independent. Yes, it should be arch independent and handled with a more abstract way. > > > + if (kvm_physical_memory_addr_from_host(kvm_state, addr, &ipa)) { > > + /* Buffer for the page tags(one byte per tag) */ > > + tag_buf = g_try_malloc0(size); > > It feels like you want to allocate tag_buf in setup and free it at the > end rather than doing this in every page. 
Yes, the tag buffer allocation could be moved to migration save/load setup and cleanup function. > > > + if (!tag_buf) { > > + error_report("%s: Error allocating MTE tag_buf", __func__); > > + return 0; > > + } > > + > > + if (kvm_arm_mte_get_tags(ipa, TARGET_PAGE_SIZE, tag_buf) < 0) { > > + error_report("%s: Can't get MTE tags from guest", __func__); > > For any error like this you probably want to say the addresses to make > it easier to debug when it fails. > Good suggestion! Will add the "address" info to the log. > > + g_free(tag_buf); > > + return 0; > > + } > > + > > + qemu_put_buffer(f, tag_buf, size); > > + > > + g_free(tag_buf); > > + > > + return size; > > + } > > + > > + return 0; > > +} > > + > > +static void load_normal_page_mte_tags(QEMUFile *f, uint8_t *addr) > > +{ > > + uint8_t *tag_buf = NULL; > > + uint64_t ipa; > > + int size = TARGET_PAGE_SIZE / MTE_GRANULE_SIZE; > > + > > + if (kvm_physical_memory_addr_from_host(kvm_state, addr, &ipa)) { > > + /* Buffer for the page tags(one byte per tag) */ > > + tag_buf = g_try_malloc0(size); > > + if (!tag_buf) { > > + error_report("%s: Error allocating MTE tag_buf", __func__); > > + return; > > + } > > + > > + qemu_get_buffer(f, tag_buf, size); > > + if (kvm_arm_mte_set_tags(ipa, TARGET_PAGE_SIZE, tag_buf) < 0) { > > what protections are there here to stop you setting the mte on something > useful, like part of the host kernel or qemu? Kernel would do some parameter check(e.t ipa within a valid slot) before setting the tag data. 
> > > + error_report("%s: Can't set MTE tags to guest", __func__); > > + } > > + > > + g_free(tag_buf); > > + } > > + > > + return; > > +} > > + > > /* > > * directly send the page to the stream > > * > > @@ -1148,6 +1219,10 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, > > static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, > > uint8_t *buf, bool async) > > { > > + if (rs->metadata_enabled) { > > + offset |= RAM_SAVE_FLAG_MTE; > > + } > > + > > ram_counters.transferred += save_page_header(rs, rs->f, block, > > offset | RAM_SAVE_FLAG_PAGE); > > if (async) { > > @@ -1159,6 +1234,11 @@ static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, > > } > > ram_counters.transferred += TARGET_PAGE_SIZE; > > ram_counters.normal++; > > + > > + if (rs->metadata_enabled) { > > + ram_counters.transferred += save_normal_page_mte_tags(rs->f, buf); > > + } > > + > > return 1; > > } > > > > @@ -2189,6 +2269,7 @@ static void ram_state_reset(RAMState *rs) > > rs->last_version = ram_list.version; > > rs->ram_bulk_stage = true; > > rs->fpo_enabled = false; > > + rs->metadata_enabled = false; > > } > > > > #define MAX_WAIT 50 /* ms, half buffered_file limit */ > > @@ -3779,7 +3860,7 @@ static int ram_load_precopy(QEMUFile *f) > > trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host); > > } > > > > - switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { > > + switch (flags & ~(RAM_SAVE_FLAG_CONTINUE | RAM_SAVE_FLAG_MTE)) { > > case RAM_SAVE_FLAG_MEM_SIZE: > > /* Synchronize RAM block list */ > > total_ram_bytes = addr; > > @@ -3849,6 +3930,9 @@ static int ram_load_precopy(QEMUFile *f) > > > > case RAM_SAVE_FLAG_PAGE: > > qemu_get_buffer(f, host, TARGET_PAGE_SIZE); > > + if (flags & RAM_SAVE_FLAG_MTE) { > > + load_normal_page_mte_tags(f, host); > > + } > > break; > > > > case RAM_SAVE_FLAG_COMPRESS_PAGE: > > -- > > 2.17.1 > > > -- > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK >
Haibo Xu <haibo.xu@linaro.org> wrote: > To make it easier to keep the page tags sync with > the page data, tags for one page are appended to > the data during ram save iteration. > > This patch only add the pre-copy migration support. > Post-copy and compress as well as zero page saving > are not supported yet. > > Signed-off-by: Haibo Xu <haibo.xu@linaro.org> > #define RAM_SAVE_FLAG_XBZRLE 0x40 > /* 0x80 is reserved in migration.h start with 0x100 next */ > #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100 > +#define RAM_SAVE_FLAG_MTE 0x200 Flags are really a scarce resource. You are using one here, when you know that you will always have the feature enabled (or not), so you can do better during negotiation IMHO. > +void precopy_enable_metadata_migration(void) > +{ > + if (!ram_state) { > + return; > + } > + > + ram_state->metadata_enabled = true; > +} My understanding is that in your following patch, if mte is enabled, you will always send mte tags, for all pages needed, right? > +static int save_normal_page_mte_tags(QEMUFile *f, uint8_t *addr) > +{ > + uint8_t *tag_buf = NULL; > + uint64_t ipa; > + int size = TARGET_PAGE_SIZE / MTE_GRANULE_SIZE; > + > + if (kvm_physical_memory_addr_from_host(kvm_state, addr, &ipa)) { > + /* Buffer for the page tags(one byte per tag) */ > + tag_buf = g_try_malloc0(size); size of the buffer is known at start of migration. Just get a buffer and reuse it? Do zero pages have mte tags? From migration point of view, a zero page is a page that is just full of zeros, i.e. nothing else special. Because you are not sending any for them. 
> @@ -1148,6 +1219,10 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, > static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, > uint8_t *buf, bool async) > { > + if (rs->metadata_enabled) { > + offset |= RAM_SAVE_FLAG_MTE; You don't really need the flag, for you normal pages are just TARGET_PAGE_SIZE + (TARGET_PAGE_SIZE/MTE_) > + } > + > ram_counters.transferred += save_page_header(rs, rs->f, block, > offset | RAM_SAVE_FLAG_PAGE); > if (async) { > @@ -1159,6 +1234,11 @@ static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, > } > ram_counters.transferred += TARGET_PAGE_SIZE; > ram_counters.normal++; > + > + if (rs->metadata_enabled) { See? You are not checking the flag, you are checking the bool setup at the beggining of migration. > + ram_counters.transferred += save_normal_page_mte_tags(rs->f, buf); > + } > + > return 1; > } > > @@ -2189,6 +2269,7 @@ static void ram_state_reset(RAMState *rs) > rs->last_version = ram_list.version; > rs->ram_bulk_stage = true; > rs->fpo_enabled = false; > + rs->metadata_enabled = false; > } > > #define MAX_WAIT 50 /* ms, half buffered_file limit */ > @@ -3779,7 +3860,7 @@ static int ram_load_precopy(QEMUFile *f) > trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host); > } > > - switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { > + switch (flags & ~(RAM_SAVE_FLAG_CONTINUE | RAM_SAVE_FLAG_MTE)) { Creating the flag is hurting you here also. > case RAM_SAVE_FLAG_MEM_SIZE: > /* Synchronize RAM block list */ > total_ram_bytes = addr; > @@ -3849,6 +3930,9 @@ static int ram_load_precopy(QEMUFile *f) > > case RAM_SAVE_FLAG_PAGE: > qemu_get_buffer(f, host, TARGET_PAGE_SIZE); > + if (flags & RAM_SAVE_FLAG_MTE) { > + load_normal_page_mte_tags(f, host); > + } I don't claim to understand the MTE, but my understanding is that if we are using MTE, all pages have to have MTE flags, right? 
So, something like is_mte_enabled() that I mentioned in the other thread looks like a good idea. Later, Juan. > break; > > case RAM_SAVE_FLAG_COMPRESS_PAGE:
* Haibo Xu (haibo.xu@linaro.org) wrote: > On Thu, 18 Mar 2021 at 04:11, Dr. David Alan Gilbert > <dgilbert@redhat.com> wrote: > > > > * Haibo Xu (haibo.xu@linaro.org) wrote: > > > To make it easier to keep the page tags sync with > > > the page data, tags for one page are appended to > > > the data during ram save iteration. > > > > > > This patch only add the pre-copy migration support. > > > Post-copy and compress as well as zero page saving > > > are not supported yet. > > > > > > Signed-off-by: Haibo Xu <haibo.xu@linaro.org> > > > > My guess is that this doesn't work with a lot of other options; e.g. > > postcopy and probably compression and a bunch of other things. > > Postcopy I can see you'll need some interesting kernel changes for - > > you'd need to be able to atomically place a page with it's tag data. > > > > You probably need to add stuff to migrate_caps_check to disable > > features that you don't support. > > > > Hi David, > > Thanks so much for the comments! > > You are right, this RFC patch only supports pre-copy mode, no > postcopy, no compression. > As a RFC, here just want to finalize the tag migration process, that is: > 1. let the tag go with the page data(the current choice) which may be > a little complex to put > them into the current migration process. I think it's probably the easiest. > 2. migrate them separately which is easy to implement with the current > migration(treat the tags > as device status), but it would be hard to keep the page data and > tag to sync with each other. > 3. Any other ways? > > Once the tag migration process is finalized, a new formal patch series > with postcopy as well as > compression should be reworked. It's probably best to look what you would have to do to fit it in with those before finalising the basic version; not necessary implement them - just get an idea. > What's more, you mentioned that "some interesting kernel changes are > needed to atomically > place a page with it's tag data". 
You mean a single kernel API to > store page data and tag in > the migration load process? Yes; see migration/postcopy-ram.c postcopy_place_page - it has the problem of placing the whole page in memory before the guest starts usign it; I think for MTE you're going to have to put the MTE data in at the same time. Dave > > Regards, > Haibo > > > > --- > > > include/hw/arm/virt.h | 2 + > > > include/migration/misc.h | 1 + > > > migration/ram.c | 86 +++++++++++++++++++++++++++++++++++++++- > > > 3 files changed, 88 insertions(+), 1 deletion(-) > > > > > > diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h > > > index 921416f918..8b28cde8bf 100644 > > > --- a/include/hw/arm/virt.h > > > +++ b/include/hw/arm/virt.h > > > @@ -166,6 +166,8 @@ struct VirtMachineState { > > > PCIBus *bus; > > > char *oem_id; > > > char *oem_table_id; > > > + /* migrate memory tags */ > > > + NotifierWithReturn precopy_notifier; > > > }; > > > > > > #define VIRT_ECAM_ID(high) (high ? VIRT_HIGH_PCIE_ECAM : VIRT_PCIE_ECAM) > > > diff --git a/include/migration/misc.h b/include/migration/misc.h > > > index bccc1b6b44..005133f471 100644 > > > --- a/include/migration/misc.h > > > +++ b/include/migration/misc.h > > > @@ -38,6 +38,7 @@ void precopy_add_notifier(NotifierWithReturn *n); > > > void precopy_remove_notifier(NotifierWithReturn *n); > > > int precopy_notify(PrecopyNotifyReason reason, Error **errp); > > > void precopy_enable_free_page_optimization(void); > > > +void precopy_enable_metadata_migration(void); > > > > > > void ram_mig_init(void); > > > void qemu_guest_free_page_hint(void *addr, size_t len); > > > diff --git a/migration/ram.c b/migration/ram.c > > > index 72143da0ac..e67b798c3b 100644 > > > --- a/migration/ram.c > > > +++ b/migration/ram.c > > > @@ -53,10 +53,12 @@ > > > #include "block.h" > > > #include "sysemu/sysemu.h" > > > #include "sysemu/cpu-throttle.h" > > > +#include "sysemu/kvm.h" > > > #include "savevm.h" > > > #include "qemu/iov.h" > > > #include "multifd.h" 
> > > #include "sysemu/runstate.h" > > > +#include "kvm_arm.h" > > > > > > #if defined(__linux__) > > > #include "qemu/userfaultfd.h" > > > @@ -80,6 +82,9 @@ > > > #define RAM_SAVE_FLAG_XBZRLE 0x40 > > > /* 0x80 is reserved in migration.h start with 0x100 next */ > > > #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100 > > > +#define RAM_SAVE_FLAG_MTE 0x200 > > > > I think that's using the last free bit in the flags field, which is a > > bit painful. > > > > Yes, only 0x80(reserved) and 0x200 are available. > > > > +#define MTE_GRANULE_SIZE (16) > > > > > > static inline bool is_zero_range(uint8_t *p, uint64_t size) > > > { > > > @@ -317,6 +322,8 @@ struct RAMState { > > > bool ram_bulk_stage; > > > /* The free page optimization is enabled */ > > > bool fpo_enabled; > > > + /* The RAM meta data(e.t memory tag) is enabled */ > > > + bool metadata_enabled; > > > /* How many times we have dirty too many pages */ > > > int dirty_rate_high_cnt; > > > /* these variables are used for bitmap sync */ > > > @@ -394,6 +401,15 @@ void precopy_enable_free_page_optimization(void) > > > ram_state->fpo_enabled = true; > > > } > > > > > > +void precopy_enable_metadata_migration(void) > > > +{ > > > + if (!ram_state) { > > > + return; > > > + } > > > + > > > + ram_state->metadata_enabled = true; > > > +} > > > + > > > uint64_t ram_bytes_remaining(void) > > > { > > > return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : > > > @@ -1134,6 +1150,61 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, > > > return true; > > > } > > > > > > +static int save_normal_page_mte_tags(QEMUFile *f, uint8_t *addr) > > > +{ > > > + uint8_t *tag_buf = NULL; > > > + uint64_t ipa; > > > + int size = TARGET_PAGE_SIZE / MTE_GRANULE_SIZE; > > > > This function needs to be mostly somewhere aarch specific; or somehow > > made more generic. 
> > We shouldn't have RAM_SAVE_FLAG_MTE as well - we should be something > > like RAM_SAVE_FLAG_ARCH_METADATA and that way other architectures with > > something else glued onto pages can do something similar. > > Try and keep migration/ architecture independent. > > Yes, it should be arch independent and handled with a more abstract way. > > > > > > + if (kvm_physical_memory_addr_from_host(kvm_state, addr, &ipa)) { > > > + /* Buffer for the page tags(one byte per tag) */ > > > + tag_buf = g_try_malloc0(size); > > > > It feels like you want to allocate tag_buf in setup and free it at the > > end rather than doing this in every page. > > Yes, the tag buffer allocation could be moved to migration save/load > setup and cleanup function. > > > > > > + if (!tag_buf) { > > > + error_report("%s: Error allocating MTE tag_buf", __func__); > > > + return 0; > > > + } > > > + > > > + if (kvm_arm_mte_get_tags(ipa, TARGET_PAGE_SIZE, tag_buf) < 0) { > > > + error_report("%s: Can't get MTE tags from guest", __func__); > > > > For any error like this you probably want to say the addresses to make > > it easier to debug when it fails. > > > > Good suggestion! Will add the "address" info to the log. 
> > > > + g_free(tag_buf); > > > + return 0; > > > + } > > > + > > > + qemu_put_buffer(f, tag_buf, size); > > > + > > > + g_free(tag_buf); > > > + > > > + return size; > > > + } > > > + > > > + return 0; > > > +} > > > + > > > +static void load_normal_page_mte_tags(QEMUFile *f, uint8_t *addr) > > > +{ > > > + uint8_t *tag_buf = NULL; > > > + uint64_t ipa; > > > + int size = TARGET_PAGE_SIZE / MTE_GRANULE_SIZE; > > > + > > > + if (kvm_physical_memory_addr_from_host(kvm_state, addr, &ipa)) { > > > + /* Buffer for the page tags(one byte per tag) */ > > > + tag_buf = g_try_malloc0(size); > > > + if (!tag_buf) { > > > + error_report("%s: Error allocating MTE tag_buf", __func__); > > > + return; > > > + } > > > + > > > + qemu_get_buffer(f, tag_buf, size); > > > + if (kvm_arm_mte_set_tags(ipa, TARGET_PAGE_SIZE, tag_buf) < 0) { > > > > what protections are there here to stop you setting the mte on something > > useful, like part of the host kernel or qemu? > > Kernel would do some parameter check(e.t ipa within a valid slot) > before setting the tag data. 
> > > > > > + error_report("%s: Can't set MTE tags to guest", __func__); > > > + } > > > + > > > + g_free(tag_buf); > > > + } > > > + > > > + return; > > > +} > > > + > > > /* > > > * directly send the page to the stream > > > * > > > @@ -1148,6 +1219,10 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, > > > static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, > > > uint8_t *buf, bool async) > > > { > > > + if (rs->metadata_enabled) { > > > + offset |= RAM_SAVE_FLAG_MTE; > > > + } > > > + > > > ram_counters.transferred += save_page_header(rs, rs->f, block, > > > offset | RAM_SAVE_FLAG_PAGE); > > > if (async) { > > > @@ -1159,6 +1234,11 @@ static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, > > > } > > > ram_counters.transferred += TARGET_PAGE_SIZE; > > > ram_counters.normal++; > > > + > > > + if (rs->metadata_enabled) { > > > + ram_counters.transferred += save_normal_page_mte_tags(rs->f, buf); > > > + } > > > + > > > return 1; > > > } > > > > > > @@ -2189,6 +2269,7 @@ static void ram_state_reset(RAMState *rs) > > > rs->last_version = ram_list.version; > > > rs->ram_bulk_stage = true; > > > rs->fpo_enabled = false; > > > + rs->metadata_enabled = false; > > > } > > > > > > #define MAX_WAIT 50 /* ms, half buffered_file limit */ > > > @@ -3779,7 +3860,7 @@ static int ram_load_precopy(QEMUFile *f) > > > trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host); > > > } > > > > > > - switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { > > > + switch (flags & ~(RAM_SAVE_FLAG_CONTINUE | RAM_SAVE_FLAG_MTE)) { > > > case RAM_SAVE_FLAG_MEM_SIZE: > > > /* Synchronize RAM block list */ > > > total_ram_bytes = addr; > > > @@ -3849,6 +3930,9 @@ static int ram_load_precopy(QEMUFile *f) > > > > > > case RAM_SAVE_FLAG_PAGE: > > > qemu_get_buffer(f, host, TARGET_PAGE_SIZE); > > > + if (flags & RAM_SAVE_FLAG_MTE) { > > > + load_normal_page_mte_tags(f, host); > > > + } > > > break; > > > > > > case 
RAM_SAVE_FLAG_COMPRESS_PAGE: > > > -- > > > 2.17.1 > > > > > -- > > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK > > >
On Thu, 25 Mar 2021 at 23:37, Juan Quintela <quintela@redhat.com> wrote: > > Haibo Xu <haibo.xu@linaro.org> wrote: > > To make it easier to keep the page tags sync with > > the page data, tags for one page are appended to > > the data during ram save iteration. > > > > This patch only add the pre-copy migration support. > > Post-copy and compress as well as zero page saving > > are not supported yet. > > > > Signed-off-by: Haibo Xu <haibo.xu@linaro.org> > > > > #define RAM_SAVE_FLAG_XBZRLE 0x40 > > /* 0x80 is reserved in migration.h start with 0x100 next */ > > #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100 > > +#define RAM_SAVE_FLAG_MTE 0x200 > > Flags are really a scarce resource. You are using one here, when you > know that you will always have the feature enable (or not), so you can > do better during negotiation IMHO. > Yes, any suggestions are welcomed to finalize the MTE migration support! > > > +void precopy_enable_metadata_migration(void) > > +{ > > + if (!ram_state) { > > + return; > > + } > > + > > + ram_state->metadata_enabled = true; > > +} > > My understanding is that in your following patch, if mte is enabled, you > will always sent mte tags, for all pages needed, right? Yes, for the current kernel support, we can't tell whether the MTE was enabled for a specific page. > > > +static int save_normal_page_mte_tags(QEMUFile *f, uint8_t *addr) > > +{ > > + uint8_t *tag_buf = NULL; > > + uint64_t ipa; > > + int size = TARGET_PAGE_SIZE / MTE_GRANULE_SIZE; > > + > > + if (kvm_physical_memory_addr_from_host(kvm_state, addr, &ipa)) { > > + /* Buffer for the page tags(one byte per tag) */ > > + tag_buf = g_try_malloc0(size); > > size of the buffer is known at start of migration. Just get a buffer > and reuse it? Yes, we can pre-allocate the buffer during the migration setup time > > Do zero pages have mte tags? From migration point of view, a zero page > is a page that is just full of zeros, i.e. nothing else special. > Because you are not sending any for them. 
> Yes, I think we can do some optimization for the zero page migration support. > > > > @@ -1148,6 +1219,10 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, > > static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, > > uint8_t *buf, bool async) > > { > > + if (rs->metadata_enabled) { > > + offset |= RAM_SAVE_FLAG_MTE; > > You don't really need the flag, for you normal pages are just > TARGET_PAGE_SIZE + (TARGET_PAGE_SIZE/MTE_) > So you suggest to use the size field to indicate whether tags are available? > > > + } > > + > > ram_counters.transferred += save_page_header(rs, rs->f, block, > > offset | RAM_SAVE_FLAG_PAGE); > > if (async) { > > @@ -1159,6 +1234,11 @@ static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, > > } > > ram_counters.transferred += TARGET_PAGE_SIZE; > > ram_counters.normal++; > > + > > + if (rs->metadata_enabled) { > > See? You are not checking the flag, you are checking the bool setup at > the beggining of migration. The idea is to only migrate the memory tags when MTE was enabled for the VM. Could you be more elaborate on "You are not checking the flag"? > > > + ram_counters.transferred += save_normal_page_mte_tags(rs->f, buf); > > + } > > + > > return 1; > > } > > > > @@ -2189,6 +2269,7 @@ static void ram_state_reset(RAMState *rs) > > rs->last_version = ram_list.version; > > rs->ram_bulk_stage = true; > > rs->fpo_enabled = false; > > + rs->metadata_enabled = false; > > } > > > > #define MAX_WAIT 50 /* ms, half buffered_file limit */ > > @@ -3779,7 +3860,7 @@ static int ram_load_precopy(QEMUFile *f) > > trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host); > > } > > > > - switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { > > + switch (flags & ~(RAM_SAVE_FLAG_CONTINUE | RAM_SAVE_FLAG_MTE)) { > > Creating the flag is hurting you here also. 
> > > case RAM_SAVE_FLAG_MEM_SIZE: > > /* Synchronize RAM block list */ > > total_ram_bytes = addr; > > @@ -3849,6 +3930,9 @@ static int ram_load_precopy(QEMUFile *f) > > > > case RAM_SAVE_FLAG_PAGE: > > qemu_get_buffer(f, host, TARGET_PAGE_SIZE); > > + if (flags & RAM_SAVE_FLAG_MTE) { > > + load_normal_page_mte_tags(f, host); > > + } > > I don't claim to understand the MTE, but my understanding is that if we > are using MTE, all pages have to have MTE flags, right? > > So, somtehing like > > is_mte_enabled() > > that I told in the other thread looks like a good idea. > Yes, we do need a function like this. > Later, Juan. > > > break; > > > > case RAM_SAVE_FLAG_COMPRESS_PAGE: >
On Fri, 26 Mar 2021 at 03:38, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: > > * Haibo Xu (haibo.xu@linaro.org) wrote: > > On Thu, 18 Mar 2021 at 04:11, Dr. David Alan Gilbert > > <dgilbert@redhat.com> wrote: > > > > > > * Haibo Xu (haibo.xu@linaro.org) wrote: > > > > To make it easier to keep the page tags sync with > > > > the page data, tags for one page are appended to > > > > the data during ram save iteration. > > > > > > > > This patch only add the pre-copy migration support. > > > > Post-copy and compress as well as zero page saving > > > > are not supported yet. > > > > > > > > Signed-off-by: Haibo Xu <haibo.xu@linaro.org> > > > > > > My guess is that this doesn't work with a lot of other options; e.g. > > > postcopy and probably compression and a bunch of other things. > > > Postcopy I can see you'll need some interesting kernel changes for - > > > you'd need to be able to atomically place a page with it's tag data. > > > > > > You probably need to add stuff to migrate_caps_check to disable > > > features that you don't support. > > > > > > > Hi David, > > > > Thanks so much for the comments! > > > > You are right, this RFC patch only supports pre-copy mode, no > > postcopy, no compression. > > As a RFC, here just want to finalize the tag migration process, that is: > > 1. let the tag go with the page data(the current choice) which may be > > a little complex to put > > them into the current migration process. > > I think it's probably the easiest. > Agree! > > 2. migrate them separately which is easy to implement with the current > > migration(treat the tags > > as device status), but it would be hard to keep the page data and > > tag to sync with each other. > > 3. Any other ways? > > > > Once the tag migration process is finalized, a new formal patch series > > with postcopy as well as > > compression should be reworked. 
> > It's probably best to look what you would have to do to fit it in with > those before finalising the basic version; not necessary implement them > - just get an idea. > Thanks for the suggestion! > > What's more, you mentioned that "some interesting kernel changes are > > needed to atomically > > place a page with it's tag data". You mean a single kernel API to > > store page data and tag in > > the migration load process? > > Yes; see migration/postcopy-ram.c postcopy_place_page - it has the > problem of placing the whole page in memory before the guest starts > usign it; I think for MTE you're going to have to put the MTE data in at > the same time. > Good point! Will check this with the kernel guys. > Dave > > > > > Regards, > > Haibo > > > > > > --- > > > > include/hw/arm/virt.h | 2 + > > > > include/migration/misc.h | 1 + > > > > migration/ram.c | 86 +++++++++++++++++++++++++++++++++++++++- > > > > 3 files changed, 88 insertions(+), 1 deletion(-) > > > > > > > > diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h > > > > index 921416f918..8b28cde8bf 100644 > > > > --- a/include/hw/arm/virt.h > > > > +++ b/include/hw/arm/virt.h > > > > @@ -166,6 +166,8 @@ struct VirtMachineState { > > > > PCIBus *bus; > > > > char *oem_id; > > > > char *oem_table_id; > > > > + /* migrate memory tags */ > > > > + NotifierWithReturn precopy_notifier; > > > > }; > > > > > > > > #define VIRT_ECAM_ID(high) (high ? 
VIRT_HIGH_PCIE_ECAM : VIRT_PCIE_ECAM) > > > > diff --git a/include/migration/misc.h b/include/migration/misc.h > > > > index bccc1b6b44..005133f471 100644 > > > > --- a/include/migration/misc.h > > > > +++ b/include/migration/misc.h > > > > @@ -38,6 +38,7 @@ void precopy_add_notifier(NotifierWithReturn *n); > > > > void precopy_remove_notifier(NotifierWithReturn *n); > > > > int precopy_notify(PrecopyNotifyReason reason, Error **errp); > > > > void precopy_enable_free_page_optimization(void); > > > > +void precopy_enable_metadata_migration(void); > > > > > > > > void ram_mig_init(void); > > > > void qemu_guest_free_page_hint(void *addr, size_t len); > > > > diff --git a/migration/ram.c b/migration/ram.c > > > > index 72143da0ac..e67b798c3b 100644 > > > > --- a/migration/ram.c > > > > +++ b/migration/ram.c > > > > @@ -53,10 +53,12 @@ > > > > #include "block.h" > > > > #include "sysemu/sysemu.h" > > > > #include "sysemu/cpu-throttle.h" > > > > +#include "sysemu/kvm.h" > > > > #include "savevm.h" > > > > #include "qemu/iov.h" > > > > #include "multifd.h" > > > > #include "sysemu/runstate.h" > > > > +#include "kvm_arm.h" > > > > > > > > #if defined(__linux__) > > > > #include "qemu/userfaultfd.h" > > > > @@ -80,6 +82,9 @@ > > > > #define RAM_SAVE_FLAG_XBZRLE 0x40 > > > > /* 0x80 is reserved in migration.h start with 0x100 next */ > > > > #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100 > > > > +#define RAM_SAVE_FLAG_MTE 0x200 > > > > > > I think that's using the last free bit in the flags field, which is a > > > bit painful. > > > > > > > Yes, only 0x80(reserved) and 0x200 are available. 
> > > > > > +#define MTE_GRANULE_SIZE (16) > > > > > > > > static inline bool is_zero_range(uint8_t *p, uint64_t size) > > > > { > > > > @@ -317,6 +322,8 @@ struct RAMState { > > > > bool ram_bulk_stage; > > > > /* The free page optimization is enabled */ > > > > bool fpo_enabled; > > > > + /* The RAM meta data(e.t memory tag) is enabled */ > > > > + bool metadata_enabled; > > > > /* How many times we have dirty too many pages */ > > > > int dirty_rate_high_cnt; > > > > /* these variables are used for bitmap sync */ > > > > @@ -394,6 +401,15 @@ void precopy_enable_free_page_optimization(void) > > > > ram_state->fpo_enabled = true; > > > > } > > > > > > > > +void precopy_enable_metadata_migration(void) > > > > +{ > > > > + if (!ram_state) { > > > > + return; > > > > + } > > > > + > > > > + ram_state->metadata_enabled = true; > > > > +} > > > > + > > > > uint64_t ram_bytes_remaining(void) > > > > { > > > > return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : > > > > @@ -1134,6 +1150,61 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, > > > > return true; > > > > } > > > > > > > > +static int save_normal_page_mte_tags(QEMUFile *f, uint8_t *addr) > > > > +{ > > > > + uint8_t *tag_buf = NULL; > > > > + uint64_t ipa; > > > > + int size = TARGET_PAGE_SIZE / MTE_GRANULE_SIZE; > > > > > > This function needs to be mostly somewhere aarch specific; or somehow > > > made more generic. > > > We shouldn't have RAM_SAVE_FLAG_MTE as well - we should be something > > > like RAM_SAVE_FLAG_ARCH_METADATA and that way other architectures with > > > something else glued onto pages can do something similar. > > > Try and keep migration/ architecture independent. > > > > Yes, it should be arch independent and handled with a more abstract way. 
> > > > > > > > > + if (kvm_physical_memory_addr_from_host(kvm_state, addr, &ipa)) { > > > > + /* Buffer for the page tags(one byte per tag) */ > > > > + tag_buf = g_try_malloc0(size); > > > > > > It feels like you want to allocate tag_buf in setup and free it at the > > > end rather than doing this in every page. > > > > Yes, the tag buffer allocation could be moved to migration save/load > > setup and cleanup function. > > > > > > > > > + if (!tag_buf) { > > > > + error_report("%s: Error allocating MTE tag_buf", __func__); > > > > + return 0; > > > > + } > > > > + > > > > + if (kvm_arm_mte_get_tags(ipa, TARGET_PAGE_SIZE, tag_buf) < 0) { > > > > + error_report("%s: Can't get MTE tags from guest", __func__); > > > > > > For any error like this you probably want to say the addresses to make > > > it easier to debug when it fails. > > > > > > > Good suggestion! Will add the "address" info to the log. > > > > > > + g_free(tag_buf); > > > > + return 0; > > > > + } > > > > + > > > > + qemu_put_buffer(f, tag_buf, size); > > > > + > > > > + g_free(tag_buf); > > > > + > > > > + return size; > > > > + } > > > > + > > > > + return 0; > > > > +} > > > > + > > > > +static void load_normal_page_mte_tags(QEMUFile *f, uint8_t *addr) > > > > +{ > > > > + uint8_t *tag_buf = NULL; > > > > + uint64_t ipa; > > > > + int size = TARGET_PAGE_SIZE / MTE_GRANULE_SIZE; > > > > + > > > > + if (kvm_physical_memory_addr_from_host(kvm_state, addr, &ipa)) { > > > > + /* Buffer for the page tags(one byte per tag) */ > > > > + tag_buf = g_try_malloc0(size); > > > > + if (!tag_buf) { > > > > + error_report("%s: Error allocating MTE tag_buf", __func__); > > > > + return; > > > > + } > > > > + > > > > + qemu_get_buffer(f, tag_buf, size); > > > > + if (kvm_arm_mte_set_tags(ipa, TARGET_PAGE_SIZE, tag_buf) < 0) { > > > > > > what protections are there here to stop you setting the mte on something > > > useful, like part of the host kernel or qemu? 
> > > > Kernel would do some parameter check(e.t ipa within a valid slot) > > before setting the tag data. > > > > > > > > > + error_report("%s: Can't set MTE tags to guest", __func__); > > > > + } > > > > + > > > > + g_free(tag_buf); > > > > + } > > > > + > > > > + return; > > > > +} > > > > + > > > > /* > > > > * directly send the page to the stream > > > > * > > > > @@ -1148,6 +1219,10 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, > > > > static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, > > > > uint8_t *buf, bool async) > > > > { > > > > + if (rs->metadata_enabled) { > > > > + offset |= RAM_SAVE_FLAG_MTE; > > > > + } > > > > + > > > > ram_counters.transferred += save_page_header(rs, rs->f, block, > > > > offset | RAM_SAVE_FLAG_PAGE); > > > > if (async) { > > > > @@ -1159,6 +1234,11 @@ static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, > > > > } > > > > ram_counters.transferred += TARGET_PAGE_SIZE; > > > > ram_counters.normal++; > > > > + > > > > + if (rs->metadata_enabled) { > > > > + ram_counters.transferred += save_normal_page_mte_tags(rs->f, buf); > > > > + } > > > > + > > > > return 1; > > > > } > > > > > > > > @@ -2189,6 +2269,7 @@ static void ram_state_reset(RAMState *rs) > > > > rs->last_version = ram_list.version; > > > > rs->ram_bulk_stage = true; > > > > rs->fpo_enabled = false; > > > > + rs->metadata_enabled = false; > > > > } > > > > > > > > #define MAX_WAIT 50 /* ms, half buffered_file limit */ > > > > @@ -3779,7 +3860,7 @@ static int ram_load_precopy(QEMUFile *f) > > > > trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host); > > > > } > > > > > > > > - switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { > > > > + switch (flags & ~(RAM_SAVE_FLAG_CONTINUE | RAM_SAVE_FLAG_MTE)) { > > > > case RAM_SAVE_FLAG_MEM_SIZE: > > > > /* Synchronize RAM block list */ > > > > total_ram_bytes = addr; > > > > @@ -3849,6 +3930,9 @@ static int ram_load_precopy(QEMUFile *f) > 
> > > > > > > case RAM_SAVE_FLAG_PAGE: > > > > qemu_get_buffer(f, host, TARGET_PAGE_SIZE); > > > > + if (flags & RAM_SAVE_FLAG_MTE) { > > > > + load_normal_page_mte_tags(f, host); > > > > + } > > > > break; > > > > > > > > case RAM_SAVE_FLAG_COMPRESS_PAGE: > > > > -- > > > > 2.17.1 > > > > > > > -- > > > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK > > > > > > -- > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK >
diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index 921416f918..8b28cde8bf 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -166,6 +166,8 @@ struct VirtMachineState { PCIBus *bus; char *oem_id; char *oem_table_id; + /* migrate memory tags */ + NotifierWithReturn precopy_notifier; }; #define VIRT_ECAM_ID(high) (high ? VIRT_HIGH_PCIE_ECAM : VIRT_PCIE_ECAM) diff --git a/include/migration/misc.h b/include/migration/misc.h index bccc1b6b44..005133f471 100644 --- a/include/migration/misc.h +++ b/include/migration/misc.h @@ -38,6 +38,7 @@ void precopy_add_notifier(NotifierWithReturn *n); void precopy_remove_notifier(NotifierWithReturn *n); int precopy_notify(PrecopyNotifyReason reason, Error **errp); void precopy_enable_free_page_optimization(void); +void precopy_enable_metadata_migration(void); void ram_mig_init(void); void qemu_guest_free_page_hint(void *addr, size_t len); diff --git a/migration/ram.c b/migration/ram.c index 72143da0ac..e67b798c3b 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -53,10 +53,12 @@ #include "block.h" #include "sysemu/sysemu.h" #include "sysemu/cpu-throttle.h" +#include "sysemu/kvm.h" #include "savevm.h" #include "qemu/iov.h" #include "multifd.h" #include "sysemu/runstate.h" +#include "kvm_arm.h" #if defined(__linux__) #include "qemu/userfaultfd.h" @@ -80,6 +82,9 @@ #define RAM_SAVE_FLAG_XBZRLE 0x40 /* 0x80 is reserved in migration.h start with 0x100 next */ #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100 +#define RAM_SAVE_FLAG_MTE 0x200 + +#define MTE_GRANULE_SIZE (16) static inline bool is_zero_range(uint8_t *p, uint64_t size) { @@ -317,6 +322,8 @@ struct RAMState { bool ram_bulk_stage; /* The free page optimization is enabled */ bool fpo_enabled; + /* The RAM meta data(e.t memory tag) is enabled */ + bool metadata_enabled; /* How many times we have dirty too many pages */ int dirty_rate_high_cnt; /* these variables are used for bitmap sync */ @@ -394,6 +401,15 @@ void 
precopy_enable_free_page_optimization(void) ram_state->fpo_enabled = true; } +void precopy_enable_metadata_migration(void) +{ + if (!ram_state) { + return; + } + + ram_state->metadata_enabled = true; +} + uint64_t ram_bytes_remaining(void) { return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : @@ -1134,6 +1150,61 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, return true; } +static int save_normal_page_mte_tags(QEMUFile *f, uint8_t *addr) +{ + uint8_t *tag_buf = NULL; + uint64_t ipa; + int size = TARGET_PAGE_SIZE / MTE_GRANULE_SIZE; + + if (kvm_physical_memory_addr_from_host(kvm_state, addr, &ipa)) { + /* Buffer for the page tags(one byte per tag) */ + tag_buf = g_try_malloc0(size); + if (!tag_buf) { + error_report("%s: Error allocating MTE tag_buf", __func__); + return 0; + } + + if (kvm_arm_mte_get_tags(ipa, TARGET_PAGE_SIZE, tag_buf) < 0) { + error_report("%s: Can't get MTE tags from guest", __func__); + g_free(tag_buf); + return 0; + } + + qemu_put_buffer(f, tag_buf, size); + + g_free(tag_buf); + + return size; + } + + return 0; +} + +static void load_normal_page_mte_tags(QEMUFile *f, uint8_t *addr) +{ + uint8_t *tag_buf = NULL; + uint64_t ipa; + int size = TARGET_PAGE_SIZE / MTE_GRANULE_SIZE; + + if (kvm_physical_memory_addr_from_host(kvm_state, addr, &ipa)) { + /* Buffer for the page tags(one byte per tag) */ + tag_buf = g_try_malloc0(size); + if (!tag_buf) { + error_report("%s: Error allocating MTE tag_buf", __func__); + return; + } + + qemu_get_buffer(f, tag_buf, size); + if (kvm_arm_mte_set_tags(ipa, TARGET_PAGE_SIZE, tag_buf) < 0) { + error_report("%s: Can't set MTE tags to guest", __func__); + } + + g_free(tag_buf); + } + + return; +} + /* * directly send the page to the stream * @@ -1148,6 +1219,10 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, uint8_t *buf, bool async) { + if 
(rs->metadata_enabled) { + offset |= RAM_SAVE_FLAG_MTE; + } + ram_counters.transferred += save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE); if (async) { @@ -1159,6 +1234,11 @@ static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, } ram_counters.transferred += TARGET_PAGE_SIZE; ram_counters.normal++; + + if (rs->metadata_enabled) { + ram_counters.transferred += save_normal_page_mte_tags(rs->f, buf); + } + return 1; } @@ -2189,6 +2269,7 @@ static void ram_state_reset(RAMState *rs) rs->last_version = ram_list.version; rs->ram_bulk_stage = true; rs->fpo_enabled = false; + rs->metadata_enabled = false; } #define MAX_WAIT 50 /* ms, half buffered_file limit */ @@ -3779,7 +3860,7 @@ static int ram_load_precopy(QEMUFile *f) trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host); } - switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { + switch (flags & ~(RAM_SAVE_FLAG_CONTINUE | RAM_SAVE_FLAG_MTE)) { case RAM_SAVE_FLAG_MEM_SIZE: /* Synchronize RAM block list */ total_ram_bytes = addr; @@ -3849,6 +3930,9 @@ static int ram_load_precopy(QEMUFile *f) case RAM_SAVE_FLAG_PAGE: qemu_get_buffer(f, host, TARGET_PAGE_SIZE); + if (flags & RAM_SAVE_FLAG_MTE) { + load_normal_page_mte_tags(f, host); + } break; case RAM_SAVE_FLAG_COMPRESS_PAGE:
To make it easier to keep the page tags in sync with the page data, tags for one page are appended to the data during ram save iteration. This patch only adds pre-copy migration support. Post-copy and compression as well as zero page saving are not supported yet. Signed-off-by: Haibo Xu <haibo.xu@linaro.org> --- include/hw/arm/virt.h | 2 + include/migration/misc.h | 1 + migration/ram.c | 86 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 88 insertions(+), 1 deletion(-)