diff mbox series

[RFC] QEMU may write to system_memory before guest starts

Message ID 20190314110347.7533-1-yury-kotov@yandex-team.ru (mailing list archive)
State New, archived
Headers show
Series [RFC] QEMU may write to system_memory before guest starts | expand

Commit Message

Yury Kotov March 14, 2019, 11:03 a.m. UTC
This patch isn't intended to merge. Just to reproduce a problem.

The test for x-ignore-shread capability fails on aarch64 + tcg:
Memory content inconsistency at 44c00000 first_byte = 2 last_byte = 1 current = d1 hit_edge = 1
Memory content inconsistency at 44c01000 first_byte = 2 last_byte = 1 current = 77 hit_edge = 1

I expected that QEMU doesn't write to guest RAM until VM starts, but it
happens in this test. By this patch I found what causes this problem.

Backtrace:
0  0x00007fb9e40affea in __memcpy_avx_unaligned () at ../sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S:118
1  0x000055f4a296dd84 in address_space_write_rom_internal () at exec.c:3458
2  0x000055f4a296de3a in address_space_write_rom () at exec.c:3479
3  0x000055f4a2d519ff in rom_reset () at hw/core/loader.c:1101
4  0x000055f4a2d475ec in qemu_devices_reset () at hw/core/reset.c:69
5  0x000055f4a2c90a28 in qemu_system_reset () at vl.c:1675
6  0x000055f4a2c9851d in main () at vl.c:4552

To fix this particular we can skip these writes for the first system_reset
if -incoming is set. But I'm not sure how to fix this problem in general.
May be to introduce a contract which forbids to write to system_ram in such
case?

What do you think?

Signed-off-by: Yury Kotov <yury-kotov@yandex-team.ru>
---
 backends/hostmem-file.c   |  2 +-
 exec.c                    | 15 +++++++++++++--
 include/exec/cpu-common.h |  2 ++
 include/exec/memory.h     |  3 +++
 include/qemu/mmap-alloc.h |  2 +-
 migration/ram.c           |  2 ++
 util/mmap-alloc.c         |  6 ++++--
 util/oslib-posix.c        |  2 +-
 vl.c                      |  8 ++++++++
 9 files changed, 35 insertions(+), 7 deletions(-)

Comments

Jia He March 19, 2019, 7:35 a.m. UTC | #1
Thanks Yury.

On 2019/3/14 19:03, Yury Kotov wrote:
> This patch isn't intended to merge. Just to reproduce a problem.
>
> The test for x-ignore-shread capability fails on aarch64 + tcg:
> Memory content inconsistency at 44c00000 first_byte = 2 last_byte = 1 current = d1 hit_edge = 1
> Memory content inconsistency at 44c01000 first_byte = 2 last_byte = 1 current = 77 hit_edge = 1
>
> I expected that QEMU doesn't write to guest RAM until VM starts, but it
> happens in this test. By this patch I found what causes this problem.
>
> Backtrace:
> 0  0x00007fb9e40affea in __memcpy_avx_unaligned () at ../sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S:118
> 1  0x000055f4a296dd84 in address_space_write_rom_internal () at exec.c:3458
> 2  0x000055f4a296de3a in address_space_write_rom () at exec.c:3479
> 3  0x000055f4a2d519ff in rom_reset () at hw/core/loader.c:1101
> 4  0x000055f4a2d475ec in qemu_devices_reset () at hw/core/reset.c:69
> 5  0x000055f4a2c90a28 in qemu_system_reset () at vl.c:1675
> 6  0x000055f4a2c9851d in main () at vl.c:4552

more than that

even in kvm mode, there is similar writing, and cause exception when 
destination VM is restoring.

This is the backtrace with your debugging patch

(gdb) bt
#0  __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:51
#1  0x0000ffffbe7988b4 in __GI_abort () at abort.c:79
#2  0x0000aaaaaae41c88 in kvm_set_phys_mem (kml=0xaaaaabef7b88, 
section=0xffffffffe0c0, add=true) at 
/root/hj/q_migrate/qemu/accel/kvm/kvm-all.c:821
#3  0x0000aaaaaae41d0c in kvm_region_add (listener=0xaaaaabef7b88, 
section=0xffffffffe0c0) at /root/hj/q_migrate/qemu/accel/kvm/kvm-all.c:831
#4  0x0000aaaaaae22ca0 in address_space_update_topology_pass 
(as=0xaaaaabd16ce0 <address_space_memory>, old_view=0xaaaaabeec150, 
new_view=0xaaaaac03d3e0, adding=true)
     at /root/hj/q_migrate/qemu/memory.c:958
#5  0x0000aaaaaae22fc4 in address_space_set_flatview (as=0xaaaaabd16ce0 
<address_space_memory>) at /root/hj/q_migrate/qemu/memory.c:1034
#6  0x0000aaaaaae231e0 in memory_region_transaction_commit () at 
/root/hj/q_migrate/qemu/memory.c:1086
#7  0x0000aaaaaae26c90 in memory_region_update_container_subregions 
(subregion=0xaaaaabd81290) at /root/hj/q_migrate/qemu/memory.c:2358
#8  0x0000aaaaaae26d04 in memory_region_add_subregion_common 
(mr=0xaaaaabe43000, offset=1073741824, subregion=0xaaaaabd81290) at 
/root/hj/q_migrate/qemu/memory.c:2368
#9  0x0000aaaaaae26d3c in memory_region_add_subregion 
(mr=0xaaaaabe43000, offset=1073741824, subregion=0xaaaaabd81290) at 
/root/hj/q_migrate/qemu/memory.c:2376
#10 0x0000aaaaaaf63570 in machvirt_init (machine=0xaaaaabeb9c00) at 
/root/hj/q_migrate/qemu/hw/arm/virt.c:1611
#11 0x0000aaaaab1ff558 in machine_run_board_init 
(machine=0xaaaaabeb9c00) at hw/core/machine.c:965
#12 0x0000aaaaab14070c in main (argc=51, argv=0xffffffffee78, 
envp=0xfffffffff018) at vl.c:4459

---

Cheers,

Jia He

> To fix this particular we can skip these writes for the first system_reset
> if -incoming is set. But I'm not sure how to fix this problem in general.
> May be to introduce a contract which forbids to write to system_ram in such
> case?
>
> What do you think?
>
> Signed-off-by: Yury Kotov <yury-kotov@yandex-team.ru>
> ---
>   backends/hostmem-file.c   |  2 +-
>   exec.c                    | 15 +++++++++++++--
>   include/exec/cpu-common.h |  2 ++
>   include/exec/memory.h     |  3 +++
>   include/qemu/mmap-alloc.h |  2 +-
>   migration/ram.c           |  2 ++
>   util/mmap-alloc.c         |  6 ++++--
>   util/oslib-posix.c        |  2 +-
>   vl.c                      |  8 ++++++++
>   9 files changed, 35 insertions(+), 7 deletions(-)
>
> diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c
> index ce54788048..146fa2bc70 100644
> --- a/backends/hostmem-file.c
> +++ b/backends/hostmem-file.c
> @@ -61,7 +61,7 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
>       memory_region_init_ram_from_file(&backend->mr, OBJECT(backend),
>                                        name,
>                                        backend->size, fb->align,
> -                                     (backend->share ? RAM_SHARED : 0) |
> +                                     (backend->share ? (RAM_SHARED | RAM_READONLY) : 0) |
>                                        (fb->is_pmem ? RAM_PMEM : 0),
>                                        fb->mem_path, errp);
>       g_free(name);
> diff --git a/exec.c b/exec.c
> index 1d4f3784d6..cd86e9b837 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -1863,7 +1863,7 @@ static void *file_ram_alloc(RAMBlock *block,
>       }
>   
>       area = qemu_ram_mmap(fd, memory, block->mr->align,
> -                         block->flags & RAM_SHARED);
> +                         block->flags & RAM_SHARED, block->flags & RAM_READONLY);
>       if (area == MAP_FAILED) {
>           error_setg_errno(errp, errno,
>                            "unable to map backing store for guest RAM");
> @@ -2259,7 +2259,7 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
>       int64_t file_size;
>   
>       /* Just support these ram flags by now. */
> -    assert((ram_flags & ~(RAM_SHARED | RAM_PMEM)) == 0);
> +    assert((ram_flags & ~(RAM_SHARED | RAM_PMEM | RAM_READONLY)) == 0);
>   
>       if (xen_enabled()) {
>           error_setg(errp, "-mem-path not supported with Xen");
> @@ -2485,6 +2485,17 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
>           }
>       }
>   }
> +
> +void qemu_ram_unprotect_all(void)
> +{
> +    RAMBlock *block;
> +    rcu_read_lock();
> +    RAMBLOCK_FOREACH(block) {
> +        int ret = mprotect(block->host, block->max_length, PROT_READ | PROT_WRITE);
> +        assert(ret == 0);
> +    }
> +    rcu_read_unlock();
> +}
>   #endif /* !_WIN32 */
>   
>   /* Return a host pointer to ram allocated with qemu_ram_alloc.
> diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
> index cef8b88a2a..2ad875be06 100644
> --- a/include/exec/cpu-common.h
> +++ b/include/exec/cpu-common.h
> @@ -63,6 +63,8 @@ typedef void CPUWriteMemoryFunc(void *opaque, hwaddr addr, uint32_t value);
>   typedef uint32_t CPUReadMemoryFunc(void *opaque, hwaddr addr);
>   
>   void qemu_ram_remap(ram_addr_t addr, ram_addr_t length);
> +void qemu_ram_unprotect_all(void);
> +
>   /* This should not be used by devices.  */
>   ram_addr_t qemu_ram_addr_from_host(void *ptr);
>   RAMBlock *qemu_ram_block_by_name(const char *name);
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index 1625913f84..b1cb5a48d7 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -126,6 +126,9 @@ typedef struct IOMMUNotifier IOMMUNotifier;
>   /* RAM is a persistent kind memory */
>   #define RAM_PMEM (1 << 5)
>   
> +/* RAM is readonly*/
> +#define RAM_READONLY (1 << 6)
> +
>   static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn,
>                                          IOMMUNotifierFlag flags,
>                                          hwaddr start, hwaddr end,
> diff --git a/include/qemu/mmap-alloc.h b/include/qemu/mmap-alloc.h
> index ef04f0ed5b..f704d9b7e3 100644
> --- a/include/qemu/mmap-alloc.h
> +++ b/include/qemu/mmap-alloc.h
> @@ -7,7 +7,7 @@ size_t qemu_fd_getpagesize(int fd);
>   
>   size_t qemu_mempath_getpagesize(const char *mem_path);
>   
> -void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared);
> +void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared, bool readonly);
>   
>   void qemu_ram_munmap(int fd, void *ptr, size_t size);
>   
> diff --git a/migration/ram.c b/migration/ram.c
> index 35bd6213e9..252fc80e6c 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -4211,6 +4211,8 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
>        */
>       rcu_read_lock();
>   
> +    qemu_ram_unprotect_all();
> +
>       if (postcopy_running) {
>           ret = ram_load_postcopy(f);
>       }
> diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c
> index 8565885420..7dc194d909 100644
> --- a/util/mmap-alloc.c
> +++ b/util/mmap-alloc.c
> @@ -75,9 +75,10 @@ size_t qemu_mempath_getpagesize(const char *mem_path)
>       return getpagesize();
>   }
>   
> -void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared)
> +void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared, bool readonly)
>   {
>       int flags;
> +    int prots;
>       int guardfd;
>       size_t offset;
>       size_t pagesize;
> @@ -128,9 +129,10 @@ void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared)
>       flags = MAP_FIXED;
>       flags |= fd == -1 ? MAP_ANONYMOUS : 0;
>       flags |= shared ? MAP_SHARED : MAP_PRIVATE;
> +    prots = PROT_READ | (readonly ? 0 : PROT_WRITE);
>       offset = QEMU_ALIGN_UP((uintptr_t)guardptr, align) - (uintptr_t)guardptr;
>   
> -    ptr = mmap(guardptr + offset, size, PROT_READ | PROT_WRITE, flags, fd, 0);
> +    ptr = mmap(guardptr + offset, size, prots, flags, fd, 0);
>   
>       if (ptr == MAP_FAILED) {
>           munmap(guardptr, total);
> diff --git a/util/oslib-posix.c b/util/oslib-posix.c
> index 37c5854b9c..c57466f8d9 100644
> --- a/util/oslib-posix.c
> +++ b/util/oslib-posix.c
> @@ -203,7 +203,7 @@ void *qemu_memalign(size_t alignment, size_t size)
>   void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment, bool shared)
>   {
>       size_t align = QEMU_VMALLOC_ALIGN;
> -    void *ptr = qemu_ram_mmap(-1, size, align, shared);
> +    void *ptr = qemu_ram_mmap(-1, size, align, shared, false);
>   
>       if (ptr == MAP_FAILED) {
>           return NULL;
> diff --git a/vl.c b/vl.c
> index 4c5cc0d8ad..2cc675ad21 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -2950,6 +2950,12 @@ static void register_global_properties(MachineState *ms)
>       user_register_global_props();
>   }
>   
> +static void unprotect_ram(void *opaque)
> +{
> +    error_report(__func__);
> +    qemu_ram_unprotect_all();
> +}
> +
>   int main(int argc, char **argv, char **envp)
>   {
>       int i;
> @@ -4537,6 +4543,8 @@ int main(int argc, char **argv, char **envp)
>   
>       replay_start();
>   
> +    qemu_register_reset(unprotect_ram, NULL);
> +
>       /* This checkpoint is required by replay to separate prior clock
>          reading from the other reads, because timer polling functions query
>          clock values from the log. */
Peter Maydell March 19, 2019, 9:27 a.m. UTC | #2
On Tue, 19 Mar 2019 at 07:36, Jia He <hejianet@gmail.com> wrote:
>
> Thanks Yury.
>
> On 2019/3/14 19:03, Yury Kotov wrote:
> > This patch isn't intended to merge. Just to reproduce a problem.
> >
> > The test for x-ignore-shread capability fails on aarch64 + tcg:
> > Memory content inconsistency at 44c00000 first_byte = 2 last_byte = 1 current = d1 hit_edge = 1
> > Memory content inconsistency at 44c01000 first_byte = 2 last_byte = 1 current = 77 hit_edge = 1
> >
> > I expected that QEMU doesn't write to guest RAM until VM starts, but it
> > happens in this test. By this patch I found what causes this problem.
> >
> > Backtrace:
> > 0  0x00007fb9e40affea in __memcpy_avx_unaligned () at ../sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S:118
> > 1  0x000055f4a296dd84 in address_space_write_rom_internal () at exec.c:3458
> > 2  0x000055f4a296de3a in address_space_write_rom () at exec.c:3479
> > 3  0x000055f4a2d519ff in rom_reset () at hw/core/loader.c:1101
> > 4  0x000055f4a2d475ec in qemu_devices_reset () at hw/core/reset.c:69
> > 5  0x000055f4a2c90a28 in qemu_system_reset () at vl.c:1675
> > 6  0x000055f4a2c9851d in main () at vl.c:4552

This is expected -- as part of reset, we write the contents
of ROM blobs, ELF files loaded through -kernel, etc, to the
guest memory. It's not clear to me why that's causing a problem?

> more than that
>
> even in kvm mode, there is similar writing, and cause exception when
> destination VM is restoring.
>
> This is the backtrace with your debugging patch
>
> (gdb) bt
> #0  __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:51
> #1  0x0000ffffbe7988b4 in __GI_abort () at abort.c:79
> #2  0x0000aaaaaae41c88 in kvm_set_phys_mem (kml=0xaaaaabef7b88,
> section=0xffffffffe0c0, add=true) at
> /root/hj/q_migrate/qemu/accel/kvm/kvm-all.c:821
> #3  0x0000aaaaaae41d0c in kvm_region_add (listener=0xaaaaabef7b88,
> section=0xffffffffe0c0) at /root/hj/q_migrate/qemu/accel/kvm/kvm-all.c:831
> #4  0x0000aaaaaae22ca0 in address_space_update_topology_pass
> (as=0xaaaaabd16ce0 <address_space_memory>, old_view=0xaaaaabeec150,
> new_view=0xaaaaac03d3e0, adding=true)
>      at /root/hj/q_migrate/qemu/memory.c:958
> #5  0x0000aaaaaae22fc4 in address_space_set_flatview (as=0xaaaaabd16ce0
> <address_space_memory>) at /root/hj/q_migrate/qemu/memory.c:1034
> #6  0x0000aaaaaae231e0 in memory_region_transaction_commit () at
> /root/hj/q_migrate/qemu/memory.c:1086
> #7  0x0000aaaaaae26c90 in memory_region_update_container_subregions
> (subregion=0xaaaaabd81290) at /root/hj/q_migrate/qemu/memory.c:2358
> #8  0x0000aaaaaae26d04 in memory_region_add_subregion_common
> (mr=0xaaaaabe43000, offset=1073741824, subregion=0xaaaaabd81290) at
> /root/hj/q_migrate/qemu/memory.c:2368
> #9  0x0000aaaaaae26d3c in memory_region_add_subregion
> (mr=0xaaaaabe43000, offset=1073741824, subregion=0xaaaaabd81290) at
> /root/hj/q_migrate/qemu/memory.c:2376
> #10 0x0000aaaaaaf63570 in machvirt_init (machine=0xaaaaabeb9c00) at
> /root/hj/q_migrate/qemu/hw/arm/virt.c:1611
> #11 0x0000aaaaab1ff558 in machine_run_board_init
> (machine=0xaaaaabeb9c00) at hw/core/machine.c:965
> #12 0x0000aaaaab14070c in main (argc=51, argv=0xffffffffee78,
> envp=0xfffffffff018) at vl.c:4459

This doesn't look like a write -- it's just the memory system
setting up the topology of the address spaces as the various
devices/ram/etc are created and put into their correct places.

thanks
-- PMM
Dr. David Alan Gilbert March 19, 2019, 9:39 a.m. UTC | #3
* Yury Kotov (yury-kotov@yandex-team.ru) wrote:
> This patch isn't intended to merge. Just to reproduce a problem.
> 
> The test for x-ignore-shread capability fails on aarch64 + tcg:
> Memory content inconsistency at 44c00000 first_byte = 2 last_byte = 1 current = d1 hit_edge = 1
> Memory content inconsistency at 44c01000 first_byte = 2 last_byte = 1 current = 77 hit_edge = 1
> 
> I expected that QEMU doesn't write to guest RAM until VM starts, but it
> happens in this test. By this patch I found what causes this problem.
> 
> Backtrace:
> 0  0x00007fb9e40affea in __memcpy_avx_unaligned () at ../sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S:118
> 1  0x000055f4a296dd84 in address_space_write_rom_internal () at exec.c:3458
> 2  0x000055f4a296de3a in address_space_write_rom () at exec.c:3479
> 3  0x000055f4a2d519ff in rom_reset () at hw/core/loader.c:1101
> 4  0x000055f4a2d475ec in qemu_devices_reset () at hw/core/reset.c:69
> 5  0x000055f4a2c90a28 in qemu_system_reset () at vl.c:1675
> 6  0x000055f4a2c9851d in main () at vl.c:4552
> 
> To fix this particular we can skip these writes for the first system_reset
> if -incoming is set. But I'm not sure how to fix this problem in general.
> May be to introduce a contract which forbids to write to system_ram in such
> case?
> 
> What do you think?

I thought that ROMs would either:
   a) Be mapped shared from a file but then read-only and unwritten
or
   b) Be written to during boot - but this wouldn't be main memory, so
wouldn't be affected by your shared flag.

So which case is happening here? How are the ROMs being mapped - is this
actually a write to main memory or the RAMBlock backing just the ROM?

Dave


> Signed-off-by: Yury Kotov <yury-kotov@yandex-team.ru>
> ---
>  backends/hostmem-file.c   |  2 +-
>  exec.c                    | 15 +++++++++++++--
>  include/exec/cpu-common.h |  2 ++
>  include/exec/memory.h     |  3 +++
>  include/qemu/mmap-alloc.h |  2 +-
>  migration/ram.c           |  2 ++
>  util/mmap-alloc.c         |  6 ++++--
>  util/oslib-posix.c        |  2 +-
>  vl.c                      |  8 ++++++++
>  9 files changed, 35 insertions(+), 7 deletions(-)
> 
> diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c
> index ce54788048..146fa2bc70 100644
> --- a/backends/hostmem-file.c
> +++ b/backends/hostmem-file.c
> @@ -61,7 +61,7 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
>      memory_region_init_ram_from_file(&backend->mr, OBJECT(backend),
>                                       name,
>                                       backend->size, fb->align,
> -                                     (backend->share ? RAM_SHARED : 0) |
> +                                     (backend->share ? (RAM_SHARED | RAM_READONLY) : 0) |
>                                       (fb->is_pmem ? RAM_PMEM : 0),
>                                       fb->mem_path, errp);
>      g_free(name);
> diff --git a/exec.c b/exec.c
> index 1d4f3784d6..cd86e9b837 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -1863,7 +1863,7 @@ static void *file_ram_alloc(RAMBlock *block,
>      }
>  
>      area = qemu_ram_mmap(fd, memory, block->mr->align,
> -                         block->flags & RAM_SHARED);
> +                         block->flags & RAM_SHARED, block->flags & RAM_READONLY);
>      if (area == MAP_FAILED) {
>          error_setg_errno(errp, errno,
>                           "unable to map backing store for guest RAM");
> @@ -2259,7 +2259,7 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
>      int64_t file_size;
>  
>      /* Just support these ram flags by now. */
> -    assert((ram_flags & ~(RAM_SHARED | RAM_PMEM)) == 0);
> +    assert((ram_flags & ~(RAM_SHARED | RAM_PMEM | RAM_READONLY)) == 0);
>  
>      if (xen_enabled()) {
>          error_setg(errp, "-mem-path not supported with Xen");
> @@ -2485,6 +2485,17 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
>          }
>      }
>  }
> +
> +void qemu_ram_unprotect_all(void)
> +{
> +    RAMBlock *block;
> +    rcu_read_lock();
> +    RAMBLOCK_FOREACH(block) {
> +        int ret = mprotect(block->host, block->max_length, PROT_READ | PROT_WRITE);
> +        assert(ret == 0);
> +    }
> +    rcu_read_unlock();
> +}
>  #endif /* !_WIN32 */
>  
>  /* Return a host pointer to ram allocated with qemu_ram_alloc.
> diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
> index cef8b88a2a..2ad875be06 100644
> --- a/include/exec/cpu-common.h
> +++ b/include/exec/cpu-common.h
> @@ -63,6 +63,8 @@ typedef void CPUWriteMemoryFunc(void *opaque, hwaddr addr, uint32_t value);
>  typedef uint32_t CPUReadMemoryFunc(void *opaque, hwaddr addr);
>  
>  void qemu_ram_remap(ram_addr_t addr, ram_addr_t length);
> +void qemu_ram_unprotect_all(void);
> +
>  /* This should not be used by devices.  */
>  ram_addr_t qemu_ram_addr_from_host(void *ptr);
>  RAMBlock *qemu_ram_block_by_name(const char *name);
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index 1625913f84..b1cb5a48d7 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -126,6 +126,9 @@ typedef struct IOMMUNotifier IOMMUNotifier;
>  /* RAM is a persistent kind memory */
>  #define RAM_PMEM (1 << 5)
>  
> +/* RAM is readonly*/
> +#define RAM_READONLY (1 << 6)
> +
>  static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn,
>                                         IOMMUNotifierFlag flags,
>                                         hwaddr start, hwaddr end,
> diff --git a/include/qemu/mmap-alloc.h b/include/qemu/mmap-alloc.h
> index ef04f0ed5b..f704d9b7e3 100644
> --- a/include/qemu/mmap-alloc.h
> +++ b/include/qemu/mmap-alloc.h
> @@ -7,7 +7,7 @@ size_t qemu_fd_getpagesize(int fd);
>  
>  size_t qemu_mempath_getpagesize(const char *mem_path);
>  
> -void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared);
> +void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared, bool readonly);
>  
>  void qemu_ram_munmap(int fd, void *ptr, size_t size);
>  
> diff --git a/migration/ram.c b/migration/ram.c
> index 35bd6213e9..252fc80e6c 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -4211,6 +4211,8 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
>       */
>      rcu_read_lock();
>  
> +    qemu_ram_unprotect_all();
> +
>      if (postcopy_running) {
>          ret = ram_load_postcopy(f);
>      }
> diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c
> index 8565885420..7dc194d909 100644
> --- a/util/mmap-alloc.c
> +++ b/util/mmap-alloc.c
> @@ -75,9 +75,10 @@ size_t qemu_mempath_getpagesize(const char *mem_path)
>      return getpagesize();
>  }
>  
> -void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared)
> +void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared, bool readonly)
>  {
>      int flags;
> +    int prots;
>      int guardfd;
>      size_t offset;
>      size_t pagesize;
> @@ -128,9 +129,10 @@ void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared)
>      flags = MAP_FIXED;
>      flags |= fd == -1 ? MAP_ANONYMOUS : 0;
>      flags |= shared ? MAP_SHARED : MAP_PRIVATE;
> +    prots = PROT_READ | (readonly ? 0 : PROT_WRITE);
>      offset = QEMU_ALIGN_UP((uintptr_t)guardptr, align) - (uintptr_t)guardptr;
>  
> -    ptr = mmap(guardptr + offset, size, PROT_READ | PROT_WRITE, flags, fd, 0);
> +    ptr = mmap(guardptr + offset, size, prots, flags, fd, 0);
>  
>      if (ptr == MAP_FAILED) {
>          munmap(guardptr, total);
> diff --git a/util/oslib-posix.c b/util/oslib-posix.c
> index 37c5854b9c..c57466f8d9 100644
> --- a/util/oslib-posix.c
> +++ b/util/oslib-posix.c
> @@ -203,7 +203,7 @@ void *qemu_memalign(size_t alignment, size_t size)
>  void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment, bool shared)
>  {
>      size_t align = QEMU_VMALLOC_ALIGN;
> -    void *ptr = qemu_ram_mmap(-1, size, align, shared);
> +    void *ptr = qemu_ram_mmap(-1, size, align, shared, false);
>  
>      if (ptr == MAP_FAILED) {
>          return NULL;
> diff --git a/vl.c b/vl.c
> index 4c5cc0d8ad..2cc675ad21 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -2950,6 +2950,12 @@ static void register_global_properties(MachineState *ms)
>      user_register_global_props();
>  }
>  
> +static void unprotect_ram(void *opaque)
> +{
> +    error_report(__func__);
> +    qemu_ram_unprotect_all();
> +}
> +
>  int main(int argc, char **argv, char **envp)
>  {
>      int i;
> @@ -4537,6 +4543,8 @@ int main(int argc, char **argv, char **envp)
>  
>      replay_start();
>  
> +    qemu_register_reset(unprotect_ram, NULL);
> +
>      /* This checkpoint is required by replay to separate prior clock
>         reading from the other reads, because timer polling functions query
>         clock values from the log. */
> -- 
> 2.21.0
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Dr. David Alan Gilbert March 19, 2019, 10 a.m. UTC | #4
* Peter Maydell (peter.maydell@linaro.org) wrote:
> On Tue, 19 Mar 2019 at 07:36, Jia He <hejianet@gmail.com> wrote:
> >
> > Thanks Yury.
> >
> > On 2019/3/14 19:03, Yury Kotov wrote:
> > > This patch isn't intended to merge. Just to reproduce a problem.
> > >
> > > The test for x-ignore-shread capability fails on aarch64 + tcg:
> > > Memory content inconsistency at 44c00000 first_byte = 2 last_byte = 1 current = d1 hit_edge = 1
> > > Memory content inconsistency at 44c01000 first_byte = 2 last_byte = 1 current = 77 hit_edge = 1
> > >
> > > I expected that QEMU doesn't write to guest RAM until VM starts, but it
> > > happens in this test. By this patch I found what causes this problem.
> > >
> > > Backtrace:
> > > 0  0x00007fb9e40affea in __memcpy_avx_unaligned () at ../sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S:118
> > > 1  0x000055f4a296dd84 in address_space_write_rom_internal () at exec.c:3458
> > > 2  0x000055f4a296de3a in address_space_write_rom () at exec.c:3479
> > > 3  0x000055f4a2d519ff in rom_reset () at hw/core/loader.c:1101
> > > 4  0x000055f4a2d475ec in qemu_devices_reset () at hw/core/reset.c:69
> > > 5  0x000055f4a2c90a28 in qemu_system_reset () at vl.c:1675
> > > 6  0x000055f4a2c9851d in main () at vl.c:4552
> 
> This is expected -- as part of reset, we write the contents
> of ROM blobs, ELF files loaded through -kernel, etc, to the
> guest memory. It's not clear to me why that's causing a problem?

It's related to using a shared mapping for the memory , which in Yury's
migration case is shared RAM on the same system from source/destination.
That means the destination qemu as it starts overwrites the same memory
the source is currently running out of.

Dave

> > more than that
> >
> > even in kvm mode, there is similar writing, and cause exception when
> > destination VM is restoring.
> >
> > This is the backtrace with your debugging patch
> >
> > (gdb) bt
> > #0  __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:51
> > #1  0x0000ffffbe7988b4 in __GI_abort () at abort.c:79
> > #2  0x0000aaaaaae41c88 in kvm_set_phys_mem (kml=0xaaaaabef7b88,
> > section=0xffffffffe0c0, add=true) at
> > /root/hj/q_migrate/qemu/accel/kvm/kvm-all.c:821
> > #3  0x0000aaaaaae41d0c in kvm_region_add (listener=0xaaaaabef7b88,
> > section=0xffffffffe0c0) at /root/hj/q_migrate/qemu/accel/kvm/kvm-all.c:831
> > #4  0x0000aaaaaae22ca0 in address_space_update_topology_pass
> > (as=0xaaaaabd16ce0 <address_space_memory>, old_view=0xaaaaabeec150,
> > new_view=0xaaaaac03d3e0, adding=true)
> >      at /root/hj/q_migrate/qemu/memory.c:958
> > #5  0x0000aaaaaae22fc4 in address_space_set_flatview (as=0xaaaaabd16ce0
> > <address_space_memory>) at /root/hj/q_migrate/qemu/memory.c:1034
> > #6  0x0000aaaaaae231e0 in memory_region_transaction_commit () at
> > /root/hj/q_migrate/qemu/memory.c:1086
> > #7  0x0000aaaaaae26c90 in memory_region_update_container_subregions
> > (subregion=0xaaaaabd81290) at /root/hj/q_migrate/qemu/memory.c:2358
> > #8  0x0000aaaaaae26d04 in memory_region_add_subregion_common
> > (mr=0xaaaaabe43000, offset=1073741824, subregion=0xaaaaabd81290) at
> > /root/hj/q_migrate/qemu/memory.c:2368
> > #9  0x0000aaaaaae26d3c in memory_region_add_subregion
> > (mr=0xaaaaabe43000, offset=1073741824, subregion=0xaaaaabd81290) at
> > /root/hj/q_migrate/qemu/memory.c:2376
> > #10 0x0000aaaaaaf63570 in machvirt_init (machine=0xaaaaabeb9c00) at
> > /root/hj/q_migrate/qemu/hw/arm/virt.c:1611
> > #11 0x0000aaaaab1ff558 in machine_run_board_init
> > (machine=0xaaaaabeb9c00) at hw/core/machine.c:965
> > #12 0x0000aaaaab14070c in main (argc=51, argv=0xffffffffee78,
> > envp=0xfffffffff018) at vl.c:4459
> 
> This doesn't look like a write -- it's just the memory system
> setting up the topology of the address spaces as the various
> devices/ram/etc are created and put into their correct places.
> 
> thanks
> -- PMM
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Peter Maydell March 19, 2019, 10:48 a.m. UTC | #5
On Tue, 19 Mar 2019 at 09:40, Dr. David Alan Gilbert
<dgilbert@redhat.com> wrote:
> I thought that ROMs would either:
>    a) Be mapped shared from a file but then read-only and unwritten

I don't think we support this at all, do we?

> or
>    b) Be written to during boot - but this wouldn't be main memory, so
> wouldn't be affected by your shared flag.

The rom code in hw/core/loader.c has two basic cases:
 (1) file is being loaded to something that is really a pure ROM:
in this case, on first reset we will write it to the backing RAMBlock
but we will not bother to do so in future
 (2) file is being loaded to something that is not a pure ROM:
this could be either real RAM (eg if a file is loaded via -kernel
or the 'generic loader' device), or a flash device, for instance.
In this case we will reload the backing RAMBlock with the file
contents on every reset, because the guest might have changed its
contents.
(There's also "file is provided via the fw_cfg device" but that's
not relevant here.)

I didn't think migration distinguished between "main memory"
and any other kind of RAMBlock-backed memory ?

thanks
-- PMM
Yury Kotov March 19, 2019, 10:50 a.m. UTC | #6
19.03.2019, 12:39, "Dr. David Alan Gilbert" <dgilbert@redhat.com>:
> * Yury Kotov (yury-kotov@yandex-team.ru) wrote:
>>  This patch isn't intended to merge. Just to reproduce a problem.
>>
>>  The test for x-ignore-shread capability fails on aarch64 + tcg:
>>  Memory content inconsistency at 44c00000 first_byte = 2 last_byte = 1 current = d1 hit_edge = 1
>>  Memory content inconsistency at 44c01000 first_byte = 2 last_byte = 1 current = 77 hit_edge = 1
>>
>>  I expected that QEMU doesn't write to guest RAM until VM starts, but it
>>  happens in this test. By this patch I found what causes this problem.
>>
>>  Backtrace:
>>  0 0x00007fb9e40affea in __memcpy_avx_unaligned () at ../sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S:118
>>  1 0x000055f4a296dd84 in address_space_write_rom_internal () at exec.c:3458
>>  2 0x000055f4a296de3a in address_space_write_rom () at exec.c:3479
>>  3 0x000055f4a2d519ff in rom_reset () at hw/core/loader.c:1101
>>  4 0x000055f4a2d475ec in qemu_devices_reset () at hw/core/reset.c:69
>>  5 0x000055f4a2c90a28 in qemu_system_reset () at vl.c:1675
>>  6 0x000055f4a2c9851d in main () at vl.c:4552
>>
>>  To fix this particular we can skip these writes for the first system_reset
>>  if -incoming is set. But I'm not sure how to fix this problem in general.
>>  May be to introduce a contract which forbids to write to system_ram in such
>>  case?
>>
>>  What do you think?
>
> I thought that ROMs would either:
>    a) Be mapped shared from a file but then read-only and unwritten
> or
>    b) Be written to during boot - but this wouldn't be main memory, so
> wouldn't be affected by your shared flag.
>
> So which case is happening here? How are the ROMs being mapped - is this
> actually a write to main memory or the RAMBlock backing just the ROM?

It writes to the main memory. Just what I wanted to catch. For separated
RAMBlocks it works fine because separated blocks aren't shared and as a result
these aren't readonly.

P.S. For x86 qtests with this patch are passed.

Regards,
Yury

>
> Dave
>
>>  Signed-off-by: Yury Kotov <yury-kotov@yandex-team.ru>
>>  ---
>>   backends/hostmem-file.c | 2 +-
>>   exec.c | 15 +++++++++++++--
>>   include/exec/cpu-common.h | 2 ++
>>   include/exec/memory.h | 3 +++
>>   include/qemu/mmap-alloc.h | 2 +-
>>   migration/ram.c | 2 ++
>>   util/mmap-alloc.c | 6 ++++--
>>   util/oslib-posix.c | 2 +-
>>   vl.c | 8 ++++++++
>>   9 files changed, 35 insertions(+), 7 deletions(-)
>>
>>  diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c
>>  index ce54788048..146fa2bc70 100644
>>  --- a/backends/hostmem-file.c
>>  +++ b/backends/hostmem-file.c
>>  @@ -61,7 +61,7 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
>>       memory_region_init_ram_from_file(&backend->mr, OBJECT(backend),
>>                                        name,
>>                                        backend->size, fb->align,
>>  - (backend->share ? RAM_SHARED : 0) |
>>  + (backend->share ? (RAM_SHARED | RAM_READONLY) : 0) |
>>                                        (fb->is_pmem ? RAM_PMEM : 0),
>>                                        fb->mem_path, errp);
>>       g_free(name);
>>  diff --git a/exec.c b/exec.c
>>  index 1d4f3784d6..cd86e9b837 100644
>>  --- a/exec.c
>>  +++ b/exec.c
>>  @@ -1863,7 +1863,7 @@ static void *file_ram_alloc(RAMBlock *block,
>>       }
>>
>>       area = qemu_ram_mmap(fd, memory, block->mr->align,
>>  - block->flags & RAM_SHARED);
>>  + block->flags & RAM_SHARED, block->flags & RAM_READONLY);
>>       if (area == MAP_FAILED) {
>>           error_setg_errno(errp, errno,
>>                            "unable to map backing store for guest RAM");
>>  @@ -2259,7 +2259,7 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
>>       int64_t file_size;
>>
>>       /* Just support these ram flags by now. */
>>  - assert((ram_flags & ~(RAM_SHARED | RAM_PMEM)) == 0);
>>  + assert((ram_flags & ~(RAM_SHARED | RAM_PMEM | RAM_READONLY)) == 0);
>>
>>       if (xen_enabled()) {
>>           error_setg(errp, "-mem-path not supported with Xen");
>>  @@ -2485,6 +2485,17 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
>>           }
>>       }
>>   }
>>  +
>>  +void qemu_ram_unprotect_all(void)
>>  +{
>>  + RAMBlock *block;
>>  + rcu_read_lock();
>>  + RAMBLOCK_FOREACH(block) {
>>  + int ret = mprotect(block->host, block->max_length, PROT_READ | PROT_WRITE);
>>  + assert(ret == 0);
>>  + }
>>  + rcu_read_unlock();
>>  +}
>>   #endif /* !_WIN32 */
>>
>>   /* Return a host pointer to ram allocated with qemu_ram_alloc.
>>  diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
>>  index cef8b88a2a..2ad875be06 100644
>>  --- a/include/exec/cpu-common.h
>>  +++ b/include/exec/cpu-common.h
>>  @@ -63,6 +63,8 @@ typedef void CPUWriteMemoryFunc(void *opaque, hwaddr addr, uint32_t value);
>>   typedef uint32_t CPUReadMemoryFunc(void *opaque, hwaddr addr);
>>
>>   void qemu_ram_remap(ram_addr_t addr, ram_addr_t length);
>>  +void qemu_ram_unprotect_all(void);
>>  +
>>   /* This should not be used by devices. */
>>   ram_addr_t qemu_ram_addr_from_host(void *ptr);
>>   RAMBlock *qemu_ram_block_by_name(const char *name);
>>  diff --git a/include/exec/memory.h b/include/exec/memory.h
>>  index 1625913f84..b1cb5a48d7 100644
>>  --- a/include/exec/memory.h
>>  +++ b/include/exec/memory.h
>>  @@ -126,6 +126,9 @@ typedef struct IOMMUNotifier IOMMUNotifier;
>>   /* RAM is a persistent kind memory */
>>   #define RAM_PMEM (1 << 5)
>>
>>  +/* RAM is readonly*/
>>  +#define RAM_READONLY (1 << 6)
>>  +
>>   static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn,
>>                                          IOMMUNotifierFlag flags,
>>                                          hwaddr start, hwaddr end,
>>  diff --git a/include/qemu/mmap-alloc.h b/include/qemu/mmap-alloc.h
>>  index ef04f0ed5b..f704d9b7e3 100644
>>  --- a/include/qemu/mmap-alloc.h
>>  +++ b/include/qemu/mmap-alloc.h
>>  @@ -7,7 +7,7 @@ size_t qemu_fd_getpagesize(int fd);
>>
>>   size_t qemu_mempath_getpagesize(const char *mem_path);
>>
>>  -void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared);
>>  +void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared, bool readonly);
>>
>>   void qemu_ram_munmap(int fd, void *ptr, size_t size);
>>
>>  diff --git a/migration/ram.c b/migration/ram.c
>>  index 35bd6213e9..252fc80e6c 100644
>>  --- a/migration/ram.c
>>  +++ b/migration/ram.c
>>  @@ -4211,6 +4211,8 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
>>        */
>>       rcu_read_lock();
>>
>>  + qemu_ram_unprotect_all();
>>  +
>>       if (postcopy_running) {
>>           ret = ram_load_postcopy(f);
>>       }
>>  diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c
>>  index 8565885420..7dc194d909 100644
>>  --- a/util/mmap-alloc.c
>>  +++ b/util/mmap-alloc.c
>>  @@ -75,9 +75,10 @@ size_t qemu_mempath_getpagesize(const char *mem_path)
>>       return getpagesize();
>>   }
>>
>>  -void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared)
>>  +void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared, bool readonly)
>>   {
>>       int flags;
>>  + int prots;
>>       int guardfd;
>>       size_t offset;
>>       size_t pagesize;
>>  @@ -128,9 +129,10 @@ void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared)
>>       flags = MAP_FIXED;
>>       flags |= fd == -1 ? MAP_ANONYMOUS : 0;
>>       flags |= shared ? MAP_SHARED : MAP_PRIVATE;
>>  + prots = PROT_READ | (readonly ? 0 : PROT_WRITE);
>>       offset = QEMU_ALIGN_UP((uintptr_t)guardptr, align) - (uintptr_t)guardptr;
>>
>>  - ptr = mmap(guardptr + offset, size, PROT_READ | PROT_WRITE, flags, fd, 0);
>>  + ptr = mmap(guardptr + offset, size, prots, flags, fd, 0);
>>
>>       if (ptr == MAP_FAILED) {
>>           munmap(guardptr, total);
>>  diff --git a/util/oslib-posix.c b/util/oslib-posix.c
>>  index 37c5854b9c..c57466f8d9 100644
>>  --- a/util/oslib-posix.c
>>  +++ b/util/oslib-posix.c
>>  @@ -203,7 +203,7 @@ void *qemu_memalign(size_t alignment, size_t size)
>>   void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment, bool shared)
>>   {
>>       size_t align = QEMU_VMALLOC_ALIGN;
>>  - void *ptr = qemu_ram_mmap(-1, size, align, shared);
>>  + void *ptr = qemu_ram_mmap(-1, size, align, shared, false);
>>
>>       if (ptr == MAP_FAILED) {
>>           return NULL;
>>  diff --git a/vl.c b/vl.c
>>  index 4c5cc0d8ad..2cc675ad21 100644
>>  --- a/vl.c
>>  +++ b/vl.c
>>  @@ -2950,6 +2950,12 @@ static void register_global_properties(MachineState *ms)
>>       user_register_global_props();
>>   }
>>
>>  +static void unprotect_ram(void *opaque)
>>  +{
>>  + error_report(__func__);
>>  + qemu_ram_unprotect_all();
>>  +}
>>  +
>>   int main(int argc, char **argv, char **envp)
>>   {
>>       int i;
>>  @@ -4537,6 +4543,8 @@ int main(int argc, char **argv, char **envp)
>>
>>       replay_start();
>>
>>  + qemu_register_reset(unprotect_ram, NULL);
>>  +
>>       /* This checkpoint is required by replay to separate prior clock
>>          reading from the other reads, because timer polling functions query
>>          clock values from the log. */
>>  --
>>  2.21.0
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Dr. David Alan Gilbert March 19, 2019, 11:03 a.m. UTC | #7
* Peter Maydell (peter.maydell@linaro.org) wrote:
> On Tue, 19 Mar 2019 at 09:40, Dr. David Alan Gilbert
> <dgilbert@redhat.com> wrote:
> > I thought that ROMs would either:
> >    a) Be mapped shared from a file but then read-only and unwritten
> 
> I don't think we support this at all, do we?

OK, I thought we did somewhere but I might be dreaming it.

> > or
> >    b) Be written to during boot - but this wouldn't be main memory, so
> > wouldn't be affected by your shared flag.
> 
> The rom code in hw/core/loader.c has two basic cases:
>  (1) file is being loaded to something that is really a pure ROM:
> in this case, on first reset we will write it to the backing RAMBlock
> but we will not bother to do so in future
>  (2) file is being loaded to something that is not a pure ROM:
> this could be either real RAM (eg if a file is loaded via -kernel
> or the 'generic loader' device), or a flash device, for instance.
> In this case we will reload the backing RAMBlock with the file
> contents on every reset, because the guest might have changed its
> contents.
> (There's also "file is provided via the fw_cfg device" but that's
> not relevant here.)
> 
> I didn't think migration distinguished between "main memory"
> and any other kind of RAMBlock-backed memory ?

In Yury's case there's a distinction between RAMBlock's that are mapped
with RAM_SHARED (which normally ends up as MAP_SHARED) and all others.
You can set that for main memory by using -numa to specify a memdev
that's backed by a file and has the share=on property.

On x86 the ROMs end up as separate RAMBlock's that aren't affected
by that -numa/share=on - so they don't fight Yury's trick.

Dave



> thanks
> -- PMM
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Peter Maydell March 19, 2019, 11:21 a.m. UTC | #8
On Tue, 19 Mar 2019 at 11:03, Dr. David Alan Gilbert
<dgilbert@redhat.com> wrote:
>
> * Peter Maydell (peter.maydell@linaro.org) wrote:
> > I didn't think migration distinguished between "main memory"
> > and any other kind of RAMBlock-backed memory ?
>
> In Yury's case there's a distinction between RAMBlock's that are mapped
> with RAM_SHARED (which normally ends up as MAP_SHARED) and all others.
> You can set that for main memory by using -numa to specify a memdev
> that's backed by a file and has the share=on property.
>
> On x86 the ROMs end up as separate RAMBlock's that aren't affected
> by that -numa/share=on - so they don't fight Yury's trick.

You can use the generic loader on x86 to load an ELF file
into RAM if you want, which would I think also trigger this.

thanks
-- PMM
Dr. David Alan Gilbert March 19, 2019, 11:52 a.m. UTC | #9
* Peter Maydell (peter.maydell@linaro.org) wrote:
> On Tue, 19 Mar 2019 at 11:03, Dr. David Alan Gilbert
> <dgilbert@redhat.com> wrote:
> >
> > * Peter Maydell (peter.maydell@linaro.org) wrote:
> > > I didn't think migration distinguished between "main memory"
> > > and any other kind of RAMBlock-backed memory ?
> >
> > In Yury's case there's a distinction between RAMBlock's that are mapped
> > with RAM_SHARED (which normally ends up as MAP_SHARED) and all others.
> > You can set that for main memory by using -numa to specify a memdev
> > that's backed by a file and has the share=on property.
> >
> > On x86 the ROMs end up as separate RAMBlock's that aren't affected
> > by that -numa/share=on - so they don't fight Yury's trick.
> 
> You can use the generic loader on x86 to load an ELF file
> into RAM if you want, which would I think also trigger this.

OK, although that doesn't worry me too much  - since in the majority
of cases Yury's trick still works well.

I wonder if there's a way to make Yury's code to detect these cases
and not allow the feature;  the best thing for the moment would seem to
be to skip the aarch test that uses elf loading.

Dave

> thanks
> -- PMM
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Igor Mammedov March 19, 2019, 12:42 p.m. UTC | #10
On Tue, 19 Mar 2019 11:52:45 +0000
"Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote:

> * Peter Maydell (peter.maydell@linaro.org) wrote:
> > On Tue, 19 Mar 2019 at 11:03, Dr. David Alan Gilbert
> > <dgilbert@redhat.com> wrote:  
> > >
> > > * Peter Maydell (peter.maydell@linaro.org) wrote:  
> > > > I didn't think migration distinguished between "main memory"
> > > > and any other kind of RAMBlock-backed memory ?  
> > >
> > > In Yury's case there's a distinction between RAMBlock's that are mapped
> > > with RAM_SHARED (which normally ends up as MAP_SHARED) and all others.
> > > You can set that for main memory by using -numa to specify a memdev
> > > that's backed by a file and has the share=on property.
> > >
> > > On x86 the ROMs end up as separate RAMBlock's that aren't affected
> > > by that -numa/share=on - so they don't fight Yury's trick.  
> > 
> > You can use the generic loader on x86 to load an ELF file
> > into RAM if you want, which would I think also trigger this.  
> 
> OK, although that doesn't worry me too much  - since in the majority
> of cases Yury's trick still works well.
> 
> I wonder if there's a way to make Yury's code to detect these cases
> and not allow the feature;  the best thing for the moment would seem to
> be to skip the aarch test that uses elf loading.
on aarch(64) we also load dtb into main RAM currently only on boot,
and I was planning to move it to reset stage to accommodate
hotplug usecase so that guest would pick up hotplugged
devices after reboot.
If guest clobbers dtb area for any reason, then shared memory
usecase is currently broken since destination QEMU will
overwrite it on initial startup.


> Dave
> 
> > thanks
> > -- PMM  
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Peter Maydell March 19, 2019, 12:53 p.m. UTC | #11
On Tue, 19 Mar 2019 at 12:42, Igor Mammedov <imammedo@redhat.com> wrote:
> on aarch(64) we also load dtb into main RAM currently only on boot,

Hmm? We load the DTB in hw/arm/boot.c using rom_add_blob_fixed_as(),
which means that it will use this "rom blob loading" mechanism,
so it should be re-copied into RAM on reset.

> and I was planning to move it to reset stage to accommodate
> hotplug usecase so that guest would pick up hotplugged
> devices after reboot.

The problem here is that this is trying to change the
contents of the DTB blob on a reboot, which is not
supported by the ROM blob mechanism, which assumes that
these things are immutable.

thanks
-- PMM
Igor Mammedov March 19, 2019, 1:12 p.m. UTC | #12
On Tue, 19 Mar 2019 12:53:49 +0000
Peter Maydell <peter.maydell@linaro.org> wrote:

> On Tue, 19 Mar 2019 at 12:42, Igor Mammedov <imammedo@redhat.com> wrote:
> > on aarch(64) we also load dtb into main RAM currently only on boot,  
> 
> Hmm? We load the DTB in hw/arm/boot.c using rom_add_blob_fixed_as(),
> which means that it will use this "rom blob loading" mechanism,
> so it should be re-copied into RAM on reset.
yep, you are right I confused it with with mutable content issue
below.

> > and I was planning to move it to reset stage to accommodate
> > hotplug usecase so that guest would pick up hotplugged
> > devices after reboot.  
> 
> The problem here is that this is trying to change the
> contents of the DTB blob on a reboot, which is not
> supported by the ROM blob mechanism, which assumes that
> these things are immutable.
That was the problem I've actually met.

> 
> thanks
> -- PMM
Yury Kotov March 21, 2019, 4:27 p.m. UTC | #13
Hi,

19.03.2019, 14:52, "Dr. David Alan Gilbert" <dgilbert@redhat.com>:
> * Peter Maydell (peter.maydell@linaro.org) wrote:
>>  On Tue, 19 Mar 2019 at 11:03, Dr. David Alan Gilbert
>>  <dgilbert@redhat.com> wrote:
>>  >
>>  > * Peter Maydell (peter.maydell@linaro.org) wrote:
>>  > > I didn't think migration distinguished between "main memory"
>>  > > and any other kind of RAMBlock-backed memory ?
>>  >
>>  > In Yury's case there's a distinction between RAMBlock's that are mapped
>>  > with RAM_SHARED (which normally ends up as MAP_SHARED) and all others.
>>  > You can set that for main memory by using -numa to specify a memdev
>>  > that's backed by a file and has the share=on property.
>>  >
>>  > On x86 the ROMs end up as separate RAMBlock's that aren't affected
>>  > by that -numa/share=on - so they don't fight Yury's trick.
>>
>>  You can use the generic loader on x86 to load an ELF file
>>  into RAM if you want, which would I think also trigger this.
>
> OK, although that doesn't worry me too much - since in the majority
> of cases Yury's trick still works well.
>
> I wonder if there's a way to make Yury's code to detect these cases
> and not allow the feature; the best thing for the moment would seem to
> be to skip the aarch test that uses elf loading.
>

Currently, I've no idea how to detect such cases, but there is an ability to
detect memory corruption. I want to update the RFC patch to let user to map some
memory regions as readonly until incoming migration start.

E.g.
1) If x-ignore-shared is enabled in command line or memory region is marked
   (something like ',readonly=on'),
2) Memory region is shared (,share=on),
3) And qemu is started with '-incoming' option

Then map such regions as readonly until incoming migration finished.
Thus, the patch will be able to detect memory corruption and will not affect
normal cases.

How do you think, is it needed?

I already have a cleaner version of the RFC patch, but I'm not sure about 1).
Which way is better: enable capability in command line, add a new option for
memory-backend or something else.

> Dave
>
>>  thanks
>>  -- PMM
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK

Regards,
Yury
diff mbox series

Patch

diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c
index ce54788048..146fa2bc70 100644
--- a/backends/hostmem-file.c
+++ b/backends/hostmem-file.c
@@ -61,7 +61,7 @@  file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
     memory_region_init_ram_from_file(&backend->mr, OBJECT(backend),
                                      name,
                                      backend->size, fb->align,
-                                     (backend->share ? RAM_SHARED : 0) |
+                                     (backend->share ? (RAM_SHARED | RAM_READONLY) : 0) |
                                      (fb->is_pmem ? RAM_PMEM : 0),
                                      fb->mem_path, errp);
     g_free(name);
diff --git a/exec.c b/exec.c
index 1d4f3784d6..cd86e9b837 100644
--- a/exec.c
+++ b/exec.c
@@ -1863,7 +1863,7 @@  static void *file_ram_alloc(RAMBlock *block,
     }
 
     area = qemu_ram_mmap(fd, memory, block->mr->align,
-                         block->flags & RAM_SHARED);
+                         block->flags & RAM_SHARED, block->flags & RAM_READONLY);
     if (area == MAP_FAILED) {
         error_setg_errno(errp, errno,
                          "unable to map backing store for guest RAM");
@@ -2259,7 +2259,7 @@  RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
     int64_t file_size;
 
     /* Just support these ram flags by now. */
-    assert((ram_flags & ~(RAM_SHARED | RAM_PMEM)) == 0);
+    assert((ram_flags & ~(RAM_SHARED | RAM_PMEM | RAM_READONLY)) == 0);
 
     if (xen_enabled()) {
         error_setg(errp, "-mem-path not supported with Xen");
@@ -2485,6 +2485,17 @@  void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
         }
     }
 }
+
+void qemu_ram_unprotect_all(void)
+{
+    RAMBlock *block;
+    rcu_read_lock();
+    RAMBLOCK_FOREACH(block) {
+        int ret = mprotect(block->host, block->max_length, PROT_READ | PROT_WRITE);
+        assert(ret == 0);
+    }
+    rcu_read_unlock();
+}
 #endif /* !_WIN32 */
 
 /* Return a host pointer to ram allocated with qemu_ram_alloc.
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index cef8b88a2a..2ad875be06 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -63,6 +63,8 @@  typedef void CPUWriteMemoryFunc(void *opaque, hwaddr addr, uint32_t value);
 typedef uint32_t CPUReadMemoryFunc(void *opaque, hwaddr addr);
 
 void qemu_ram_remap(ram_addr_t addr, ram_addr_t length);
+void qemu_ram_unprotect_all(void);
+
 /* This should not be used by devices.  */
 ram_addr_t qemu_ram_addr_from_host(void *ptr);
 RAMBlock *qemu_ram_block_by_name(const char *name);
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 1625913f84..b1cb5a48d7 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -126,6 +126,9 @@  typedef struct IOMMUNotifier IOMMUNotifier;
 /* RAM is a persistent kind memory */
 #define RAM_PMEM (1 << 5)
 
+/* RAM is readonly*/
+#define RAM_READONLY (1 << 6)
+
 static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn,
                                        IOMMUNotifierFlag flags,
                                        hwaddr start, hwaddr end,
diff --git a/include/qemu/mmap-alloc.h b/include/qemu/mmap-alloc.h
index ef04f0ed5b..f704d9b7e3 100644
--- a/include/qemu/mmap-alloc.h
+++ b/include/qemu/mmap-alloc.h
@@ -7,7 +7,7 @@  size_t qemu_fd_getpagesize(int fd);
 
 size_t qemu_mempath_getpagesize(const char *mem_path);
 
-void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared);
+void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared, bool readonly);
 
 void qemu_ram_munmap(int fd, void *ptr, size_t size);
 
diff --git a/migration/ram.c b/migration/ram.c
index 35bd6213e9..252fc80e6c 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -4211,6 +4211,8 @@  static int ram_load(QEMUFile *f, void *opaque, int version_id)
      */
     rcu_read_lock();
 
+    qemu_ram_unprotect_all();
+
     if (postcopy_running) {
         ret = ram_load_postcopy(f);
     }
diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c
index 8565885420..7dc194d909 100644
--- a/util/mmap-alloc.c
+++ b/util/mmap-alloc.c
@@ -75,9 +75,10 @@  size_t qemu_mempath_getpagesize(const char *mem_path)
     return getpagesize();
 }
 
-void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared)
+void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared, bool readonly)
 {
     int flags;
+    int prots;
     int guardfd;
     size_t offset;
     size_t pagesize;
@@ -128,9 +129,10 @@  void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared)
     flags = MAP_FIXED;
     flags |= fd == -1 ? MAP_ANONYMOUS : 0;
     flags |= shared ? MAP_SHARED : MAP_PRIVATE;
+    prots = PROT_READ | (readonly ? 0 : PROT_WRITE);
     offset = QEMU_ALIGN_UP((uintptr_t)guardptr, align) - (uintptr_t)guardptr;
 
-    ptr = mmap(guardptr + offset, size, PROT_READ | PROT_WRITE, flags, fd, 0);
+    ptr = mmap(guardptr + offset, size, prots, flags, fd, 0);
 
     if (ptr == MAP_FAILED) {
         munmap(guardptr, total);
diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index 37c5854b9c..c57466f8d9 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -203,7 +203,7 @@  void *qemu_memalign(size_t alignment, size_t size)
 void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment, bool shared)
 {
     size_t align = QEMU_VMALLOC_ALIGN;
-    void *ptr = qemu_ram_mmap(-1, size, align, shared);
+    void *ptr = qemu_ram_mmap(-1, size, align, shared, false);
 
     if (ptr == MAP_FAILED) {
         return NULL;
diff --git a/vl.c b/vl.c
index 4c5cc0d8ad..2cc675ad21 100644
--- a/vl.c
+++ b/vl.c
@@ -2950,6 +2950,12 @@  static void register_global_properties(MachineState *ms)
     user_register_global_props();
 }
 
+static void unprotect_ram(void *opaque)
+{
+    error_report(__func__);
+    qemu_ram_unprotect_all();
+}
+
 int main(int argc, char **argv, char **envp)
 {
     int i;
@@ -4537,6 +4543,8 @@  int main(int argc, char **argv, char **envp)
 
     replay_start();
 
+    qemu_register_reset(unprotect_ram, NULL);
+
     /* This checkpoint is required by replay to separate prior clock
        reading from the other reads, because timer polling functions query
        clock values from the log. */