Message ID | 20180827084037.25316-1-simon.hausmann@qt.io (mailing list archive)
---|---
State | New, archived
Series | [v3] linux-user: add support for MADV_DONTNEED
Ping :)

On 8/27/18 10:40 AM, Simon Hausmann wrote:
> Most flags to madvise() are just hints, so typically ignoring the
> syscall and returning okay is fine. However applications exist that do
> rely on MADV_DONTNEED behavior to guarantee that upon subsequent access
> the mapping is refreshed from the backing file or zero for anonymous
> mappings. The file backed mappings this is tricky - hence the original
> comment - but for anonymous mappings it is safe to forward. We just need
> to remember which those mappings are, which is now stored in the flags.
On 27/08/2018 at 10:40, Simon Hausmann wrote:
> diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
> index 898c3bb3d1..467fbd9aeb 100644
> --- a/accel/tcg/translate-all.c
> +++ b/accel/tcg/translate-all.c
> @@ -2481,7 +2481,8 @@ int page_get_flags(target_ulong address)
>  /* Modify the flags of a page and invalidate the code if necessary.
>     The flag PAGE_WRITE_ORG is positioned automatically depending
>     on PAGE_WRITE.  The mmap_lock should already be held.  */
> -void page_set_flags(target_ulong start, target_ulong end, int flags)
> +void page_set_flags(target_ulong start, target_ulong end, int flags,
> +                    enum page_set_flags_mode mode)

Is it possible to not add the mode, and only use "flags" to set the new
flag? Using page_get_flags() to keep flags that needed to be kept?

>  {
>      target_ulong addr, len;
>  
> @@ -2513,7 +2514,10 @@ void page_set_flags(target_ulong start, target_ulong end, int flags)
>              p->first_tb) {
>              tb_invalidate_phys_page(addr, 0);
>          }
> -        p->flags = flags;
> +        if (mode == PAGE_SET_ALL_FLAGS)
> +            p->flags = flags;
> +        else /* PAGE_SET_PROTECTION */
> +            p->flags |= (p->flags & ~(PAGE_BITS - 1)) | (flags & PAGE_BITS);
>      }
>  }
>  
> diff --git a/bsd-user/mmap.c b/bsd-user/mmap.c
> index 17f4cd80aa..4bfac81af4 100644
> --- a/bsd-user/mmap.c
> +++ b/bsd-user/mmap.c
> @@ -125,7 +125,7 @@ int target_mprotect(abi_ulong start, abi_ulong len, int prot)
>          if (ret != 0)
>              goto error;
>      }
> -    page_set_flags(start, start + len, prot | PAGE_VALID);
> +    page_set_flags(start, start + len, prot, PAGE_SET_PROTECTION);

Why do you remove the PAGE_VALID flag?

...

> diff --git a/linux-user/elfload.c b/linux-user/elfload.c
> index 8638612aec..c59cf4359c 100644
> --- a/linux-user/elfload.c
> +++ b/linux-user/elfload.c
> @@ -1702,7 +1702,8 @@ static void zero_bss(abi_ulong elf_bss, abi_ulong last_bss, int prot)
>  
>      /* Ensure that the bss page(s) are valid */
>      if ((page_get_flags(last_bss-1) & prot) != prot) {
> -        page_set_flags(elf_bss & TARGET_PAGE_MASK, last_bss, prot | PAGE_VALID);
> +        page_set_flags(elf_bss & TARGET_PAGE_MASK, last_bss, prot,
> +                       PAGE_SET_PROTECTION);
>      }

PAGE_VALID?

>  
>      if (host_start < host_map_start) {
> diff --git a/linux-user/mmap.c b/linux-user/mmap.c
> index 41e0983ce8..fff29ee04b 100644
> --- a/linux-user/mmap.c
> +++ b/linux-user/mmap.c
> @@ -124,7 +124,7 @@ int target_mprotect(abi_ulong start, abi_ulong len, int prot)
>          if (ret != 0)
>              goto error;
>      }
> -    page_set_flags(start, start + len, prot | PAGE_VALID);
> +    page_set_flags(start, start + len, prot, PAGE_SET_PROTECTION);
>      mmap_unlock();

PAGE_VALID?

...

> +int target_madvise(abi_ulong start, abi_ulong len, int advice)
> +{
> +    abi_ulong end, addr;
> +    int ret;
> +
> +    len = TARGET_PAGE_ALIGN(len);
> +    start &= TARGET_PAGE_MASK;
> +
> +    if (!guest_range_valid(start, len)) {
> +        errno = EINVAL;
> +        return -1;
> +    }
> +
> +    /* A straight passthrough may not be safe because qemu sometimes
> +       turns private file-backed mappings into anonymous mappings.
> +       Most flags are hints so we ignore them.
> +       One exception is made for MADV_DONTNEED on anonymous mappings,
> +       that applications may rely on to zero out pages. */
> +    if (advice & MADV_DONTNEED) {
> +        end = start + len;
> +        for (addr = start; addr < end; addr += TARGET_PAGE_SIZE) {
> +            if (page_get_flags(addr) & PAGE_MAP_ANONYMOUS) {
> +                ret = madvise(g2h(addr), TARGET_PAGE_SIZE, MADV_DONTNEED);

these values must be aligned on host page size.

Thanks,
Laurent
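Laurent's first question suggests an alternative to the new mode parameter:
keep the existing page_set_flags() signature and have each caller re-read the
bits it wants to preserve via page_get_flags(). A rough call-site sketch of
that idea follows. It is not code from the patch, and it rests on the
assumption that the flags are uniform across the whole range, which is
exactly the concern Simon raises in his reply below.

    /* Hypothetical call-site variant of Laurent's suggestion inside
     * target_mprotect(): merge in the bits worth preserving (here
     * PAGE_MAP_ANONYMOUS) by reading them back with page_get_flags().
     * Assumes the flags are uniform across [start, start + len),
     * which is not guaranteed. */
    int old_flags = page_get_flags(start);
    page_set_flags(start, start + len,
                   prot | PAGE_VALID | (old_flags & PAGE_MAP_ANONYMOUS));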
On 9/6/18 11:34 AM, Laurent Vivier wrote:
> On 27/08/2018 at 10:40, Simon Hausmann wrote:
>> -void page_set_flags(target_ulong start, target_ulong end, int flags)
>> +void page_set_flags(target_ulong start, target_ulong end, int flags,
>> +                    enum page_set_flags_mode mode)
> Is it possible to not add the mode, and only use "flags" to set the new
> flag? Using page_get_flags() to keep flags that needed to be kept?

Hm, the reason I didn't do that is that page_set_flags() operates on a
range while page_get_flags() gets the flags for only one page. If you
prefer not to have the mode, then I can see only two other options:

(1) Assume that the flags are typically homogeneous and use something
    like page_set_flags(start, end, page_get_flags(start) | whatever)

(2) Change the call sites to not use the range-setting ability of
    page_set_flags() but loop themselves.

I don't like either, so I went for the mode. But I can change to any
way you prefer.

>> --- a/bsd-user/mmap.c
>> +++ b/bsd-user/mmap.c
>> @@ -125,7 +125,7 @@ int target_mprotect(abi_ulong start, abi_ulong len, int prot)
>> -    page_set_flags(start, start + len, prot | PAGE_VALID);
>> +    page_set_flags(start, start + len, prot, PAGE_SET_PROTECTION);
> Why do you remove the PAGE_VALID flag?

As the mode is PAGE_SET_PROTECTION, the page_set_flags() call only
affects the protection bits and the existing PAGE_VALID is preserved. I
believe PAGE_VALID must already be set for that range. If that
assumption is wrong, then I could change page_set_flags() to implicitly
set PAGE_VALID when the mode is to apply new protection flags. Would
that be ok?

> ...
>> --- a/linux-user/elfload.c
>> +++ b/linux-user/elfload.c
>> @@ -1702,7 +1702,8 @@ static void zero_bss(abi_ulong elf_bss, abi_ulong last_bss, int prot)
>> -        page_set_flags(elf_bss & TARGET_PAGE_MASK, last_bss, prot | PAGE_VALID);
>> +        page_set_flags(elf_bss & TARGET_PAGE_MASK, last_bss, prot,
>> +                       PAGE_SET_PROTECTION);
> PAGE_VALID?

Oh! You're right, here I mistakenly thought the pages are already
marked valid, but it appears that this is a new mapping, so I'll fix
that to set PAGE_VALID.

>> --- a/linux-user/mmap.c
>> +++ b/linux-user/mmap.c
>> @@ -124,7 +124,7 @@ int target_mprotect(abi_ulong start, abi_ulong len, int prot)
>> -    page_set_flags(start, start + len, prot | PAGE_VALID);
>> +    page_set_flags(start, start + len, prot, PAGE_SET_PROTECTION);
> PAGE_VALID?
> ...

Same - potentially wrong - reasoning as with the other mprotect()
wrapper.

>> +    len = TARGET_PAGE_ALIGN(len);
>> +    start &= TARGET_PAGE_MASK;
>> ...
>> +    if (advice & MADV_DONTNEED) {
>> +        end = start + len;
>> +        for (addr = start; addr < end; addr += TARGET_PAGE_SIZE) {
>> +            if (page_get_flags(addr) & PAGE_MAP_ANONYMOUS) {
>> +                ret = madvise(g2h(addr), TARGET_PAGE_SIZE, MADV_DONTNEED);
> these values must be aligned on host page size.

Sorry, I think I assumed from the earlier statement that the target
page size may be bigger than the host's and that that would be
sufficient for host alignment. I didn't consider that the reverse may
also apply. Would the following be correct?

(1) Use target-aligned start/len for the guest_range_valid() call.

(2) Use host-aligned start/end/increment for the loop (including the
    page_get_flags() call parameters).

Thanks for the feedback :)

Simon
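To make the two points above concrete, here is a rough sketch of what such a
host-page-aligned target_madvise() could look like. It is only an
illustration of the idea under discussion, not the code that was merged: it
relies on the existing QEMU helpers qemu_host_page_size, qemu_host_page_mask,
HOST_PAGE_ALIGN() and g2h(), it rounds the MADV_DONTNEED range inward so
nothing outside the requested range is discarded, and it leaves open how to
treat host pages that mix anonymous and file-backed target pages.

/* Sketch only: validate the target-page-aligned range, but walk it in
 * host-page-sized, host-page-aligned steps so that madvise() always sees
 * properly aligned values. */
int target_madvise(abi_ulong start, abi_ulong len, int advice)
{
    abi_ulong addr, host_start, host_end;

    len = TARGET_PAGE_ALIGN(len);
    start &= TARGET_PAGE_MASK;

    if (!guest_range_valid(start, len)) {
        errno = EINVAL;
        return -1;
    }

    /* Everything except MADV_DONTNEED is only a hint and can be ignored.
     * madvise() advice values are not a bit mask, so compare for equality
     * rather than testing individual bits. */
    if (advice != MADV_DONTNEED) {
        return 0;
    }

    /* Round inward: only host pages fully inside the requested range are
     * dropped, so data outside the range is never discarded.  Partially
     * covered host pages at the edges keep their contents. */
    host_start = HOST_PAGE_ALIGN(start);
    host_end = (start + len) & qemu_host_page_mask;

    for (addr = host_start; addr < host_end; addr += qemu_host_page_size) {
        /* When the host page is larger than the target page, a single host
         * page may contain both anonymous and file-backed target pages;
         * that case still needs a decision and is not handled here. */
        if (page_get_flags(addr) & PAGE_MAP_ANONYMOUS) {
            if (madvise(g2h(addr), qemu_host_page_size, MADV_DONTNEED) != 0) {
                return -1;
            }
        }
    }
    return 0;
}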
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index 898c3bb3d1..467fbd9aeb 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -2481,7 +2481,8 @@ int page_get_flags(target_ulong address)
 /* Modify the flags of a page and invalidate the code if necessary.
    The flag PAGE_WRITE_ORG is positioned automatically depending
    on PAGE_WRITE.  The mmap_lock should already be held.  */
-void page_set_flags(target_ulong start, target_ulong end, int flags)
+void page_set_flags(target_ulong start, target_ulong end, int flags,
+                    enum page_set_flags_mode mode)
 {
     target_ulong addr, len;
 
@@ -2513,7 +2514,10 @@ void page_set_flags(target_ulong start, target_ulong end, int flags)
             p->first_tb) {
             tb_invalidate_phys_page(addr, 0);
         }
-        p->flags = flags;
+        if (mode == PAGE_SET_ALL_FLAGS)
+            p->flags = flags;
+        else /* PAGE_SET_PROTECTION */
+            p->flags |= (p->flags & ~(PAGE_BITS - 1)) | (flags & PAGE_BITS);
     }
 }
 
diff --git a/bsd-user/mmap.c b/bsd-user/mmap.c
index 17f4cd80aa..4bfac81af4 100644
--- a/bsd-user/mmap.c
+++ b/bsd-user/mmap.c
@@ -125,7 +125,7 @@ int target_mprotect(abi_ulong start, abi_ulong len, int prot)
         if (ret != 0)
             goto error;
     }
-    page_set_flags(start, start + len, prot | PAGE_VALID);
+    page_set_flags(start, start + len, prot, PAGE_SET_PROTECTION);
     mmap_unlock();
     return 0;
 error:
@@ -214,7 +214,7 @@ static abi_ulong mmap_find_vma(abi_ulong start, abi_ulong size)
            to mmap.  */
         page_set_flags(last_brk & TARGET_PAGE_MASK,
                        TARGET_PAGE_ALIGN(new_brk),
-                       PAGE_RESERVED);
+                       PAGE_RESERVED, PAGE_SET_ALL_FLAGS);
     }
     last_brk = new_brk;
 
@@ -397,7 +397,7 @@ abi_long target_mmap(abi_ulong start, abi_ulong len, int prot,
         }
     }
 the_end1:
-    page_set_flags(start, start + len, prot | PAGE_VALID);
+    page_set_flags(start, start + len, prot | PAGE_VALID, PAGE_SET_ALL_FLAGS);
 the_end:
 #ifdef DEBUG_MMAP
     printf("ret=0x" TARGET_FMT_lx "\n", start);
@@ -460,7 +460,7 @@ int target_munmap(abi_ulong start, abi_ulong len)
     }
 
     if (ret == 0)
-        page_set_flags(start, start + len, 0);
+        page_set_flags(start, start + len, 0, PAGE_SET_ALL_FLAGS);
     mmap_unlock();
     return ret;
 }
diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
index 117d2fbbca..100388dcd4 100644
--- a/include/exec/cpu-all.h
+++ b/include/exec/cpu-all.h
@@ -249,6 +249,9 @@ extern intptr_t qemu_host_page_mask;
 /* FIXME: Code that sets/uses this is broken and needs to go away.  */
 #define PAGE_RESERVED 0x0020
 #endif
+#if defined(CONFIG_LINUX) && defined(CONFIG_USER_ONLY)
+#define PAGE_MAP_ANONYMOUS 0x0080
+#endif
 
 #if defined(CONFIG_USER_ONLY)
 void page_dump(FILE *f);
@@ -257,8 +260,14 @@ typedef int (*walk_memory_regions_fn)(void *, target_ulong,
                                       target_ulong, unsigned long);
 int walk_memory_regions(void *, walk_memory_regions_fn);
 
+enum page_set_flags_mode {
+    PAGE_SET_ALL_FLAGS,
+    PAGE_SET_PROTECTION
+};
+
 int page_get_flags(target_ulong address);
-void page_set_flags(target_ulong start, target_ulong end, int flags);
+void page_set_flags(target_ulong start, target_ulong end, int flags,
+                    enum page_set_flags_mode);
 int page_check_range(target_ulong start, target_ulong len, int flags);
 #endif
 
diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index 8638612aec..c59cf4359c 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -1702,7 +1702,8 @@ static void zero_bss(abi_ulong elf_bss, abi_ulong last_bss, int prot)
 
     /* Ensure that the bss page(s) are valid */
     if ((page_get_flags(last_bss-1) & prot) != prot) {
-        page_set_flags(elf_bss & TARGET_PAGE_MASK, last_bss, prot | PAGE_VALID);
+        page_set_flags(elf_bss & TARGET_PAGE_MASK, last_bss, prot,
+                       PAGE_SET_PROTECTION);
     }
 
     if (host_start < host_map_start) {
diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index 41e0983ce8..fff29ee04b 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -124,7 +124,7 @@ int target_mprotect(abi_ulong start, abi_ulong len, int prot)
         if (ret != 0)
             goto error;
     }
-    page_set_flags(start, start + len, prot | PAGE_VALID);
+    page_set_flags(start, start + len, prot, PAGE_SET_PROTECTION);
     mmap_unlock();
     return 0;
 error:
@@ -362,6 +362,7 @@ abi_long target_mmap(abi_ulong start, abi_ulong len, int prot,
                      int flags, int fd, abi_ulong offset)
 {
     abi_ulong ret, end, real_start, real_end, retaddr, host_offset, host_len;
+    int page_flags;
 
     mmap_lock();
 #ifdef DEBUG_MMAP
@@ -562,7 +563,11 @@ abi_long target_mmap(abi_ulong start, abi_ulong len, int prot,
         }
     }
 the_end1:
-    page_set_flags(start, start + len, prot | PAGE_VALID);
+    page_flags = prot | PAGE_VALID;
+    if (flags & MAP_ANONYMOUS) {
+        page_flags |= PAGE_MAP_ANONYMOUS;
+    }
+    page_set_flags(start, start + len, page_flags, PAGE_SET_ALL_FLAGS);
 the_end:
 #ifdef DEBUG_MMAP
     printf("ret=0x" TARGET_ABI_FMT_lx "\n", start);
@@ -675,7 +680,7 @@ int target_munmap(abi_ulong start, abi_ulong len)
     }
 
     if (ret == 0) {
-        page_set_flags(start, start + len, 0);
+        page_set_flags(start, start + len, 0, PAGE_SET_ALL_FLAGS);
         tb_invalidate_phys_range(start, start + len);
     }
     mmap_unlock();
@@ -755,10 +760,44 @@ abi_long target_mremap(abi_ulong old_addr, abi_ulong old_size,
     } else {
         new_addr = h2g(host_addr);
         prot = page_get_flags(old_addr);
-        page_set_flags(old_addr, old_addr + old_size, 0);
-        page_set_flags(new_addr, new_addr + new_size, prot | PAGE_VALID);
+        page_set_flags(old_addr, old_addr + old_size, 0,
+                       PAGE_SET_ALL_FLAGS);
+        page_set_flags(new_addr, new_addr + new_size, prot | PAGE_VALID,
+                       PAGE_SET_ALL_FLAGS);
     }
     tb_invalidate_phys_range(new_addr, new_addr + new_size);
     mmap_unlock();
     return new_addr;
 }
+
+int target_madvise(abi_ulong start, abi_ulong len, int advice)
+{
+    abi_ulong end, addr;
+    int ret;
+
+    len = TARGET_PAGE_ALIGN(len);
+    start &= TARGET_PAGE_MASK;
+
+    if (!guest_range_valid(start, len)) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    /* A straight passthrough may not be safe because qemu sometimes
+       turns private file-backed mappings into anonymous mappings.
+       Most flags are hints so we ignore them.
+       One exception is made for MADV_DONTNEED on anonymous mappings,
+       that applications may rely on to zero out pages. */
+    if (advice & MADV_DONTNEED) {
+        end = start + len;
+        for (addr = start; addr < end; addr += TARGET_PAGE_SIZE) {
+            if (page_get_flags(addr) & PAGE_MAP_ANONYMOUS) {
+                ret = madvise(g2h(addr), TARGET_PAGE_SIZE, MADV_DONTNEED);
+                if (ret != 0) {
+                    return ret;
+                }
+            }
+        }
+    }
+    return 0;
+}
diff --git a/linux-user/qemu.h b/linux-user/qemu.h
index b4959e41c6..e5ac5e9c6a 100644
--- a/linux-user/qemu.h
+++ b/linux-user/qemu.h
@@ -437,6 +437,7 @@ int target_munmap(abi_ulong start, abi_ulong len);
 abi_long target_mremap(abi_ulong old_addr, abi_ulong old_size,
                        abi_ulong new_size, unsigned long flags,
                        abi_ulong new_addr);
+int target_madvise(abi_ulong start, abi_ulong len, int advice);
 extern unsigned long last_brk;
 extern abi_ulong mmap_next_start;
 abi_ulong mmap_find_vma(abi_ulong, abi_ulong);
diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 850b72a0c7..4478de5fc8 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -5124,7 +5124,8 @@ static inline abi_ulong do_shmat(CPUArchState *cpu_env,
 
     page_set_flags(raddr, raddr + shm_info.shm_segsz,
                    PAGE_VALID | PAGE_READ |
-                   ((shmflg & SHM_RDONLY)? 0 : PAGE_WRITE));
+                   ((shmflg & SHM_RDONLY) ? 0 : PAGE_WRITE),
+                   PAGE_SET_ALL_FLAGS);
 
     for (i = 0; i < N_SHM_REGIONS; i++) {
         if (!shm_regions[i].in_use) {
@@ -5150,7 +5151,8 @@ static inline abi_long do_shmdt(abi_ulong shmaddr)
     for (i = 0; i < N_SHM_REGIONS; ++i) {
         if (shm_regions[i].in_use && shm_regions[i].start == shmaddr) {
             shm_regions[i].in_use = false;
-            page_set_flags(shmaddr, shmaddr + shm_regions[i].size, 0);
+            page_set_flags(shmaddr, shmaddr + shm_regions[i].size, 0,
+                           PAGE_SET_ALL_FLAGS);
             break;
         }
     }
@@ -11559,11 +11561,7 @@ static abi_long do_syscall1(void *cpu_env, int num, abi_long arg1,
 
 #ifdef TARGET_NR_madvise
     case TARGET_NR_madvise:
-        /* A straight passthrough may not be safe because qemu sometimes
-           turns private file-backed mappings into anonymous mappings.
-           This will break MADV_DONTNEED.
-           This is a hint, so ignoring and returning success is ok.  */
-        return 0;
+        return get_errno(target_madvise(arg1, arg2, arg3));
 #endif
 #if TARGET_ABI_BITS == 32
     case TARGET_NR_fcntl64:
Most flags to madvise() are just hints, so typically ignoring the
syscall and returning okay is fine. However, applications exist that
rely on MADV_DONTNEED behavior to guarantee that upon subsequent access
the mapping is refreshed from the backing file, or zeroed for anonymous
mappings. For file-backed mappings this is tricky - hence the original
comment - but for anonymous mappings it is safe to forward the advice.
We just need to remember which mappings those are, which is now stored
in the page flags.

Cc: Riku Voipio <riku.voipio@iki.fi>
Cc: Laurent Vivier <laurent@vivier.eu>
Cc: Sami Nurmenniemi <sami.nurmenniemi@qt.io>
Signed-off-by: Simon Hausmann <simon.hausmann@qt.io>

--
v2:
  - align start and len on host page size
v3:
  - align start and len to target page size as it may be bigger than
    the host's
  - use immediate return in do_syscall
  - forward MADV_DONTNEED advice only for anonymously mapped pages
  - remember which mappings are anonymous in the page flags and preserve
    those bits across mprotect calls
---
 accel/tcg/translate-all.c |  8 +++++--
 bsd-user/mmap.c           |  8 +++----
 include/exec/cpu-all.h    | 11 ++++++++-
 linux-user/elfload.c      |  3 ++-
 linux-user/mmap.c         | 49 +++++++++++++++++++++++++++++++++++----
 linux-user/qemu.h         |  1 +
 linux-user/syscall.c      | 12 ++++------
 7 files changed, 72 insertions(+), 20 deletions(-)
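For reference, the guest-visible behavior at stake can be demonstrated with a
small standalone Linux test program (illustrative only, not part of the
patch). Under the old "return 0" emulation of madvise() the final check may
observe stale data, while on a real kernel the page reads back as zeros,
which is the guarantee this patch aims to preserve for anonymous mappings
under QEMU's user-mode emulation.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 4096;
    unsigned char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        return 1;
    }

    memset(p, 0xff, len);               /* dirty the page */
    if (madvise(p, len, MADV_DONTNEED) != 0) {
        return 1;
    }

    /* For a private anonymous mapping, the kernel guarantees that the
     * next access after MADV_DONTNEED sees zero-filled pages. */
    printf("%s\n", p[0] == 0 ? "zeroed as expected" : "stale data!");

    munmap(p, len);
    return 0;
}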