diff mbox series

[V9,4/6] util/mmap-alloc: support MAP_SYNC in qemu_ram_mmap()

Message ID 64bea1ff5f80647cc4592ee94d399d647bdd9862.1547624239.git.yi.z.zhang@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series support MAP_SYNC for memory-backend-file | expand

Commit Message

Zhang, Yi Jan. 16, 2019, 8:10 a.m. UTC
When a file supporting DAX is used as a vNVDIMM backend, additionally
mmap it with the MAP_SYNC flag, which ensures that file system metadata
is synced on each guest write to the backend file, without other QEMU
actions (e.g., periodic fsync() by QEMU).

Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
Signed-off-by: Zhang Yi <yi.z.zhang@linux.intel.com>
---
 include/qemu/mmap-alloc.h |  1 +
 include/qemu/osdep.h      | 16 ++++++++++++++++
 util/mmap-alloc.c         |  7 ++++++-
 3 files changed, 23 insertions(+), 1 deletion(-)

Comments

Michael S. Tsirkin Jan. 16, 2019, 3:58 p.m. UTC | #1
On Wed, Jan 16, 2019 at 04:10:58PM +0800, Zhang Yi wrote:
> When a file supporting DAX is used as vNVDIMM backend, mmap it with
> MAP_SYNC flag in addition which can ensure file system metadata
> synced in each guest writes to the backend file, without other QEMU
> actions (e.g., periodic fsync() by QEMU).
> 
> Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
> Signed-off-by: Zhang Yi <yi.z.zhang@linux.intel.com>
> ---
>  include/qemu/mmap-alloc.h |  1 +
>  include/qemu/osdep.h      | 16 ++++++++++++++++
>  util/mmap-alloc.c         |  7 ++++++-
>  3 files changed, 23 insertions(+), 1 deletion(-)
> 
> diff --git a/include/qemu/mmap-alloc.h b/include/qemu/mmap-alloc.h
> index 6fe6ed4..a95d91c 100644
> --- a/include/qemu/mmap-alloc.h
> +++ b/include/qemu/mmap-alloc.h
> @@ -18,6 +18,7 @@ size_t qemu_mempath_getpagesize(const char *mem_path);
>   *  @flags: specifies additional properties of the mapping, which can be one or
>   *          bit-or of following values
>   *          - RAM_SHARED: mmap with MAP_SHARED flag
> + *          - RAM_PMEM: mmap with MAP_SYNC flag
>   *          Other bits are ignored.
>   *
>   * Return:
> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
> index 457d24e..27a6bfe 100644
> --- a/include/qemu/osdep.h
> +++ b/include/qemu/osdep.h
> @@ -419,6 +419,22 @@ void qemu_anon_ram_free(void *ptr, size_t size);
>  #  define QEMU_VMALLOC_ALIGN getpagesize()
>  #endif
>  
> +/*
> + * MAP_SHARED_VALIDATE and MAP_SYNC are introduced in Linux kernel
> + * 4.15, so they may not be defined when compiling on older kernels.
> + */
> +#ifdef CONFIG_LINUX
> +
> +#include <asm-generic/mman.h>

I suspect this is a wrong way to pull in this header.

You are normally supposed to use
       #include <linux/mman.h>

but see below.


> +
> +#ifndef MAP_SYNC
> +#define MAP_SYNC 0x0
> +#endif

Oh that's bad.

So if you run with a new kernel but
your installed headers are old, you get MAP_SYNC 0
and no persistence transparently with no warning.



> +
> +#else  /* !CONFIG_LINUX */
> +#define MAP_SYNC              0x0
> +#endif /* CONFIG_LINUX */
> +
>  #ifdef CONFIG_POSIX
>  struct qemu_signalfd_siginfo {
>      uint32_t ssi_signo;   /* Signal number */
> diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c
> index 8f0a740..cba961c 100644
> --- a/util/mmap-alloc.c
> +++ b/util/mmap-alloc.c
> @@ -99,6 +99,8 @@ void *qemu_ram_mmap(int fd, size_t size, size_t align, uint32_t flags)
>      void *ptr = mmap(0, total, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
>  #endif
>      bool shared = flags & RAM_SHARED;
> +    bool is_pmem = flags & RAM_PMEM;
> +    int mmap_xflags = 0;
>      size_t offset;
>      void *ptr1;
>  
> @@ -109,12 +111,15 @@ void *qemu_ram_mmap(int fd, size_t size, size_t align, uint32_t flags)
>      assert(is_power_of_2(align));
>      /* Always align to host page size */
>      assert(align >= getpagesize());
> +    if (shared && is_pmem) {
> +        mmap_xflags |= MAP_SYNC;
> +    }
>  
>      offset = QEMU_ALIGN_UP((uintptr_t)ptr, align) - (uintptr_t)ptr;
>      ptr1 = mmap(ptr + offset, size, PROT_READ | PROT_WRITE,
>                  MAP_FIXED |
>                  (fd == -1 ? MAP_ANONYMOUS : 0) |
> -                (shared ? MAP_SHARED : MAP_PRIVATE),
> +                (shared ? MAP_SHARED : MAP_PRIVATE) | mmap_xflags,
>                  fd, 0);
>      if (ptr1 == MAP_FAILED) {
>          munmap(ptr, total);
> -- 
> 2.7.4
Eduardo Habkost Jan. 18, 2019, 6:11 p.m. UTC | #2
On Wed, Jan 16, 2019 at 10:58:44AM -0500, Michael S. Tsirkin wrote:
> On Wed, Jan 16, 2019 at 04:10:58PM +0800, Zhang Yi wrote:
> > When a file supporting DAX is used as vNVDIMM backend, mmap it with
> > MAP_SYNC flag in addition which can ensure file system metadata
> > synced in each guest writes to the backend file, without other QEMU
> > actions (e.g., periodic fsync() by QEMU).
> > 
> > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
> > Signed-off-by: Zhang Yi <yi.z.zhang@linux.intel.com>
> > ---
> >  include/qemu/mmap-alloc.h |  1 +
> >  include/qemu/osdep.h      | 16 ++++++++++++++++
> >  util/mmap-alloc.c         |  7 ++++++-
> >  3 files changed, 23 insertions(+), 1 deletion(-)
> > 
> > diff --git a/include/qemu/mmap-alloc.h b/include/qemu/mmap-alloc.h
> > index 6fe6ed4..a95d91c 100644
> > --- a/include/qemu/mmap-alloc.h
> > +++ b/include/qemu/mmap-alloc.h
> > @@ -18,6 +18,7 @@ size_t qemu_mempath_getpagesize(const char *mem_path);
> >   *  @flags: specifies additional properties of the mapping, which can be one or
> >   *          bit-or of following values
> >   *          - RAM_SHARED: mmap with MAP_SHARED flag
> > + *          - RAM_PMEM: mmap with MAP_SYNC flag
> >   *          Other bits are ignored.
> >   *
> >   * Return:
> > diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
> > index 457d24e..27a6bfe 100644
> > --- a/include/qemu/osdep.h
> > +++ b/include/qemu/osdep.h
> > @@ -419,6 +419,22 @@ void qemu_anon_ram_free(void *ptr, size_t size);
> >  #  define QEMU_VMALLOC_ALIGN getpagesize()
> >  #endif
> >  
> > +/*
> > + * MAP_SHARED_VALIDATE and MAP_SYNC are introduced in Linux kernel
> > + * 4.15, so they may not be defined when compiling on older kernels.
> > + */
> > +#ifdef CONFIG_LINUX
> > +
> > +#include <asm-generic/mman.h>
> 
> I suspect this is a wrong way to pull in this header.
> 
> You are normally supposed to use
>        #include <linux/mman.h>
> 
> but see below.
> 
> 
> > +
> > +#ifndef MAP_SYNC
> > +#define MAP_SYNC 0x0
> > +#endif
> 
> Oh that's bad.
> 
> So if you run with a new kernel but
> your installed headers are old, you get MAP_SYNC 0
> and no persistence transparently with no warning.

Yes. The semantics of the command-line should not change depending on
build time circumstances.

Anyway, I see a more fundamental problem in each version of this
patch: the semantics of the command-line options are not clearly
documented.

We have at least 3 different possible use cases we might need to
support:

1) pmem=on, MAP_SYNC not desired
2) pmem=on, MAP_SYNC desired but optional
3) pmem=on, MAP_SYNC required, not optional

Which cases from the list above we need to support?

From the cases above, what's the expected semantics of "pmem=on"
with no extra options?

If these questions are not answered (in the commit message and
user documentation), we won't be able to review and discuss the
code.


> 
> > +
> > +#else  /* !CONFIG_LINUX */
> > +#define MAP_SYNC              0x0
> > +#endif /* CONFIG_LINUX */
> > +
> >  #ifdef CONFIG_POSIX
> >  struct qemu_signalfd_siginfo {
> >      uint32_t ssi_signo;   /* Signal number */
> > diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c
> > index 8f0a740..cba961c 100644
> > --- a/util/mmap-alloc.c
> > +++ b/util/mmap-alloc.c
> > @@ -99,6 +99,8 @@ void *qemu_ram_mmap(int fd, size_t size, size_t align, uint32_t flags)
> >      void *ptr = mmap(0, total, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
> >  #endif
> >      bool shared = flags & RAM_SHARED;
> > +    bool is_pmem = flags & RAM_PMEM;
> > +    int mmap_xflags = 0;
> >      size_t offset;
> >      void *ptr1;
> >  
> > @@ -109,12 +111,15 @@ void *qemu_ram_mmap(int fd, size_t size, size_t align, uint32_t flags)
> >      assert(is_power_of_2(align));
> >      /* Always align to host page size */
> >      assert(align >= getpagesize());
> > +    if (shared && is_pmem) {
> > +        mmap_xflags |= MAP_SYNC;
> > +    }
> >  
> >      offset = QEMU_ALIGN_UP((uintptr_t)ptr, align) - (uintptr_t)ptr;
> >      ptr1 = mmap(ptr + offset, size, PROT_READ | PROT_WRITE,
> >                  MAP_FIXED |
> >                  (fd == -1 ? MAP_ANONYMOUS : 0) |
> > -                (shared ? MAP_SHARED : MAP_PRIVATE),
> > +                (shared ? MAP_SHARED : MAP_PRIVATE) | mmap_xflags,
> >                  fd, 0);
> >      if (ptr1 == MAP_FAILED) {
> >          munmap(ptr, total);
> > -- 
> > 2.7.4
>
Zhang, Yi Jan. 21, 2019, 5:15 a.m. UTC | #3
On 2019-01-18 at 16:11:47 -0200, Eduardo Habkost wrote:
> On Wed, Jan 16, 2019 at 10:58:44AM -0500, Michael S. Tsirkin wrote:
> > On Wed, Jan 16, 2019 at 04:10:58PM +0800, Zhang Yi wrote:
> > > When a file supporting DAX is used as vNVDIMM backend, mmap it with
> > > MAP_SYNC flag in addition which can ensure file system metadata
> > > synced in each guest writes to the backend file, without other QEMU
> > > actions (e.g., periodic fsync() by QEMU).
> > > 
> > > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
> > > Signed-off-by: Zhang Yi <yi.z.zhang@linux.intel.com>
> > > ---
> > >  include/qemu/mmap-alloc.h |  1 +
> > >  include/qemu/osdep.h      | 16 ++++++++++++++++
> > >  util/mmap-alloc.c         |  7 ++++++-
> > >  3 files changed, 23 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/include/qemu/mmap-alloc.h b/include/qemu/mmap-alloc.h
> > > index 6fe6ed4..a95d91c 100644
> > > --- a/include/qemu/mmap-alloc.h
> > > +++ b/include/qemu/mmap-alloc.h
> > > @@ -18,6 +18,7 @@ size_t qemu_mempath_getpagesize(const char *mem_path);
> > >   *  @flags: specifies additional properties of the mapping, which can be one or
> > >   *          bit-or of following values
> > >   *          - RAM_SHARED: mmap with MAP_SHARED flag
> > > + *          - RAM_PMEM: mmap with MAP_SYNC flag
> > >   *          Other bits are ignored.
> > >   *
> > >   * Return:
> > > diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
> > > index 457d24e..27a6bfe 100644
> > > --- a/include/qemu/osdep.h
> > > +++ b/include/qemu/osdep.h
> > > @@ -419,6 +419,22 @@ void qemu_anon_ram_free(void *ptr, size_t size);
> > >  #  define QEMU_VMALLOC_ALIGN getpagesize()
> > >  #endif
> > >  
> > > +/*
> > > + * MAP_SHARED_VALIDATE and MAP_SYNC are introduced in Linux kernel
> > > + * 4.15, so they may not be defined when compiling on older kernels.
> > > + */
> > > +#ifdef CONFIG_LINUX
> > > +
> > > +#include <asm-generic/mman.h>
> > 
> > I suspect this is a wrong way to pull in this header.
> > 
> > You are normally supposed to use
> >        #include <linux/mman.h>
> > 
> > but see below.
> > 
> > 
> > > +
> > > +#ifndef MAP_SYNC
> > > +#define MAP_SYNC 0x0
> > > +#endif
> > 
> > Oh that's bad.
> > 
> > So if you run with a new kernel but
> > your installed headers are old, you get MAP_SYNC 0
> > and no persistence transparently with no warning.
> 
> Yes. The semantics of the command-line to not change depending on
> build time circumstances.
> 
> Anyway, I see a more fundamental problem in each version of this
> patch: the semantics of the command-line options are not clearly
> documented.
> 
> We have at least 3 different possible use cases we might need to
> support:
> 
> 1) pmem=on, MAP_SYNC not desired
> 2) pmem=on, MAP_SYNC desired but optional

From V9 on, as Michael suggested, we removed the sync option; MAP_SYNC is
forced on whenever pmem=on is set. So we only have 2 use cases, and I will
update the user documentation accordingly.
1) pmem=on, MAP_SYNC not desired
We will not pass the flag to mmap2
2) pmem=on, MAP_SYNC desired
We will pass the flag to mmap2

> 3) pmem=on, MAP_SYNC required, not optional
> 
> Which cases from the list above we need to support?
> 
> From the cases above, what's the expected semantics of "pmem=on"
> with no extra options?

> 
> If these questions are not answered (in the commit message and
> user documentation), we won't be able to review and discuss the
> code.
> 
> 
> > 
> > > +
> > > +#else  /* !CONFIG_LINUX */
> > > +#define MAP_SYNC              0x0
> > > +#endif /* CONFIG_LINUX */
> > > +
> > >  #ifdef CONFIG_POSIX
> > >  struct qemu_signalfd_siginfo {
> > >      uint32_t ssi_signo;   /* Signal number */
> > > diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c
> > > index 8f0a740..cba961c 100644
> > > --- a/util/mmap-alloc.c
> > > +++ b/util/mmap-alloc.c
> > > @@ -99,6 +99,8 @@ void *qemu_ram_mmap(int fd, size_t size, size_t align, uint32_t flags)
> > >      void *ptr = mmap(0, total, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
> > >  #endif
> > >      bool shared = flags & RAM_SHARED;
> > > +    bool is_pmem = flags & RAM_PMEM;
> > > +    int mmap_xflags = 0;
> > >      size_t offset;
> > >      void *ptr1;
> > >  
> > > @@ -109,12 +111,15 @@ void *qemu_ram_mmap(int fd, size_t size, size_t align, uint32_t flags)
> > >      assert(is_power_of_2(align));
> > >      /* Always align to host page size */
> > >      assert(align >= getpagesize());
> > > +    if (shared && is_pmem) {
> > > +        mmap_xflags |= MAP_SYNC;
> > > +    }
> > >  
> > >      offset = QEMU_ALIGN_UP((uintptr_t)ptr, align) - (uintptr_t)ptr;
> > >      ptr1 = mmap(ptr + offset, size, PROT_READ | PROT_WRITE,
> > >                  MAP_FIXED |
> > >                  (fd == -1 ? MAP_ANONYMOUS : 0) |
> > > -                (shared ? MAP_SHARED : MAP_PRIVATE),
> > > +                (shared ? MAP_SHARED : MAP_PRIVATE) | mmap_xflags,
> > >                  fd, 0);
> > >      if (ptr1 == MAP_FAILED) {
> > >          munmap(ptr, total);
> > > -- 
> > > 2.7.4
> > 
> 
> -- 
> Eduardo
Eduardo Habkost Jan. 21, 2019, 2:44 p.m. UTC | #4
On Mon, Jan 21, 2019 at 01:15:36PM +0800, Yi Zhang wrote:
> On 2019-01-18 at 16:11:47 -0200, Eduardo Habkost wrote:
[...]
> > Anyway, I see a more fundamental problem in each version of this
> > patch: the semantics of the command-line options are not clearly
> > documented.
> > 
> > We have at least 3 different possible use cases we might need to
> > support:
> > 
> > 1) pmem=on, MAP_SYNC not desired
> > 2) pmem=on, MAP_SYNC desired but optional
> 
> Form V9, As Michael suggest, We removed the sync option, MAP_SYNC will
> force on while we set pmem=on. So we only have 2 user cases, Will update
> to user documentation.
> 1) pmem=on, MAP_SYNC not desired
> We will not pass the flag to mmap2

If this use case is supported, what should the command-line look
like to enable it?


> 2) pmem=on, MAP_SYNC desired
> We will pass the flag to mmap2

Same question as above: what should the command-line look like for
this use case?

> 
> > 3) pmem=on, MAP_SYNC required, not optional
> > 
> > Which cases from the list above we need to support?
> > 
> > From the cases above, what's the expected semantics of "pmem=on"
> > with no extra options?

We still need to answer that question.

The current semantics of pmem=on (with no extra options) is (1).
It looks like we can't change it to (2) without breaking existing
configurations.  If you make existing configurations stop working
on hosts where they currently work, you need to explain why it's
OK to do that.


> > 
> > If these questions are not answered (in the commit message and
> > user documentation), we won't be able to review and discuss the
> > code.
> > 
> > 
[...]
Zhang, Yi Jan. 22, 2019, 3:21 a.m. UTC | #5
On 2019-01-21 at 12:44:00 -0200, Eduardo Habkost wrote:
> On Mon, Jan 21, 2019 at 01:15:36PM +0800, Yi Zhang wrote:
> > On 2019-01-18 at 16:11:47 -0200, Eduardo Habkost wrote:
> [...]
> > > Anyway, I see a more fundamental problem in each version of this
> > > patch: the semantics of the command-line options are not clearly
> > > documented.
> > > 
> > > We have at least 3 different possible use cases we might need to
> > > support:
> > > 
> > > 1) pmem=on, MAP_SYNC not desired
> > > 2) pmem=on, MAP_SYNC desired but optional
> > 
> > Form V9, As Michael suggest, We removed the sync option, MAP_SYNC will
> > force on while we set pmem=on. So we only have 2 user cases, Will update
> > to user documentation.
> > 1) pmem=on, MAP_SYNC not desired
> > We will not pass the flag to mmap2
> 
> If this use case is supported, how the command-line should look
> like to enable it?
> 
> 
> > 2) pmem=on, MAP_SYNC desired
> > We will pass the flag to mmap2
> 
> Same question as above: how the command-line should look like for
> this use case?
Sorry, I misunderstood what "MAP_SYNC desired" meant.
As we discussed with Michael:

we give up a bit of flexibility, and just say that
pmem=on forces MAP_SYNC on MAP_SYNC-capable configurations (kernel +
DAX backend).

The current use cases are as follows:

 1. pmem=on is set, shared=on is set, MAP_SYNC supported in linux kernel:
        a: backend is a dax supporting file.
	 - MAP_SYNC will be active.
	b: backend is not a dax supporting file.
	 - mmap will result in an EOPNOTSUPP error.

2. The rest of the cases:
	- we will never pass MAP_SYNC to mmap2

> 
> > 
> > > 3) pmem=on, MAP_SYNC required, not optional
> > > 
> > > Which cases from the list above we need to support?
> > > 
> > > From the cases above, what's the expected semantics of "pmem=on"
> > > with no extra options?
> 
> We still need to answer that question.
> 
> The current semantics of pmem=on (with no extra options) is (1).
> It looks like we can't change it to (2) without breaking existing
> configurations.  If you make existing configurations stop working
> on hosts where they currently work, you need to explain why it's
> OK to do that.
> 
> 
> > > 
> > > If these questions are not answered (in the commit message and
> > > user documentation), we won't be able to review and discuss the
> > > code.
> > > 
> > > 
> [...]
> 
> 
> -- 
> Eduardo
>
Michael S. Tsirkin Jan. 22, 2019, 3:27 a.m. UTC | #6
On Tue, Jan 22, 2019 at 11:21:25AM +0800, Yi Zhang wrote:
> On 2019-01-21 at 12:44:00 -0200, Eduardo Habkost wrote:
> > On Mon, Jan 21, 2019 at 01:15:36PM +0800, Yi Zhang wrote:
> > > On 2019-01-18 at 16:11:47 -0200, Eduardo Habkost wrote:
> > [...]
> > > > Anyway, I see a more fundamental problem in each version of this
> > > > patch: the semantics of the command-line options are not clearly
> > > > documented.
> > > > 
> > > > We have at least 3 different possible use cases we might need to
> > > > support:
> > > > 
> > > > 1) pmem=on, MAP_SYNC not desired
> > > > 2) pmem=on, MAP_SYNC desired but optional
> > > 
> > > Form V9, As Michael suggest, We removed the sync option, MAP_SYNC will
> > > force on while we set pmem=on. So we only have 2 user cases, Will update
> > > to user documentation.
> > > 1) pmem=on, MAP_SYNC not desired
> > > We will not pass the flag to mmap2
> > 
> > If this use case is supported, how the command-line should look
> > like to enable it?
> > 
> > 
> > > 2) pmem=on, MAP_SYNC desired
> > > We will pass the flag to mmap2
> > 
> > Same question as above: how the command-line should look like for
> > this use case?
> Sorry, I got some miss-understood of the MAP_SYNC desired.
> As we talk with Micheal:
> 
> we give up on a bit of flexibility, and just say
> pmem=on forces MAP_SYNC. on a MAP_SYNC capable configrations(kernel+
> backend dax)
> 
> Current user case is like below:
> 
>  1. pmem=on is set, shared=on is set, MAP_SYNC supported in linux kernel:
>         a: backend is a dax supporting file.
> 	 - MAP_SYNC will active.
> 	b: backedn is not a dax supporting file.
> 	 - mmap will result in an EOPNOTSUPP error.
> 
> 2. The reset of cases:
> 	- we will never pass the MAP_SYNC to mmap2

I don't see code probing for MAP_SYNC support. Did I miss it?
But if all you want is to have old linux ignore MAP_SYNC,
I think you got your wish automatically - just do not set
MAP_SHARED_VALIDATE.


> > 
> > > 
> > > > 3) pmem=on, MAP_SYNC required, not optional
> > > > 
> > > > Which cases from the list above we need to support?
> > > > 
> > > > From the cases above, what's the expected semantics of "pmem=on"
> > > > with no extra options?
> > 
> > We still need to answer that question.
> > 
> > The current semantics of pmem=on (with no extra options) is (1).
> > It looks like we can't change it to (2) without breaking existing
> > configurations.  If you make existing configurations stop working
> > on hosts where they currently work, you need to explain why it's
> > OK to do that.
> > 
> > 
> > > > 
> > > > If these questions are not answered (in the commit message and
> > > > user documentation), we won't be able to review and discuss the
> > > > code.
> > > > 
> > > > 
> > [...]
> > 
> > 
> > -- 
> > Eduardo
> >
Dan Williams Jan. 22, 2019, 5:33 p.m. UTC | #7
On Mon, Jan 21, 2019 at 7:27 PM Michael S. Tsirkin <mst@redhat.com> wrote:
[..]
> > 2. The reset of cases:
> >       - we will never pass the MAP_SYNC to mmap2
>
> I don't see code probing for MAP_SYNC support. Did I miss it?
> But if all you want is to have old linux ignore MAP_SYNC,
> I think you got your wish automatically - just do not set
> MAP_SHARED_VALIDATE.

That will also cause new Linux to ignore MAP_SYNC.
Michael S. Tsirkin Jan. 22, 2019, 6:47 p.m. UTC | #8
On Tue, Jan 22, 2019 at 09:33:37AM -0800, Dan Williams wrote:
> On Mon, Jan 21, 2019 at 7:27 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> [..]
> > > 2. The reset of cases:
> > >       - we will never pass the MAP_SYNC to mmap2
> >
> > I don't see code probing for MAP_SYNC support. Did I miss it?
> > But if all you want is to have old linux ignore MAP_SYNC,
> > I think you got your wish automatically - just do not set
> > MAP_SHARED_VALIDATE.
> 
> That will also cause new Linux to ignore MAP_SYNC.

Oh you are right. I missed this point.

And given that these patches do not seem to set MAP_SHARED_VALIDATE
at all, I conclude that even though they set MAP_SYNC,
it actually has no effect at all.

So I wonder how they were tested.
Would the contributors care to elaborate?
That would be good info to put in the commit log message.
diff mbox series

Patch

diff --git a/include/qemu/mmap-alloc.h b/include/qemu/mmap-alloc.h
index 6fe6ed4..a95d91c 100644
--- a/include/qemu/mmap-alloc.h
+++ b/include/qemu/mmap-alloc.h
@@ -18,6 +18,7 @@  size_t qemu_mempath_getpagesize(const char *mem_path);
  *  @flags: specifies additional properties of the mapping, which can be one or
  *          bit-or of following values
  *          - RAM_SHARED: mmap with MAP_SHARED flag
+ *          - RAM_PMEM: mmap with MAP_SYNC flag
  *          Other bits are ignored.
  *
  * Return:
diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index 457d24e..27a6bfe 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -419,6 +419,22 @@  void qemu_anon_ram_free(void *ptr, size_t size);
 #  define QEMU_VMALLOC_ALIGN getpagesize()
 #endif
 
+/*
+ * MAP_SHARED_VALIDATE and MAP_SYNC are introduced in Linux kernel
+ * 4.15, so they may not be defined when compiling on older kernels.
+ */
+#ifdef CONFIG_LINUX
+
+#include <asm-generic/mman.h>
+
+#ifndef MAP_SYNC
+#define MAP_SYNC 0x0
+#endif
+
+#else  /* !CONFIG_LINUX */
+#define MAP_SYNC              0x0
+#endif /* CONFIG_LINUX */
+
 #ifdef CONFIG_POSIX
 struct qemu_signalfd_siginfo {
     uint32_t ssi_signo;   /* Signal number */
diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c
index 8f0a740..cba961c 100644
--- a/util/mmap-alloc.c
+++ b/util/mmap-alloc.c
@@ -99,6 +99,8 @@  void *qemu_ram_mmap(int fd, size_t size, size_t align, uint32_t flags)
     void *ptr = mmap(0, total, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
 #endif
     bool shared = flags & RAM_SHARED;
+    bool is_pmem = flags & RAM_PMEM;
+    int mmap_xflags = 0;
     size_t offset;
     void *ptr1;
 
@@ -109,12 +111,15 @@  void *qemu_ram_mmap(int fd, size_t size, size_t align, uint32_t flags)
     assert(is_power_of_2(align));
     /* Always align to host page size */
     assert(align >= getpagesize());
+    if (shared && is_pmem) {
+        mmap_xflags |= MAP_SYNC;
+    }
 
     offset = QEMU_ALIGN_UP((uintptr_t)ptr, align) - (uintptr_t)ptr;
     ptr1 = mmap(ptr + offset, size, PROT_READ | PROT_WRITE,
                 MAP_FIXED |
                 (fd == -1 ? MAP_ANONYMOUS : 0) |
-                (shared ? MAP_SHARED : MAP_PRIVATE),
+                (shared ? MAP_SHARED : MAP_PRIVATE) | mmap_xflags,
                 fd, 0);
     if (ptr1 == MAP_FAILED) {
         munmap(ptr, total);