diff mbox series

[v2,23/26] userfaultfd: wp: don't wake up when doing write protect

Message ID 20190212025632.28946-24-peterx@redhat.com (mailing list archive)
State New, archived
Headers show
Series userfaultfd: write protection support | expand

Commit Message

Peter Xu Feb. 12, 2019, 2:56 a.m. UTC
It does not make sense to try to wake up any waiting thread when we're
write-protecting a memory region.  Only wake up when resolving a write
protected page fault.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 fs/userfaultfd.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

Comments

Jerome Glisse Feb. 21, 2019, 6:36 p.m. UTC | #1
On Tue, Feb 12, 2019 at 10:56:29AM +0800, Peter Xu wrote:
> It does not make sense to try to wake up any waiting thread when we're
> write-protecting a memory region.  Only wake up when resolving a write
> protected page fault.
> 
> Signed-off-by: Peter Xu <peterx@redhat.com>

I am a bit confused here, see below.

> ---
>  fs/userfaultfd.c | 13 ++++++++-----
>  1 file changed, 8 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 81962d62520c..f1f61a0278c2 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -1771,6 +1771,7 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
>  	struct uffdio_writeprotect uffdio_wp;
>  	struct uffdio_writeprotect __user *user_uffdio_wp;
>  	struct userfaultfd_wake_range range;
> +	bool mode_wp, mode_dontwake;
>  
>  	if (READ_ONCE(ctx->mmap_changing))
>  		return -EAGAIN;
> @@ -1789,18 +1790,20 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
>  	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
>  			       UFFDIO_WRITEPROTECT_MODE_WP))
>  		return -EINVAL;
> -	if ((uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP) &&
> -	     (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE))
> +
> +	mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
> +	mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
> +
> +	if (mode_wp && mode_dontwake)
>  		return -EINVAL;

I am confused by the logic here. DONTWAKE means do not wake any waiting
thread, right? So going by the patch header, it seems to me the logic
should be:
    if (mode_wp && !mode_dontwake)
        return -EINVAL;

At the very least, this part does seem to mean the opposite of what the
commit message says.

>  
>  	ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
> -				  uffdio_wp.range.len, uffdio_wp.mode &
> -				  UFFDIO_WRITEPROTECT_MODE_WP,
> +				  uffdio_wp.range.len, mode_wp,
>  				  &ctx->mmap_changing);
>  	if (ret)
>  		return ret;
>  
> -	if (!(uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE)) {
> +	if (!mode_wp && !mode_dontwake) {

This part matches the commit message :)

>  		range.start = uffdio_wp.range.start;
>  		range.len = uffdio_wp.range.len;
>  		wake_userfault(ctx, &range);
Peter Xu Feb. 25, 2019, 8:58 a.m. UTC | #2
On Thu, Feb 21, 2019 at 01:36:54PM -0500, Jerome Glisse wrote:
> On Tue, Feb 12, 2019 at 10:56:29AM +0800, Peter Xu wrote:
> > It does not make sense to try to wake up any waiting thread when we're
> > write-protecting a memory region.  Only wake up when resolving a write
> > protected page fault.
> > 
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> 
> I am bit confuse here, see below.
> 
> > ---
> >  fs/userfaultfd.c | 13 ++++++++-----
> >  1 file changed, 8 insertions(+), 5 deletions(-)
> > 
> > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > index 81962d62520c..f1f61a0278c2 100644
> > --- a/fs/userfaultfd.c
> > +++ b/fs/userfaultfd.c
> > @@ -1771,6 +1771,7 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
> >  	struct uffdio_writeprotect uffdio_wp;
> >  	struct uffdio_writeprotect __user *user_uffdio_wp;
> >  	struct userfaultfd_wake_range range;
> > +	bool mode_wp, mode_dontwake;
> >  
> >  	if (READ_ONCE(ctx->mmap_changing))
> >  		return -EAGAIN;
> > @@ -1789,18 +1790,20 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
> >  	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
> >  			       UFFDIO_WRITEPROTECT_MODE_WP))
> >  		return -EINVAL;
> > -	if ((uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP) &&
> > -	     (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE))

[1]

> > +
> > +	mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
> > +	mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
> > +
> > +	if (mode_wp && mode_dontwake)

[2]

> >  		return -EINVAL;
> 
> I am confuse by the logic here. DONTWAKE means do not wake any waiting
> thread right ? So if the patch header it seems to me the logic should
> be:
>     if (mode_wp && !mode_dontwake)
>         return -EINVAL;

That combination (mode_wp && !mode_dontwake) should be the most common
case when we want to write protect a page (or a set of pages), so it
must not be rejected.  I'll explain in more detail below...

> 
> At very least this part does seems to mean the opposite of what the
> commit message says.

Let me paste the matrix to be clear on these flags:

  |------+-------------------------+------------------------------|
  |      | dontwake=0              | dontwake=1                   |
  |------+-------------------------+------------------------------|
  | wp=0 | (a) resolve pf, do wake | (b) resolve pf only, no wake |
  | wp=1 | (c) wp page range       | (d) invalid                  |
  |------+-------------------------+------------------------------|

The check at [1] above was checking against case (d) in the matrix.  It
is indeed an invalid condition, because when we want to write protect a
page we should not try to wake up any thread, so the dontwake
parameter is actually useless (we'll never wake in that case anyway).
And [2] above is simply rewriting [1] with the new variables.

> 
> >  
> >  	ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
> > -				  uffdio_wp.range.len, uffdio_wp.mode &
> > -				  UFFDIO_WRITEPROTECT_MODE_WP,
> > +				  uffdio_wp.range.len, mode_wp,
> >  				  &ctx->mmap_changing);
> >  	if (ret)
> >  		return ret;
> >  
> > -	if (!(uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE)) {
> > +	if (!mode_wp && !mode_dontwake) {
> 
> This part match the commit message :)

Here is what the patch really wants to change: before this patch we'd
even call wake_userfault() below for case (c), which doesn't really
make much sense IMHO.  After this patch the wakeup is only considered
for the !WP cases (a,b), and is actually performed only for (a), since
(b) sets DONTWAKE.

> 
> >  		range.start = uffdio_wp.range.start;
> >  		range.len = uffdio_wp.range.len;
> >  		wake_userfault(ctx, &range);

Thanks,
Mike Rapoport Feb. 25, 2019, 9:09 p.m. UTC | #3
On Tue, Feb 12, 2019 at 10:56:29AM +0800, Peter Xu wrote:
> It does not make sense to try to wake up any waiting thread when we're
> write-protecting a memory region.  Only wake up when resolving a write
> protected page fault.
> 
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  fs/userfaultfd.c | 13 ++++++++-----
>  1 file changed, 8 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 81962d62520c..f1f61a0278c2 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -1771,6 +1771,7 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
>  	struct uffdio_writeprotect uffdio_wp;
>  	struct uffdio_writeprotect __user *user_uffdio_wp;
>  	struct userfaultfd_wake_range range;
> +	bool mode_wp, mode_dontwake;
> 
>  	if (READ_ONCE(ctx->mmap_changing))
>  		return -EAGAIN;
> @@ -1789,18 +1790,20 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
>  	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
>  			       UFFDIO_WRITEPROTECT_MODE_WP))
>  		return -EINVAL;
> -	if ((uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP) &&
> -	     (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE))
> +
> +	mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
> +	mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
> +
> +	if (mode_wp && mode_dontwake)
>  		return -EINVAL;

This actually means the opposite of the commit message text ;-)

Is any dependency between _WP and _DONTWAKE needed at all?
 
>  	ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
> -				  uffdio_wp.range.len, uffdio_wp.mode &
> -				  UFFDIO_WRITEPROTECT_MODE_WP,
> +				  uffdio_wp.range.len, mode_wp,
>  				  &ctx->mmap_changing);
>  	if (ret)
>  		return ret;
> 
> -	if (!(uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE)) {
> +	if (!mode_wp && !mode_dontwake) {
>  		range.start = uffdio_wp.range.start;
>  		range.len = uffdio_wp.range.len;
>  		wake_userfault(ctx, &range);
> -- 
> 2.17.1
>
Mike Rapoport Feb. 25, 2019, 9:15 p.m. UTC | #4
On Mon, Feb 25, 2019 at 04:58:46PM +0800, Peter Xu wrote:
> On Thu, Feb 21, 2019 at 01:36:54PM -0500, Jerome Glisse wrote:
> > On Tue, Feb 12, 2019 at 10:56:29AM +0800, Peter Xu wrote:
> > > It does not make sense to try to wake up any waiting thread when we're
> > > write-protecting a memory region.  Only wake up when resolving a write
> > > protected page fault.
> > > 
> > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > 
> > I am bit confuse here, see below.
> > 
> > > ---
> > >  fs/userfaultfd.c | 13 ++++++++-----
> > >  1 file changed, 8 insertions(+), 5 deletions(-)
> > > 
> > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > > index 81962d62520c..f1f61a0278c2 100644
> > > --- a/fs/userfaultfd.c
> > > +++ b/fs/userfaultfd.c
> > > @@ -1771,6 +1771,7 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
> > >  	struct uffdio_writeprotect uffdio_wp;
> > >  	struct uffdio_writeprotect __user *user_uffdio_wp;
> > >  	struct userfaultfd_wake_range range;
> > > +	bool mode_wp, mode_dontwake;
> > >  
> > >  	if (READ_ONCE(ctx->mmap_changing))
> > >  		return -EAGAIN;
> > > @@ -1789,18 +1790,20 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
> > >  	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
> > >  			       UFFDIO_WRITEPROTECT_MODE_WP))
> > >  		return -EINVAL;
> > > -	if ((uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP) &&
> > > -	     (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE))
> 
> [1]
> 
> > > +
> > > +	mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
> > > +	mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
> > > +
> > > +	if (mode_wp && mode_dontwake)
> 
> [2]
> 
> > >  		return -EINVAL;
> > 
> > I am confuse by the logic here. DONTWAKE means do not wake any waiting
> > thread right ? So if the patch header it seems to me the logic should
> > be:
> >     if (mode_wp && !mode_dontwake)
> >         return -EINVAL;
> 
> This should be the most common case when we want to write protect a
> page (or a set of pages).  I'll explain more details below...
> 
> > 
> > At very least this part does seems to mean the opposite of what the
> > commit message says.
> 
> Let me paste the matrix to be clear on these flags:
> 
>   |------+-------------------------+------------------------------|
>   |      | dontwake=0              | dontwake=1                   |
>   |------+-------------------------+------------------------------|
>   | wp=0 | (a) resolve pf, do wake | (b) resolve pf only, no wake |
>   | wp=1 | (c) wp page range       | (d) invalid                  |
>   |------+-------------------------+------------------------------|
> 
> Above check at [1] was checking against case (d) in the matrix.  It is
> indeed an invalid condition because when we want to write protect a
> page we should not try to wake up any thread, so the donewake
> parameter is actually useless (we'll always do that).  And above [2]
> is simply rewritting [1] with the new variables.

I think (c) is "wp range and wake the thread", and (d) is "wp and DONT
wake".

 
> > 
> > >  
> > >  	ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
> > > -				  uffdio_wp.range.len, uffdio_wp.mode &
> > > -				  UFFDIO_WRITEPROTECT_MODE_WP,
> > > +				  uffdio_wp.range.len, mode_wp,
> > >  				  &ctx->mmap_changing);
> > >  	if (ret)
> > >  		return ret;
> > >  
> > > -	if (!(uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE)) {
> > > +	if (!mode_wp && !mode_dontwake) {
> > 
> > This part match the commit message :)
> 
> Here is what the patch really want to change: before this patch we'll
> even call wake_userfault() below for case (c) while it doesn't really
> make too much sense IMHO.  After this patch we'll only do the wakeup
> for (a,b).

Waking up the thread after the last region is write-protected would make
sense. Not much savings for lots of ranges, though.
 
> > 
> > >  		range.start = uffdio_wp.range.start;
> > >  		range.len = uffdio_wp.range.len;
> > >  		wake_userfault(ctx, &range);
> 
> Thanks,
> 
> -- 
> Peter Xu
>
Peter Xu Feb. 26, 2019, 6:24 a.m. UTC | #5
On Mon, Feb 25, 2019 at 11:09:35PM +0200, Mike Rapoport wrote:
> On Tue, Feb 12, 2019 at 10:56:29AM +0800, Peter Xu wrote:
> > It does not make sense to try to wake up any waiting thread when we're
> > write-protecting a memory region.  Only wake up when resolving a write
> > protected page fault.
> > 
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> >  fs/userfaultfd.c | 13 ++++++++-----
> >  1 file changed, 8 insertions(+), 5 deletions(-)
> > 
> > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > index 81962d62520c..f1f61a0278c2 100644
> > --- a/fs/userfaultfd.c
> > +++ b/fs/userfaultfd.c
> > @@ -1771,6 +1771,7 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
> >  	struct uffdio_writeprotect uffdio_wp;
> >  	struct uffdio_writeprotect __user *user_uffdio_wp;
> >  	struct userfaultfd_wake_range range;
> > +	bool mode_wp, mode_dontwake;
> > 
> >  	if (READ_ONCE(ctx->mmap_changing))
> >  		return -EAGAIN;
> > @@ -1789,18 +1790,20 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
> >  	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
> >  			       UFFDIO_WRITEPROTECT_MODE_WP))
> >  		return -EINVAL;
> > -	if ((uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP) &&
> > -	     (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE))
> > +
> > +	mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
> > +	mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
> > +
> > +	if (mode_wp && mode_dontwake)
> >  		return -EINVAL;
> 
> This actually means the opposite of the commit message text ;-)
> 
> Is any dependency of _WP and _DONTWAKE needed at all?

So this is indeed confusing at least, because both you and Jerome have
asked the same question... :)

My understanding is that we don't have any reason to wake up any
thread when we are write-protecting a range, in that sense the flag
UFFDIO_WRITEPROTECT_MODE_DONTWAKE is already meaningless in the
UFFDIO_WRITEPROTECT ioctl context.  So before everything here's how
these flags are defined:

struct uffdio_writeprotect {
	struct uffdio_range range;
	/* !WP means undo writeprotect. DONTWAKE is valid only with !WP */
#define UFFDIO_WRITEPROTECT_MODE_WP		((__u64)1<<0)
#define UFFDIO_WRITEPROTECT_MODE_DONTWAKE	((__u64)1<<1)
	__u64 mode;
};

To make it clear, we simply define it as "DONTWAKE is valid only with
!WP".  Given that, "mode_wp && mode_dontwake" is indeed a
meaningless flag combination.  Though please note that this does not
mean that the operation ("don't wake up the thread") is meaningless -
that's what we'll do no matter what when WP==1.  IMHO it's only about
the interface, not the behavior.

I don't have a good way to make this clearer, because firstly we
need the WP flag to mark whether we're protecting or unprotecting the
pages.  Then we need DONTWAKE for the page fault handling case to
mark that we don't want to wake up the waiting thread now.  So both
flags have their reason to stay so far.  With all this in mind, the
only thing I can think of is to forbid using DONTWAKE in the WP case,
and that's where the above definition comes from (I believe; it was
defined that way even before I started to work on it, and I think it
makes sense).

Thanks,
Mike Rapoport Feb. 26, 2019, 7:29 a.m. UTC | #6
On Tue, Feb 26, 2019 at 02:24:52PM +0800, Peter Xu wrote:
> On Mon, Feb 25, 2019 at 11:09:35PM +0200, Mike Rapoport wrote:
> > On Tue, Feb 12, 2019 at 10:56:29AM +0800, Peter Xu wrote:
> > > It does not make sense to try to wake up any waiting thread when we're
> > > write-protecting a memory region.  Only wake up when resolving a write
> > > protected page fault.
> > > 
> > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > ---
> > >  fs/userfaultfd.c | 13 ++++++++-----
> > >  1 file changed, 8 insertions(+), 5 deletions(-)
> > > 
> > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > > index 81962d62520c..f1f61a0278c2 100644
> > > --- a/fs/userfaultfd.c
> > > +++ b/fs/userfaultfd.c
> > > @@ -1771,6 +1771,7 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
> > >  	struct uffdio_writeprotect uffdio_wp;
> > >  	struct uffdio_writeprotect __user *user_uffdio_wp;
> > >  	struct userfaultfd_wake_range range;
> > > +	bool mode_wp, mode_dontwake;
> > > 
> > >  	if (READ_ONCE(ctx->mmap_changing))
> > >  		return -EAGAIN;
> > > @@ -1789,18 +1790,20 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
> > >  	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
> > >  			       UFFDIO_WRITEPROTECT_MODE_WP))
> > >  		return -EINVAL;
> > > -	if ((uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP) &&
> > > -	     (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE))
> > > +
> > > +	mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
> > > +	mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
> > > +
> > > +	if (mode_wp && mode_dontwake)
> > >  		return -EINVAL;
> > 
> > This actually means the opposite of the commit message text ;-)
> > 
> > Is any dependency of _WP and _DONTWAKE needed at all?
> 
> So this is indeed confusing at least, because both you and Jerome have
> asked the same question... :)
> 
> My understanding is that we don't have any reason to wake up any
> thread when we are write-protecting a range, in that sense the flag
> UFFDIO_WRITEPROTECT_MODE_DONTWAKE is already meaningless in the
> UFFDIO_WRITEPROTECT ioctl context.  So before everything here's how
> these flags are defined:
> 
> struct uffdio_writeprotect {
> 	struct uffdio_range range;
> 	/* !WP means undo writeprotect. DONTWAKE is valid only with !WP */
> #define UFFDIO_WRITEPROTECT_MODE_WP		((__u64)1<<0)
> #define UFFDIO_WRITEPROTECT_MODE_DONTWAKE	((__u64)1<<1)
> 	__u64 mode;
> };
> 
> To make it clear, we simply define it as "DONTWAKE is valid only with
> !WP".  When with that, "mode_wp && mode_dontwake" is indeed a
> meaningless flag combination.  Though please note that it does not
> mean that the operation ("don't wake up the thread") is meaningless -
> that's what we'll do no matter what when WP==1.  IMHO it's only about
> the interface not the behavior.
> 
> I don't have a good way to make this clearer because firstly we'll
> need the WP flag to mark whether we're protecting or unprotecting the
> pages.  Later on, we need DONTWAKE for page fault handling case to
> mark that we don't want to wake up the waiting thread now.  So both
> the flags have their reason to stay so far.  Then with all these in
> mind what I can think of is only to forbid using DONTWAKE in WP case,
> and that's how above definition comes (I believe, because it was
> defined that way even before I started to work on it and I think it
> makes sense).

There's no argument about how DONTWAKE can be used with !WP:
userfaultfd_writeprotect() is called by the uffd monitor in response to a
WP page fault; the monitor asks to clear write protection on some range,
but does not want to wake the faulting thread yet and will instead use
uffd_wake() later.

Still, I can't grok the usage of DONTWAKE with WP=1. In my understanding,
in this case userfaultfd_writeprotect() is called unrelated to page faults,
and the monitored thread runs freely, so why should it be woken at all?

And what happens, if the thread is waiting on a missing page fault and we
do userfaultfd_writeprotect(WP=1) at the same time?

> Thanks,
> 
> -- 
> Peter Xu
>
Peter Xu Feb. 26, 2019, 7:41 a.m. UTC | #7
On Tue, Feb 26, 2019 at 09:29:33AM +0200, Mike Rapoport wrote:
> On Tue, Feb 26, 2019 at 02:24:52PM +0800, Peter Xu wrote:
> > On Mon, Feb 25, 2019 at 11:09:35PM +0200, Mike Rapoport wrote:
> > > On Tue, Feb 12, 2019 at 10:56:29AM +0800, Peter Xu wrote:
> > > > It does not make sense to try to wake up any waiting thread when we're
> > > > write-protecting a memory region.  Only wake up when resolving a write
> > > > protected page fault.
> > > > 
> > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > > ---
> > > >  fs/userfaultfd.c | 13 ++++++++-----
> > > >  1 file changed, 8 insertions(+), 5 deletions(-)
> > > > 
> > > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > > > index 81962d62520c..f1f61a0278c2 100644
> > > > --- a/fs/userfaultfd.c
> > > > +++ b/fs/userfaultfd.c
> > > > @@ -1771,6 +1771,7 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
> > > >  	struct uffdio_writeprotect uffdio_wp;
> > > >  	struct uffdio_writeprotect __user *user_uffdio_wp;
> > > >  	struct userfaultfd_wake_range range;
> > > > +	bool mode_wp, mode_dontwake;
> > > > 
> > > >  	if (READ_ONCE(ctx->mmap_changing))
> > > >  		return -EAGAIN;
> > > > @@ -1789,18 +1790,20 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
> > > >  	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
> > > >  			       UFFDIO_WRITEPROTECT_MODE_WP))
> > > >  		return -EINVAL;
> > > > -	if ((uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP) &&
> > > > -	     (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE))
> > > > +
> > > > +	mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
> > > > +	mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
> > > > +
> > > > +	if (mode_wp && mode_dontwake)
> > > >  		return -EINVAL;
> > > 
> > > This actually means the opposite of the commit message text ;-)
> > > 
> > > Is any dependency of _WP and _DONTWAKE needed at all?
> > 
> > So this is indeed confusing at least, because both you and Jerome have
> > asked the same question... :)
> > 
> > My understanding is that we don't have any reason to wake up any
> > thread when we are write-protecting a range, in that sense the flag
> > UFFDIO_WRITEPROTECT_MODE_DONTWAKE is already meaningless in the
> > UFFDIO_WRITEPROTECT ioctl context.  So before everything here's how
> > these flags are defined:
> > 
> > struct uffdio_writeprotect {
> > 	struct uffdio_range range;
> > 	/* !WP means undo writeprotect. DONTWAKE is valid only with !WP */
> > #define UFFDIO_WRITEPROTECT_MODE_WP		((__u64)1<<0)
> > #define UFFDIO_WRITEPROTECT_MODE_DONTWAKE	((__u64)1<<1)
> > 	__u64 mode;
> > };
> > 
> > To make it clear, we simply define it as "DONTWAKE is valid only with
> > !WP".  When with that, "mode_wp && mode_dontwake" is indeed a
> > meaningless flag combination.  Though please note that it does not
> > mean that the operation ("don't wake up the thread") is meaningless -
> > that's what we'll do no matter what when WP==1.  IMHO it's only about
> > the interface not the behavior.
> > 
> > I don't have a good way to make this clearer because firstly we'll
> > need the WP flag to mark whether we're protecting or unprotecting the
> > pages.  Later on, we need DONTWAKE for page fault handling case to
> > mark that we don't want to wake up the waiting thread now.  So both
> > the flags have their reason to stay so far.  Then with all these in
> > mind what I can think of is only to forbid using DONTWAKE in WP case,
> > and that's how above definition comes (I believe, because it was
> > defined that way even before I started to work on it and I think it
> > makes sense).
> 
> There's no argument how DONTWAKE can be used with !WP. The
> userfaultfd_writeprotect() is called in response of the uffd monitor to WP
> page fault, it asks to clear write protection to some range, but it does
> not want to wake the faulting thread yet but rather it will use uffd_wake()
> later.
> 
> Still, I can't grok the usage of DONTWAKE with WP=1. In my understanding,
> in this case userfaultfd_writeprotect() is called unrelated to page faults,
> and the monitored thread runs freely, so why it should be waked at all?

This is exactly how I understand it.  And that's why I wrote this
patch to remove the extra wakeup(), since I think it's unnecessary.

> 
> And what happens, if the thread is waiting on a missing page fault and we
> do userfaultfd_writeprotect(WP=1) at the same time?

Then IMHO the userfaultfd_writeprotect() will be a noop simply because
the page is still missing.  With the old code (before this patch) we'd
probably even try to wake up this thread, but the thread would just
fault again on the same address because the page is still missing.
After this patch the monitored thread should continue to wait on the
missing page.

Thanks,
Mike Rapoport Feb. 26, 2019, 8 a.m. UTC | #8
On Tue, Feb 26, 2019 at 03:41:17PM +0800, Peter Xu wrote:
> On Tue, Feb 26, 2019 at 09:29:33AM +0200, Mike Rapoport wrote:
> > On Tue, Feb 26, 2019 at 02:24:52PM +0800, Peter Xu wrote:
> > > On Mon, Feb 25, 2019 at 11:09:35PM +0200, Mike Rapoport wrote:
> > > > On Tue, Feb 12, 2019 at 10:56:29AM +0800, Peter Xu wrote:
> > > > > It does not make sense to try to wake up any waiting thread when we're
> > > > > write-protecting a memory region.  Only wake up when resolving a write
> > > > > protected page fault.
> > > > > 
> > > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > > > ---
> > > > >  fs/userfaultfd.c | 13 ++++++++-----
> > > > >  1 file changed, 8 insertions(+), 5 deletions(-)
> > > > > 
> > > > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > > > > index 81962d62520c..f1f61a0278c2 100644
> > > > > --- a/fs/userfaultfd.c
> > > > > +++ b/fs/userfaultfd.c
> > > > > @@ -1771,6 +1771,7 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
> > > > >  	struct uffdio_writeprotect uffdio_wp;
> > > > >  	struct uffdio_writeprotect __user *user_uffdio_wp;
> > > > >  	struct userfaultfd_wake_range range;
> > > > > +	bool mode_wp, mode_dontwake;
> > > > > 
> > > > >  	if (READ_ONCE(ctx->mmap_changing))
> > > > >  		return -EAGAIN;
> > > > > @@ -1789,18 +1790,20 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
> > > > >  	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
> > > > >  			       UFFDIO_WRITEPROTECT_MODE_WP))
> > > > >  		return -EINVAL;
> > > > > -	if ((uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP) &&
> > > > > -	     (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE))
> > > > > +
> > > > > +	mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
> > > > > +	mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
> > > > > +
> > > > > +	if (mode_wp && mode_dontwake)
> > > > >  		return -EINVAL;
> > > > 
> > > > This actually means the opposite of the commit message text ;-)
> > > > 
> > > > Is any dependency of _WP and _DONTWAKE needed at all?
> > > 
> > > So this is indeed confusing at least, because both you and Jerome have
> > > asked the same question... :)
> > > 
> > > My understanding is that we don't have any reason to wake up any
> > > thread when we are write-protecting a range, in that sense the flag
> > > UFFDIO_WRITEPROTECT_MODE_DONTWAKE is already meaningless in the
> > > UFFDIO_WRITEPROTECT ioctl context.  So before everything here's how
> > > these flags are defined:
> > > 
> > > struct uffdio_writeprotect {
> > > 	struct uffdio_range range;
> > > 	/* !WP means undo writeprotect. DONTWAKE is valid only with !WP */
> > > #define UFFDIO_WRITEPROTECT_MODE_WP		((__u64)1<<0)
> > > #define UFFDIO_WRITEPROTECT_MODE_DONTWAKE	((__u64)1<<1)
> > > 	__u64 mode;
> > > };
> > > 
> > > To make it clear, we simply define it as "DONTWAKE is valid only with
> > > !WP".  When with that, "mode_wp && mode_dontwake" is indeed a
> > > meaningless flag combination.  Though please note that it does not
> > > mean that the operation ("don't wake up the thread") is meaningless -
> > > that's what we'll do no matter what when WP==1.  IMHO it's only about
> > > the interface not the behavior.
> > > 
> > > I don't have a good way to make this clearer because firstly we'll
> > > need the WP flag to mark whether we're protecting or unprotecting the
> > > pages.  Later on, we need DONTWAKE for page fault handling case to
> > > mark that we don't want to wake up the waiting thread now.  So both
> > > the flags have their reason to stay so far.  Then with all these in
> > > mind what I can think of is only to forbid using DONTWAKE in WP case,
> > > and that's how above definition comes (I believe, because it was
> > > defined that way even before I started to work on it and I think it
> > > makes sense).
> > 
> > There's no argument how DONTWAKE can be used with !WP. The
> > userfaultfd_writeprotect() is called in response of the uffd monitor to WP
> > page fault, it asks to clear write protection to some range, but it does
> > not want to wake the faulting thread yet but rather it will use uffd_wake()
> > later.
> > 
> > Still, I can't grok the usage of DONTWAKE with WP=1. In my understanding,
> > in this case userfaultfd_writeprotect() is called unrelated to page faults,
> > and the monitored thread runs freely, so why it should be waked at all?
> 
> Exactly this is how I understand it.  And that's why I wrote this
> patch to remove the extra wakeup() since I think it's unecessary.
> 
> > 
> > And what happens, if the thread is waiting on a missing page fault and we
> > do userfaultfd_writeprotect(WP=1) at the same time?
> 
> Then IMHO the userfaultfd_writeprotect() will be a noop simply because
> the page is still missing.  Here if with the old code (before this
> patch) we'll probably even try to wake up this thread but this thread
> should just fault again on the same address due to the fact that the
> page is missing.  After this patch the monitored thread should
> continue to wait on the missing page.

So, my understanding of what we have is:

userfaultfd_writeprotect() can be used either to mark a region as write
protected or to resolve a WP page fault.
In the first case DONTWAKE does not make sense and we forbid setting it
with WP=1.
In the second case it's the uffd monitor's decision whether to wake up the
faulting thread immediately after the #PF is resolved or later, so with
WP=0 we allow DONTWAKE.

I suggest to extend the comment in the definition of 
'struct uffdio_writeprotect' to something like

/*
 * Write protecting a region (WP=1) is unrelated to page faults, therefore
 * DONTWAKE flag is meaningless with WP=1.
 * Removing write protection (WP=0) in response to a page fault wakes the
 * faulting task unless DONTWAKE is set.
 */
 
And a documentation update along these lines would be appreciated :)

> Thanks,
> 
> -- 
> Peter Xu
>
Mike Rapoport Feb. 26, 2019, 8 a.m. UTC | #9
On Tue, Feb 12, 2019 at 10:56:29AM +0800, Peter Xu wrote:
> It does not make sense to try to wake up any waiting thread when we're
> write-protecting a memory region.  Only wake up when resolving a write
> protected page fault.
> 
> Signed-off-by: Peter Xu <peterx@redhat.com>

Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>

> ---
>  fs/userfaultfd.c | 13 ++++++++-----
>  1 file changed, 8 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 81962d62520c..f1f61a0278c2 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -1771,6 +1771,7 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
>  	struct uffdio_writeprotect uffdio_wp;
>  	struct uffdio_writeprotect __user *user_uffdio_wp;
>  	struct userfaultfd_wake_range range;
> +	bool mode_wp, mode_dontwake;
> 
>  	if (READ_ONCE(ctx->mmap_changing))
>  		return -EAGAIN;
> @@ -1789,18 +1790,20 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
>  	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
>  			       UFFDIO_WRITEPROTECT_MODE_WP))
>  		return -EINVAL;
> -	if ((uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP) &&
> -	     (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE))
> +
> +	mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
> +	mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
> +
> +	if (mode_wp && mode_dontwake)
>  		return -EINVAL;
> 
>  	ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
> -				  uffdio_wp.range.len, uffdio_wp.mode &
> -				  UFFDIO_WRITEPROTECT_MODE_WP,
> +				  uffdio_wp.range.len, mode_wp,
>  				  &ctx->mmap_changing);
>  	if (ret)
>  		return ret;
> 
> -	if (!(uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE)) {
> +	if (!mode_wp && !mode_dontwake) {
>  		range.start = uffdio_wp.range.start;
>  		range.len = uffdio_wp.range.len;
>  		wake_userfault(ctx, &range);
> -- 
> 2.17.1
>
Peter Xu Feb. 28, 2019, 2:47 a.m. UTC | #10
On Tue, Feb 26, 2019 at 10:00:29AM +0200, Mike Rapoport wrote:
> On Tue, Feb 26, 2019 at 03:41:17PM +0800, Peter Xu wrote:
> > On Tue, Feb 26, 2019 at 09:29:33AM +0200, Mike Rapoport wrote:
> > > On Tue, Feb 26, 2019 at 02:24:52PM +0800, Peter Xu wrote:
> > > > On Mon, Feb 25, 2019 at 11:09:35PM +0200, Mike Rapoport wrote:
> > > > > On Tue, Feb 12, 2019 at 10:56:29AM +0800, Peter Xu wrote:
> > > > > > It does not make sense to try to wake up any waiting thread when we're
> > > > > > write-protecting a memory region.  Only wake up when resolving a write
> > > > > > protected page fault.
> > > > > > 
> > > > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > > > > ---
> > > > > >  fs/userfaultfd.c | 13 ++++++++-----
> > > > > >  1 file changed, 8 insertions(+), 5 deletions(-)
> > > > > > 
> > > > > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > > > > > index 81962d62520c..f1f61a0278c2 100644
> > > > > > --- a/fs/userfaultfd.c
> > > > > > +++ b/fs/userfaultfd.c
> > > > > > @@ -1771,6 +1771,7 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
> > > > > >  	struct uffdio_writeprotect uffdio_wp;
> > > > > >  	struct uffdio_writeprotect __user *user_uffdio_wp;
> > > > > >  	struct userfaultfd_wake_range range;
> > > > > > +	bool mode_wp, mode_dontwake;
> > > > > > 
> > > > > >  	if (READ_ONCE(ctx->mmap_changing))
> > > > > >  		return -EAGAIN;
> > > > > > @@ -1789,18 +1790,20 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
> > > > > >  	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
> > > > > >  			       UFFDIO_WRITEPROTECT_MODE_WP))
> > > > > >  		return -EINVAL;
> > > > > > -	if ((uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP) &&
> > > > > > -	     (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE))
> > > > > > +
> > > > > > +	mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
> > > > > > +	mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
> > > > > > +
> > > > > > +	if (mode_wp && mode_dontwake)
> > > > > >  		return -EINVAL;
> > > > > 
> > > > > This actually means the opposite of the commit message text ;-)
> > > > > 
> > > > > Is any dependency of _WP and _DONTWAKE needed at all?
> > > > 
> > > > So this is indeed confusing at least, because both you and Jerome have
> > > > asked the same question... :)
> > > > 
> > > > My understanding is that we don't have any reason to wake up any
> > > > thread when we are write-protecting a range, in that sense the flag
> > > > UFFDIO_WRITEPROTECT_MODE_DONTWAKE is already meaningless in the
> > > > UFFDIO_WRITEPROTECT ioctl context.  So before everything here's how
> > > > these flags are defined:
> > > > 
> > > > struct uffdio_writeprotect {
> > > > 	struct uffdio_range range;
> > > > 	/* !WP means undo writeprotect. DONTWAKE is valid only with !WP */
> > > > #define UFFDIO_WRITEPROTECT_MODE_WP		((__u64)1<<0)
> > > > #define UFFDIO_WRITEPROTECT_MODE_DONTWAKE	((__u64)1<<1)
> > > > 	__u64 mode;
> > > > };
> > > > 
> > > > To make it clear, we simply define it as "DONTWAKE is valid only with
> > > > !WP".  When with that, "mode_wp && mode_dontwake" is indeed a
> > > > meaningless flag combination.  Though please note that it does not
> > > > mean that the operation ("don't wake up the thread") is meaningless -
> > > > that's what we'll do no matter what when WP==1.  IMHO it's only about
> > > > the interface not the behavior.
> > > > 
> > > > I don't have a good way to make this clearer because firstly we'll
> > > > need the WP flag to mark whether we're protecting or unprotecting the
> > > > pages.  Later on, we need DONTWAKE for page fault handling case to
> > > > mark that we don't want to wake up the waiting thread now.  So both
> > > > the flags have their reason to stay so far.  Then with all these in
> > > > mind what I can think of is only to forbid using DONTWAKE in WP case,
> > > > and that's how above definition comes (I believe, because it was
> > > > defined that way even before I started to work on it and I think it
> > > > makes sense).
> > > 
> > > There's no argument how DONTWAKE can be used with !WP. The
> > > userfaultfd_writeprotect() is called in response of the uffd monitor to WP
> > > page fault, it asks to clear write protection to some range, but it does
> > > not want to wake the faulting thread yet but rather it will use uffd_wake()
> > > later.
> > > 
> > > Still, I can't grok the usage of DONTWAKE with WP=1. In my understanding,
> > > in this case userfaultfd_writeprotect() is called unrelated to page faults,
> > > and the monitored thread runs freely, so why it should be waked at all?
> > 
> > > Exactly this is how I understand it.  And that's why I wrote this
> > > patch to remove the extra wakeup() since I think it's unnecessary.
> > 
> > > 
> > > And what happens, if the thread is waiting on a missing page fault and we
> > > do userfaultfd_writeprotect(WP=1) at the same time?
> > 
> > Then IMHO the userfaultfd_writeprotect() will be a noop simply because
> > the page is still missing.  Here if with the old code (before this
> > patch) we'll probably even try to wake up this thread but this thread
> > should just fault again on the same address due to the fact that the
> > page is missing.  After this patch the monitored thread should
> > continue to wait on the missing page.
> 
> So, my understanding of what we have is:
> 
> userfaultfd_writeprotect() can be used either to mark a region as write
> protected or to resolve WP page fault.
> In the first case DONTWAKE does not make sense and we forbid setting it
> with WP=1.
> In the second case it's the uffd monitor decision whether to wake up the
> faulting thread immediately after #PF is resolved or later, so with WP=0 we
> allow DONTWAKE.

Yes exactly.

> 
> I suggest to extend the comment in the definition of 
> 'struct uffdio_writeprotect' to something like
> 
> /*
>  * Write protecting a region (WP=1) is unrelated to page faults, therefore
>  * DONTWAKE flag is meaningless with WP=1.
>  * Removing write protection (WP=0) in response to a page fault wakes the
>  * faulting task unless DONTWAKE is set.
>  */
>  
> And a documentation update along these lines would be appreciated :)

Thanks for the write-up!  I'm stealing the whole paragraph for the
patch where uffdio_writeprotect is introduced.

Regards,
diff mbox series

Patch

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 81962d62520c..f1f61a0278c2 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1771,6 +1771,7 @@  static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
 	struct uffdio_writeprotect uffdio_wp;
 	struct uffdio_writeprotect __user *user_uffdio_wp;
 	struct userfaultfd_wake_range range;
+	bool mode_wp, mode_dontwake;
 
 	if (READ_ONCE(ctx->mmap_changing))
 		return -EAGAIN;
@@ -1789,18 +1790,20 @@  static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
 	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
 			       UFFDIO_WRITEPROTECT_MODE_WP))
 		return -EINVAL;
-	if ((uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP) &&
-	     (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE))
+
+	mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
+	mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
+
+	if (mode_wp && mode_dontwake)
 		return -EINVAL;
 
 	ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
-				  uffdio_wp.range.len, uffdio_wp.mode &
-				  UFFDIO_WRITEPROTECT_MODE_WP,
+				  uffdio_wp.range.len, mode_wp,
 				  &ctx->mmap_changing);
 	if (ret)
 		return ret;
 
-	if (!(uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE)) {
+	if (!mode_wp && !mode_dontwake) {
 		range.start = uffdio_wp.range.start;
 		range.len = uffdio_wp.range.len;
 		wake_userfault(ctx, &range);