diff mbox series

[v2] epoll: add nsec timeout support

Message ID 20201116161001.1606608-1-willemdebruijn.kernel@gmail.com (mailing list archive)
State New, archived
Headers show
Series [v2] epoll: add nsec timeout support | expand

Commit Message

Willem de Bruijn Nov. 16, 2020, 4:10 p.m. UTC
From: Willem de Bruijn <willemb@google.com>

Add epoll_create1 flag EPOLL_NSTIMEO. When passed, this changes the
interpretation of argument timeout in epoll_wait from msec to nsec.

Use cases such as datacenter networking operate on timescales well
below milliseconds. Shorter timeouts bounds their tail latency.
The underlying hrtimer is already programmed with nsec resolution.

Changes (v2):
  - cast to s64: avoid overflow on 32-bit platforms (Shuo Chen)
  - minor commit message rewording

Signed-off-by: Willem de Bruijn <willemb@google.com>

---

Applies cleanly both to 5.10-rc4 and next-20201116.
In next, nstimeout no longer fills padding with new field refs.

Selftest for now at github. Can follow-up for kselftests.
https://github.com/wdebruij/kerneltools/blob/master/tests/epoll_nstimeo.c
---
 fs/eventpoll.c                 | 26 +++++++++++++++++++-------
 include/uapi/linux/eventpoll.h |  1 +
 2 files changed, 20 insertions(+), 7 deletions(-)

Comments

Soheil Hassas Yeganeh Nov. 16, 2020, 4:12 p.m. UTC | #1
On Mon, Nov 16, 2020 at 11:10 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> From: Willem de Bruijn <willemb@google.com>
>
> Add epoll_create1 flag EPOLL_NSTIMEO. When passed, this changes the
> interpretation of argument timeout in epoll_wait from msec to nsec.
>
> Use cases such as datacenter networking operate on timescales well
> below milliseconds. Shorter timeouts bounds their tail latency.
> The underlying hrtimer is already programmed with nsec resolution.
>
> Changes (v2):
>   - cast to s64: avoid overflow on 32-bit platforms (Shuo Chen)
>   - minor commit message rewording
>
> Signed-off-by: Willem de Bruijn <willemb@google.com>

Acked-by: Soheil Hassas Yeganeh <soheil@google.com>

> ---
>
> Applies cleanly both to 5.10-rc4 and next-20201116.
> In next, nstimeout no longer fills padding with new field refs.
>
> Selftest for now at github. Can follow-up for kselftests.
> https://github.com/wdebruij/kerneltools/blob/master/tests/epoll_nstimeo.c
> ---
>  fs/eventpoll.c                 | 26 +++++++++++++++++++-------
>  include/uapi/linux/eventpoll.h |  1 +
>  2 files changed, 20 insertions(+), 7 deletions(-)
>
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index 4df61129566d..817d9cc5b8b8 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -225,6 +225,9 @@ struct eventpoll {
>         unsigned int napi_id;
>  #endif
>
> +       /* Accept timeout in ns resolution (EPOLL_NSTIMEO) */
> +       unsigned int nstimeout:1;
> +
>  #ifdef CONFIG_DEBUG_LOCK_ALLOC
>         /* tracks wakeup nests for lockdep validation */
>         u8 nests;
> @@ -1787,17 +1790,20 @@ static int ep_send_events(struct eventpoll *ep,
>         return esed.res;
>  }
>
> -static inline struct timespec64 ep_set_mstimeout(long ms)
> +static inline struct timespec64 ep_set_nstimeout(s64 ns)
>  {
> -       struct timespec64 now, ts = {
> -               .tv_sec = ms / MSEC_PER_SEC,
> -               .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
> -       };
> +       struct timespec64 now, ts;
>
> +       ts = ns_to_timespec64(ns);
>         ktime_get_ts64(&now);
>         return timespec64_add_safe(now, ts);
>  }
>
> +static inline struct timespec64 ep_set_mstimeout(long ms)
> +{
> +       return ep_set_nstimeout(ms * (s64)NSEC_PER_MSEC);
> +}
> +
>  /**
>   * ep_poll - Retrieves ready events, and delivers them to the caller supplied
>   *           event buffer.
> @@ -1826,7 +1832,10 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
>         lockdep_assert_irqs_enabled();
>
>         if (timeout > 0) {
> -               struct timespec64 end_time = ep_set_mstimeout(timeout);
> +               struct timespec64 end_time;
> +
> +               end_time = ep->nstimeout ? ep_set_nstimeout(timeout) :
> +                                          ep_set_mstimeout(timeout);
>
>                 slack = select_estimate_accuracy(&end_time);
>                 to = &expires;
> @@ -2046,7 +2055,7 @@ static int do_epoll_create(int flags)
>         /* Check the EPOLL_* constant for consistency.  */
>         BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
>
> -       if (flags & ~EPOLL_CLOEXEC)
> +       if (flags & ~(EPOLL_CLOEXEC | EPOLL_NSTIMEO))
>                 return -EINVAL;
>         /*
>          * Create the internal data structure ("struct eventpoll").
> @@ -2054,6 +2063,9 @@ static int do_epoll_create(int flags)
>         error = ep_alloc(&ep);
>         if (error < 0)
>                 return error;
> +
> +       ep->nstimeout = !!(flags & EPOLL_NSTIMEO);
> +
>         /*
>          * Creates all the items needed to setup an eventpoll file. That is,
>          * a file structure and a free file descriptor.
> diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
> index 8a3432d0f0dc..f6ef9c9f8ac2 100644
> --- a/include/uapi/linux/eventpoll.h
> +++ b/include/uapi/linux/eventpoll.h
> @@ -21,6 +21,7 @@
>
>  /* Flags for epoll_create1.  */
>  #define EPOLL_CLOEXEC O_CLOEXEC
> +#define EPOLL_NSTIMEO 0x1
>
>  /* Valid opcodes to issue to sys_epoll_ctl() */
>  #define EPOLL_CTL_ADD 1
> --
> 2.29.2.299.gdc1121823c-goog
>
Matthew Wilcox Nov. 16, 2020, 4:19 p.m. UTC | #2
On Mon, Nov 16, 2020 at 11:10:01AM -0500, Willem de Bruijn wrote:
> diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
> index 8a3432d0f0dc..f6ef9c9f8ac2 100644
> --- a/include/uapi/linux/eventpoll.h
> +++ b/include/uapi/linux/eventpoll.h
> @@ -21,6 +21,7 @@
>  
>  /* Flags for epoll_create1.  */
>  #define EPOLL_CLOEXEC O_CLOEXEC
> +#define EPOLL_NSTIMEO 0x1
>  
>  /* Valid opcodes to issue to sys_epoll_ctl() */
>  #define EPOLL_CTL_ADD 1

Not a problem with your patch, but this concerns me.  O_CLOEXEC is
defined differently for each architecture, so we need to stay out of
several different bits when we define new flags for EPOLL_*.  Maybe
this:

/*
 * Flags for epoll_create1.  O_CLOEXEC may be different bits, depending
 * on the CPU architecture.  Reserve the known ones.
 */
#define EPOLL_CLOEXEC		O_CLOEXEC
#define EPOLL_RESERVED_FLAGS	0x00680000
#define EPOLL_NSTIMEO		0x00000001
Willem de Bruijn Nov. 16, 2020, 5 p.m. UTC | #3
On Mon, Nov 16, 2020 at 11:19 AM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Mon, Nov 16, 2020 at 11:10:01AM -0500, Willem de Bruijn wrote:
> > diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
> > index 8a3432d0f0dc..f6ef9c9f8ac2 100644
> > --- a/include/uapi/linux/eventpoll.h
> > +++ b/include/uapi/linux/eventpoll.h
> > @@ -21,6 +21,7 @@
> >
> >  /* Flags for epoll_create1.  */
> >  #define EPOLL_CLOEXEC O_CLOEXEC
> > +#define EPOLL_NSTIMEO 0x1
> >
> >  /* Valid opcodes to issue to sys_epoll_ctl() */
> >  #define EPOLL_CTL_ADD 1
>
> Not a problem with your patch, but this concerns me.  O_CLOEXEC is
> defined differently for each architecture, so we need to stay out of
> several different bits when we define new flags for EPOLL_*.  Maybe
> this:
>
> /*
>  * Flags for epoll_create1.  O_CLOEXEC may be different bits, depending
>  * on the CPU architecture.  Reserve the known ones.
>  */
> #define EPOLL_CLOEXEC           O_CLOEXEC
> #define EPOLL_RESERVED_FLAGS    0x00680000
> #define EPOLL_NSTIMEO           0x00000001

Thanks. Good point, I'll add that in v3.
David Laight Nov. 16, 2020, 5:11 p.m. UTC | #4
From: Willem de Bruijn
> Sent: 16 November 2020 17:01
> 
> On Mon, Nov 16, 2020 at 11:19 AM Matthew Wilcox <willy@infradead.org> wrote:
> >
> > On Mon, Nov 16, 2020 at 11:10:01AM -0500, Willem de Bruijn wrote:
> > > diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
> > > index 8a3432d0f0dc..f6ef9c9f8ac2 100644
> > > --- a/include/uapi/linux/eventpoll.h
> > > +++ b/include/uapi/linux/eventpoll.h
> > > @@ -21,6 +21,7 @@
> > >
> > >  /* Flags for epoll_create1.  */
> > >  #define EPOLL_CLOEXEC O_CLOEXEC
> > > +#define EPOLL_NSTIMEO 0x1
> > >
> > >  /* Valid opcodes to issue to sys_epoll_ctl() */
> > >  #define EPOLL_CTL_ADD 1
> >
> > Not a problem with your patch, but this concerns me.  O_CLOEXEC is
> > defined differently for each architecture, so we need to stay out of
> > several different bits when we define new flags for EPOLL_*.  Maybe
> > this:
> >
> > /*
> >  * Flags for epoll_create1.  O_CLOEXEC may be different bits, depending
> >  * on the CPU architecture.  Reserve the known ones.
> >  */
> > #define EPOLL_CLOEXEC           O_CLOEXEC
> > #define EPOLL_RESERVED_FLAGS    0x00680000
> > #define EPOLL_NSTIMEO           0x00000001
> 
> Thanks. Good point, I'll add that in v3.

You could also add a compile assert to check that the flag is reserved.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Willem de Bruijn Nov. 16, 2020, 7:54 p.m. UTC | #5
On Mon, Nov 16, 2020 at 12:11 PM David Laight <David.Laight@aculab.com> wrote:
>
> From: Willem de Bruijn
> > Sent: 16 November 2020 17:01
> >
> > On Mon, Nov 16, 2020 at 11:19 AM Matthew Wilcox <willy@infradead.org> wrote:
> > >
> > > On Mon, Nov 16, 2020 at 11:10:01AM -0500, Willem de Bruijn wrote:
> > > > diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
> > > > index 8a3432d0f0dc..f6ef9c9f8ac2 100644
> > > > --- a/include/uapi/linux/eventpoll.h
> > > > +++ b/include/uapi/linux/eventpoll.h
> > > > @@ -21,6 +21,7 @@
> > > >
> > > >  /* Flags for epoll_create1.  */
> > > >  #define EPOLL_CLOEXEC O_CLOEXEC
> > > > +#define EPOLL_NSTIMEO 0x1
> > > >
> > > >  /* Valid opcodes to issue to sys_epoll_ctl() */
> > > >  #define EPOLL_CTL_ADD 1
> > >
> > > Not a problem with your patch, but this concerns me.  O_CLOEXEC is
> > > defined differently for each architecture, so we need to stay out of
> > > several different bits when we define new flags for EPOLL_*.  Maybe
> > > this:
> > >
> > > /*
> > >  * Flags for epoll_create1.  O_CLOEXEC may be different bits, depending
> > >  * on the CPU architecture.  Reserve the known ones.
> > >  */
> > > #define EPOLL_CLOEXEC           O_CLOEXEC
> > > #define EPOLL_RESERVED_FLAGS    0x00680000
> > > #define EPOLL_NSTIMEO           0x00000001
> >
> > Thanks. Good point, I'll add that in v3.
>
> You could also add a compile assert to check that the flag is reserved.

Like this?

        /* Check the EPOLL_* constant for consistency.  */
        BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
+        BUILD_BUG_ON(EPOLL_NSTIMEO & EPOLL_RESERVED_FLAGS);
Matthew Wilcox Nov. 16, 2020, 8 p.m. UTC | #6
On Mon, Nov 16, 2020 at 02:54:17PM -0500, Willem de Bruijn wrote:
> > You could also add a compile assert to check that the flag is reserved.
> 
> Like this?
> 
>         /* Check the EPOLL_* constant for consistency.  */
>         BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
> +        BUILD_BUG_ON(EPOLL_NSTIMEO & EPOLL_RESERVED_FLAGS);

i think you got the sense of that test wrong.  but yes.
Andrew Morton Nov. 16, 2020, 8:04 p.m. UTC | #7
On Mon, 16 Nov 2020 11:10:01 -0500 Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote:

> From: Willem de Bruijn <willemb@google.com>
> 
> Add epoll_create1 flag EPOLL_NSTIMEO. When passed, this changes the
> interpretation of argument timeout in epoll_wait from msec to nsec.
> 
> Use cases such as datacenter networking operate on timescales well
> below milliseconds. Shorter timeouts bounds their tail latency.
> The underlying hrtimer is already programmed with nsec resolution.

hm, maybe.  It's not very nice to be using one syscall to alter the
interpretation of another syscall's argument in this fashion.  For
example, one wonders how strace(1) is to properly interpret & display
this argument?

Did you consider adding epoll_wait2()/epoll_pwait2() syscalls which
take a nsec timeout?  Seems simpler.

Either way, we'd be interested in seeing the proposed manpage updates
alongside this change.

> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -225,6 +225,9 @@ struct eventpoll {
>  	unsigned int napi_id;
>  #endif
>  
> +	/* Accept timeout in ns resolution (EPOLL_NSTIMEO) */
> +	unsigned int nstimeout:1;
> +


Why a bitfield?  This invites other developers to add new bitfields to
the same word.  And if that happens, we'll need to consider the locking
rules for that word - I don't think the compiler provides any
protection against concurrent modifications to the bitfields which
share a machine word.  If we add a rule

/*
 * per eventpoll flags.  Initialized at creation time, never changes
 * thereafter
 */

then that would cover it.  Or just make the thing a `bool'?
Willem de Bruijn Nov. 16, 2020, 11:36 p.m. UTC | #8
On Mon, Nov 16, 2020 at 3:04 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Mon, 16 Nov 2020 11:10:01 -0500 Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote:
>
> > From: Willem de Bruijn <willemb@google.com>
> >
> > Add epoll_create1 flag EPOLL_NSTIMEO. When passed, this changes the
> > interpretation of argument timeout in epoll_wait from msec to nsec.
> >
> > Use cases such as datacenter networking operate on timescales well
> > below milliseconds. Shorter timeouts bounds their tail latency.
> > The underlying hrtimer is already programmed with nsec resolution.
>
> hm, maybe.  It's not very nice to be using one syscall to alter the
> interpretation of another syscall's argument in this fashion.  For
> example, one wonders how strace(1) is to properly interpret & display
> this argument?
>
> Did you consider adding epoll_wait2()/epoll_pwait2() syscalls which
> take a nsec timeout?  Seems simpler.

I took a first stab. The patch does become quite a bit more complex.

I was not aware of how uncommon syscall argument interpretation
contingent on internal object state really is. Yes, that can
complicate inspection with strace, seccomp, ... This particular case
seems benign to me. But perhaps it sets a precedent.

A new nsec resolution epoll syscall would be analogous to pselect and
ppoll, both of which switched to nsec resolution timespec.

Since creating new syscalls is rare, add a flags argument at the same time?

Then I would split the change in two: (1) add the new syscall with
extra flags argument, (2) define flag EPOLL_WAIT_NSTIMEO to explicitly
change the time scale of the timeout argument. To avoid easy mistakes
by callers in absence of stronger typing.

epoll_wait is missing from include/uapi/asm-generic/unistd.h as it is
superseded by epoll_pwait. Following the same rationale, add
epoll_pwait2 (only).

> Either way, we'd be interested in seeing the proposed manpage updates
> alongside this change.

Will do. What is the best way? A separate RFC patch against
manpages/master sent at the same time?
Willem de Bruijn Nov. 16, 2020, 11:51 p.m. UTC | #9
On Mon, Nov 16, 2020 at 6:36 PM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> On Mon, Nov 16, 2020 at 3:04 PM Andrew Morton <akpm@linux-foundation.org> wrote:
> >
> > On Mon, 16 Nov 2020 11:10:01 -0500 Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote:
> >
> > > From: Willem de Bruijn <willemb@google.com>
> > >
> > > Add epoll_create1 flag EPOLL_NSTIMEO. When passed, this changes the
> > > interpretation of argument timeout in epoll_wait from msec to nsec.
> > >
> > > Use cases such as datacenter networking operate on timescales well
> > > below milliseconds. Shorter timeouts bounds their tail latency.
> > > The underlying hrtimer is already programmed with nsec resolution.
> >
> > hm, maybe.  It's not very nice to be using one syscall to alter the
> > interpretation of another syscall's argument in this fashion.  For
> > example, one wonders how strace(1) is to properly interpret & display
> > this argument?
> >
> > Did you consider adding epoll_wait2()/epoll_pwait2() syscalls which
> > take a nsec timeout?  Seems simpler.
>
> I took a first stab. The patch does become quite a bit more complex.

Not complex in terms of timeout logic. Just a bigger patch, taking as
example the recent commit ecb8ac8b1f14 that added process_madvise.

> I was not aware of how uncommon syscall argument interpretation
> contingent on internal object state really is. Yes, that can
> complicate inspection with strace, seccomp, ... This particular case
> seems benign to me. But perhaps it sets a precedent.
>
> A new nsec resolution epoll syscall would be analogous to pselect and
> ppoll, both of which switched to nsec resolution timespec.
>
> Since creating new syscalls is rare, add a flags argument at the same time?
>
> Then I would split the change in two: (1) add the new syscall with
> extra flags argument, (2) define flag EPOLL_WAIT_NSTIMEO to explicitly
> change the time scale of the timeout argument. To avoid easy mistakes
> by callers in absence of stronger typing.

Come to think of it, better to convert to timespec to both have actual
typing and consistency with ppoll/pselect.

> epoll_wait is missing from include/uapi/asm-generic/unistd.h as it is
> superseded by epoll_pwait. Following the same rationale, add
> epoll_pwait2 (only).
Andrew Morton Nov. 17, 2020, 12:37 a.m. UTC | #10
On Mon, 16 Nov 2020 18:51:16 -0500 Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote:

> On Mon, Nov 16, 2020 at 6:36 PM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
> >
> > On Mon, Nov 16, 2020 at 3:04 PM Andrew Morton <akpm@linux-foundation.org> wrote:
> > >
> > > On Mon, 16 Nov 2020 11:10:01 -0500 Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote:
> > >
> > > > From: Willem de Bruijn <willemb@google.com>
> > > >
> > > > Add epoll_create1 flag EPOLL_NSTIMEO. When passed, this changes the
> > > > interpretation of argument timeout in epoll_wait from msec to nsec.
> > > >
> > > > Use cases such as datacenter networking operate on timescales well
> > > > below milliseconds. Shorter timeouts bounds their tail latency.
> > > > The underlying hrtimer is already programmed with nsec resolution.
> > >
> > > hm, maybe.  It's not very nice to be using one syscall to alter the
> > > interpretation of another syscall's argument in this fashion.  For
> > > example, one wonders how strace(1) is to properly interpret & display
> > > this argument?
> > >
> > > Did you consider adding epoll_wait2()/epoll_pwait2() syscalls which
> > > take a nsec timeout?  Seems simpler.
> >
> > I took a first stab. The patch does become quite a bit more complex.
> 
> Not complex in terms of timeout logic. Just a bigger patch, taking as
> example the recent commit ecb8ac8b1f14 that added process_madvise.

That's OK - it's mainly syscall table patchery.  The fs/ changes are
what matters.  And the interface.

> > I was not aware of how uncommon syscall argument interpretation
> > contingent on internal object state really is. Yes, that can
> > complicate inspection with strace, seccomp, ... This particular case
> > seems benign to me. But perhaps it sets a precedent.
> >
> > A new nsec resolution epoll syscall would be analogous to pselect and
> > ppoll, both of which switched to nsec resolution timespec.
> >
> > Since creating new syscalls is rare, add a flags argument at the same time?

Adding a syscall is pretty cheap - it's just a table entry.

> >
> > Then I would split the change in two: (1) add the new syscall with
> > extra flags argument, (2) define flag EPOLL_WAIT_NSTIMEO to explicitly
> > change the time scale of the timeout argument. To avoid easy mistakes
> > by callers in absence of stronger typing.

I don't understand this.  You're proposing that the new epoll_pwait2() be
able to take either msec or nsec, based on the flags argument?  With a
longer-term plan to deprecate the old epoll_pwait()?

If so, that's not likely to be viable - how can we ever know that the
whole world stopped using the old syscall?

> Come to think of it, better to convert to timespec to both have actual
> typing and consistency with ppoll/pselect.

Sure.

> > epoll_wait is missing from include/uapi/asm-generic/unistd.h as it is
> > superseded by epoll_pwait. Following the same rationale, add
> > epoll_pwait2 (only).

Sure.

> A separate RFC patch against manpages/master sent at the same time?

That's the common approach - a followup saying "here's what I'll send
to the manpages people if this gets merged".

And something under tools/testing/sefltests/ would be nice, if only so
that the various arch maintainers can verify that their new syscall is
working correctly.  Perhaps by adding a please-use-epoll_pwait2 arg to
the existing
tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c, if that
looks like a suitable testcase.
Willem de Bruijn Nov. 17, 2020, 2:21 a.m. UTC | #11
On Mon, Nov 16, 2020 at 7:37 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Mon, 16 Nov 2020 18:51:16 -0500 Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote:
>
> > On Mon, Nov 16, 2020 at 6:36 PM Willem de Bruijn
> > <willemdebruijn.kernel@gmail.com> wrote:
> > >
> > > On Mon, Nov 16, 2020 at 3:04 PM Andrew Morton <akpm@linux-foundation.org> wrote:
> > > >
> > > > On Mon, 16 Nov 2020 11:10:01 -0500 Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote:
> > > >
> > > > > From: Willem de Bruijn <willemb@google.com>
> > > > >
> > > > > Add epoll_create1 flag EPOLL_NSTIMEO. When passed, this changes the
> > > > > interpretation of argument timeout in epoll_wait from msec to nsec.
> > > > >
> > > > > Use cases such as datacenter networking operate on timescales well
> > > > > below milliseconds. Shorter timeouts bounds their tail latency.
> > > > > The underlying hrtimer is already programmed with nsec resolution.
> > > >
> > > > hm, maybe.  It's not very nice to be using one syscall to alter the
> > > > interpretation of another syscall's argument in this fashion.  For
> > > > example, one wonders how strace(1) is to properly interpret & display
> > > > this argument?
> > > >
> > > > Did you consider adding epoll_wait2()/epoll_pwait2() syscalls which
> > > > take a nsec timeout?  Seems simpler.
> > >
> > > I took a first stab. The patch does become quite a bit more complex.
> >
> > Not complex in terms of timeout logic. Just a bigger patch, taking as
> > example the recent commit ecb8ac8b1f14 that added process_madvise.
>
> That's OK - it's mainly syscall table patchery.  The fs/ changes are
> what matters.  And the interface.
>
> > > I was not aware of how uncommon syscall argument interpretation
> > > contingent on internal object state really is. Yes, that can
> > > complicate inspection with strace, seccomp, ... This particular case
> > > seems benign to me. But perhaps it sets a precedent.
> > >
> > > A new nsec resolution epoll syscall would be analogous to pselect and
> > > ppoll, both of which switched to nsec resolution timespec.
> > >
> > > Since creating new syscalls is rare, add a flags argument at the same time?
>
> Adding a syscall is pretty cheap - it's just a table entry.

Okay. Then I won't add a flags argument now.

> > >
> > > Then I would split the change in two: (1) add the new syscall with
> > > extra flags argument, (2) define flag EPOLL_WAIT_NSTIMEO to explicitly
> > > change the time scale of the timeout argument. To avoid easy mistakes
> > > by callers in absence of stronger typing.
>
> I don't understand this.  You're proposing that the new epoll_pwait2() be
> able to take either msec or nsec, based on the flags argument?

It wasn't elegant. Superseded by the below alternative to add a timespec.

> With a
> longer-term plan to deprecate the old epoll_pwait()?

> If so, that's not likely to be viable - how can we ever know that the
> whole world stopped using the old syscall?

I don't mean to deprecate it. I noticed that epoll_wait was removed
from asm-generic/unistd.h in favor of epoll_pwait on the argument that
this should list the minimally needed syscall set. Removed in commit
a0673fdbcd42 ("asm-generic: clean up asm/unistd.h"), a descriptive
comment was earlier added in commit e64a1617eca3 ("asm-generic: add a
generic unistd.h"). If the same argument still holds, when adding
epoll_pwait2 there, I should remove epoll_pwait. But I'm admittedly
not very familiar with the implications of touching this uapi file.
Will read up.

> > Come to think of it, better to convert to timespec to both have actual
> > typing and consistency with ppoll/pselect.
>
> Sure.
>
> > > epoll_wait is missing from include/uapi/asm-generic/unistd.h as it is
> > > superseded by epoll_pwait. Following the same rationale, add
> > > epoll_pwait2 (only).
>
> Sure.
>
> > A separate RFC patch against manpages/master sent at the same time?
>
> That's the common approach - a followup saying "here's what I'll send
> to the manpages people if this gets merged".
>
> And something under tools/testing/sefltests/ would be nice, if only so
> that the various arch maintainers can verify that their new syscall is
> working correctly.  Perhaps by adding a please-use-epoll_pwait2 arg to
> the existing
> tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c, if that
> looks like a suitable testcase.

Will do both. Thanks!
diff mbox series

Patch

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 4df61129566d..817d9cc5b8b8 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -225,6 +225,9 @@  struct eventpoll {
 	unsigned int napi_id;
 #endif
 
+	/* Accept timeout in ns resolution (EPOLL_NSTIMEO) */
+	unsigned int nstimeout:1;
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	/* tracks wakeup nests for lockdep validation */
 	u8 nests;
@@ -1787,17 +1790,20 @@  static int ep_send_events(struct eventpoll *ep,
 	return esed.res;
 }
 
-static inline struct timespec64 ep_set_mstimeout(long ms)
+static inline struct timespec64 ep_set_nstimeout(s64 ns)
 {
-	struct timespec64 now, ts = {
-		.tv_sec = ms / MSEC_PER_SEC,
-		.tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
-	};
+	struct timespec64 now, ts;
 
+	ts = ns_to_timespec64(ns);
 	ktime_get_ts64(&now);
 	return timespec64_add_safe(now, ts);
 }
 
+static inline struct timespec64 ep_set_mstimeout(long ms)
+{
+	return ep_set_nstimeout(ms * (s64)NSEC_PER_MSEC);
+}
+
 /**
  * ep_poll - Retrieves ready events, and delivers them to the caller supplied
  *           event buffer.
@@ -1826,7 +1832,10 @@  static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 	lockdep_assert_irqs_enabled();
 
 	if (timeout > 0) {
-		struct timespec64 end_time = ep_set_mstimeout(timeout);
+		struct timespec64 end_time;
+
+		end_time = ep->nstimeout ? ep_set_nstimeout(timeout) :
+					   ep_set_mstimeout(timeout);
 
 		slack = select_estimate_accuracy(&end_time);
 		to = &expires;
@@ -2046,7 +2055,7 @@  static int do_epoll_create(int flags)
 	/* Check the EPOLL_* constant for consistency.  */
 	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
 
-	if (flags & ~EPOLL_CLOEXEC)
+	if (flags & ~(EPOLL_CLOEXEC | EPOLL_NSTIMEO))
 		return -EINVAL;
 	/*
 	 * Create the internal data structure ("struct eventpoll").
@@ -2054,6 +2063,9 @@  static int do_epoll_create(int flags)
 	error = ep_alloc(&ep);
 	if (error < 0)
 		return error;
+
+	ep->nstimeout = !!(flags & EPOLL_NSTIMEO);
+
 	/*
 	 * Creates all the items needed to setup an eventpoll file. That is,
 	 * a file structure and a free file descriptor.
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index 8a3432d0f0dc..f6ef9c9f8ac2 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -21,6 +21,7 @@ 
 
 /* Flags for epoll_create1.  */
 #define EPOLL_CLOEXEC O_CLOEXEC
+#define EPOLL_NSTIMEO 0x1
 
 /* Valid opcodes to issue to sys_epoll_ctl() */
 #define EPOLL_CTL_ADD 1