diff mbox series

[V1] fuse: give wakeup hints to the scheduler

Message ID 1638780405-38026-1-git-send-email-quic_pragalla@quicinc.com (mailing list archive)
State New, archived
Headers show
Series [V1] fuse: give wakeup hints to the scheduler | expand

Commit Message

Pradeep Pragallapati Dec. 6, 2021, 8:46 a.m. UTC
The synchronous wakeup interface is available only for the
interruptible wakeup. Add it for normal wakeup and use this
synchronous wakeup interface to wakeup the userspace daemon.
Scheduler can make use of this hint to find a better CPU for
the waker task.

With this change the performance numbers for compress, decompress
and copy use-cases on /sdcard path have improved by ~30%.

Use-case details:
1. copy 10000 files of each 4k size into /sdcard path
2. use any File explorer application that has compress/decompress
support
3. start compress/decompress and capture the time.

-------------------------------------------------
| Default   | wakeup support | Improvement/Diff |
-------------------------------------------------
| 13.8 sec  | 9.9 sec        | 3.9 sec (28.26%) |
-------------------------------------------------

Co-developed-by: Pavankumar Kondeti <quic_pkondeti@quicinc.com>
Signed-off-by: Pradeep P V K <quic_pragalla@quicinc.com>
---
 fs/fuse/dev.c        | 22 +++++++++++++---------
 fs/fuse/fuse_i.h     |  6 +++---
 fs/fuse/virtio_fs.c  |  8 +++++---
 include/linux/wait.h |  1 +
 4 files changed, 22 insertions(+), 15 deletions(-)

Comments

Miklos Szeredi Dec. 7, 2021, 9:07 a.m. UTC | #1
On Mon, 6 Dec 2021 at 09:47, Pradeep P V K <quic_pragalla@quicinc.com> wrote:
>
> The synchronous wakeup interface is available only for the
> interruptible wakeup. Add it for normal wakeup and use this
> synchronous wakeup interface to wakeup the userspace daemon.
> Scheduler can make use of this hint to find a better CPU for
> the waker task.

Ingo, Peter,

What exactly does WF_SYNC do?   Does it try to give the waker's CPU
immediately to the woken task?

If that doesn't work (e.g. in this patch the wake up is done with a
spin lock held) does it do anything?

Does it give a hint that the waked task should be scheduled on this
CPU at the next scheduling point?

Thanks,
Miklos


>
> With this change the performance numbers for compress, decompress
> and copy use-cases on /sdcard path has improved by ~30%.
>
> Use-case details:
> 1. copy 10000 files of each 4k size into /sdcard path
> 2. use any File explorer application that has compress/decompress
> support
> 3. start compress/decompress and capture the time.
>
> -------------------------------------------------
> | Default   | wakeup support | Improvement/Diff |
> -------------------------------------------------
> | 13.8 sec  | 9.9 sec        | 3.9 sec (28.26%) |
> -------------------------------------------------
>
> Co-developed-by: Pavankumar Kondeti <quic_pkondeti@quicinc.com>
> Signed-off-by: Pradeep P V K <quic_pragalla@quicinc.com>
> ---
>  fs/fuse/dev.c        | 22 +++++++++++++---------
>  fs/fuse/fuse_i.h     |  6 +++---
>  fs/fuse/virtio_fs.c  |  8 +++++---
>  include/linux/wait.h |  1 +
>  4 files changed, 22 insertions(+), 15 deletions(-)
>
> diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
> index cd54a52..fef2e23 100644
> --- a/fs/fuse/dev.c
> +++ b/fs/fuse/dev.c
> @@ -207,10 +207,13 @@ static unsigned int fuse_req_hash(u64 unique)
>  /**
>   * A new request is available, wake fiq->waitq
>   */
> -static void fuse_dev_wake_and_unlock(struct fuse_iqueue *fiq)
> +static void fuse_dev_wake_and_unlock(struct fuse_iqueue *fiq, bool sync)
>  __releases(fiq->lock)
>  {
> -       wake_up(&fiq->waitq);
> +       if (sync)
> +               wake_up_sync(&fiq->waitq);
> +       else
> +               wake_up(&fiq->waitq);
>         kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
>         spin_unlock(&fiq->lock);
>  }
> @@ -223,14 +226,15 @@ const struct fuse_iqueue_ops fuse_dev_fiq_ops = {
>  EXPORT_SYMBOL_GPL(fuse_dev_fiq_ops);
>
>  static void queue_request_and_unlock(struct fuse_iqueue *fiq,
> -                                    struct fuse_req *req)
> +                                    struct fuse_req *req,
> +                                    bool sync)
>  __releases(fiq->lock)
>  {
>         req->in.h.len = sizeof(struct fuse_in_header) +
>                 fuse_len_args(req->args->in_numargs,
>                               (struct fuse_arg *) req->args->in_args);
>         list_add_tail(&req->list, &fiq->pending);
> -       fiq->ops->wake_pending_and_unlock(fiq);
> +       fiq->ops->wake_pending_and_unlock(fiq, sync);
>  }
>
>  void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
> @@ -245,7 +249,7 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
>         if (fiq->connected) {
>                 fiq->forget_list_tail->next = forget;
>                 fiq->forget_list_tail = forget;
> -               fiq->ops->wake_forget_and_unlock(fiq);
> +               fiq->ops->wake_forget_and_unlock(fiq, 0);
>         } else {
>                 kfree(forget);
>                 spin_unlock(&fiq->lock);
> @@ -265,7 +269,7 @@ static void flush_bg_queue(struct fuse_conn *fc)
>                 fc->active_background++;
>                 spin_lock(&fiq->lock);
>                 req->in.h.unique = fuse_get_unique(fiq);
> -               queue_request_and_unlock(fiq, req);
> +               queue_request_and_unlock(fiq, req, 0);
>         }
>  }
>
> @@ -358,7 +362,7 @@ static int queue_interrupt(struct fuse_req *req)
>                         spin_unlock(&fiq->lock);
>                         return 0;
>                 }
> -               fiq->ops->wake_interrupt_and_unlock(fiq);
> +               fiq->ops->wake_interrupt_and_unlock(fiq, 0);
>         } else {
>                 spin_unlock(&fiq->lock);
>         }
> @@ -425,7 +429,7 @@ static void __fuse_request_send(struct fuse_req *req)
>                 /* acquire extra reference, since request is still needed
>                    after fuse_request_end() */
>                 __fuse_get_request(req);
> -               queue_request_and_unlock(fiq, req);
> +               queue_request_and_unlock(fiq, req, 1);
>
>                 request_wait_answer(req);
>                 /* Pairs with smp_wmb() in fuse_request_end() */
> @@ -600,7 +604,7 @@ static int fuse_simple_notify_reply(struct fuse_mount *fm,
>
>         spin_lock(&fiq->lock);
>         if (fiq->connected) {
> -               queue_request_and_unlock(fiq, req);
> +               queue_request_and_unlock(fiq, req, 0);
>         } else {
>                 err = -ENODEV;
>                 spin_unlock(&fiq->lock);
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index c1a8b31..363e0ba 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -389,19 +389,19 @@ struct fuse_iqueue_ops {
>         /**
>          * Signal that a forget has been queued
>          */
> -       void (*wake_forget_and_unlock)(struct fuse_iqueue *fiq)
> +       void (*wake_forget_and_unlock)(struct fuse_iqueue *fiq, bool sync)
>                 __releases(fiq->lock);
>
>         /**
>          * Signal that an INTERRUPT request has been queued
>          */
> -       void (*wake_interrupt_and_unlock)(struct fuse_iqueue *fiq)
> +       void (*wake_interrupt_and_unlock)(struct fuse_iqueue *fiq, bool sync)
>                 __releases(fiq->lock);
>
>         /**
>          * Signal that a request has been queued
>          */
> -       void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq)
> +       void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq, bool sync)
>                 __releases(fiq->lock);
>
>         /**
> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
> index 4cfa4bc..bdc3dcc 100644
> --- a/fs/fuse/virtio_fs.c
> +++ b/fs/fuse/virtio_fs.c
> @@ -972,7 +972,7 @@ static struct virtio_driver virtio_fs_driver = {
>  #endif
>  };
>
> -static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq)
> +static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq, bool sync)
>  __releases(fiq->lock)
>  {
>         struct fuse_forget_link *link;
> @@ -1007,7 +1007,8 @@ __releases(fiq->lock)
>         kfree(link);
>  }
>
> -static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq)
> +static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq,
> +                                               bool sync)
>  __releases(fiq->lock)
>  {
>         /*
> @@ -1222,7 +1223,8 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
>         return ret;
>  }
>
> -static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq)
> +static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq,
> +                                       bool sync)
>  __releases(fiq->lock)
>  {
>         unsigned int queue_id = VQ_REQUEST; /* TODO multiqueue */
> diff --git a/include/linux/wait.h b/include/linux/wait.h
> index 2d0df57..690572ee 100644
> --- a/include/linux/wait.h
> +++ b/include/linux/wait.h
> @@ -228,6 +228,7 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode);
>  #define wake_up_interruptible_nr(x, nr)        __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL)
>  #define wake_up_interruptible_all(x)   __wake_up(x, TASK_INTERRUPTIBLE, 0, NULL)
>  #define wake_up_interruptible_sync(x)  __wake_up_sync((x), TASK_INTERRUPTIBLE)
> +#define wake_up_sync(x)                        __wake_up_sync(x, TASK_NORMAL)
>
>  /*
>   * Wakeup macros to be used to report events to the targets.
> --
> 2.7.4
>
Peter Zijlstra Dec. 7, 2021, 10:07 a.m. UTC | #2
On Tue, Dec 07, 2021 at 10:07:45AM +0100, Miklos Szeredi wrote:
> On Mon, 6 Dec 2021 at 09:47, Pradeep P V K <quic_pragalla@quicinc.com> wrote:
> >
> > The synchronous wakeup interface is available only for the
> > interruptible wakeup. Add it for normal wakeup and use this
> > synchronous wakeup interface to wakeup the userspace daemon.
> > Scheduler can make use of this hint to find a better CPU for
> > the waker task.

That's a horrendously bad changelog :-/ Also, if you need it for
UNINTERRUPTIBLE that's trivial to do ofc.

> Ingo, Peter,
> 
> What exactly does WF_SYNC do?   Does it try to give the waker's CPU
> immediately to the waked?
> 
> If that doesn't work (e.g. in this patch the wake up is done with a
> spin lock held) does it do anything?
> 
> Does it give a hint that the waked task should be scheduled on this
> CPU at the next scheduling point?

WF_SYNC is a hint to the scheduler that the waker is about to go sleep
and as such it is reasonable to stack the woken thread on this CPU
instead of going to find an idle CPU for it.

Typically it also means that the waker and wakee share data, and thus
having them share the CPU is beneficial for cache locality.

That said, WF_SYNC is 'difficult' since not all users of the hint keep
their promise, so there's a bit of heuristics sprinkled on :/
Miklos Szeredi Dec. 7, 2021, 10:20 a.m. UTC | #3
On Tue, 7 Dec 2021 at 11:07, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Dec 07, 2021 at 10:07:45AM +0100, Miklos Szeredi wrote:
> > On Mon, 6 Dec 2021 at 09:47, Pradeep P V K <quic_pragalla@quicinc.com> wrote:
> > >
> > > The synchronous wakeup interface is available only for the
> > > interruptible wakeup. Add it for normal wakeup and use this
> > > synchronous wakeup interface to wakeup the userspace daemon.
> > > Scheduler can make use of this hint to find a better CPU for
> > > the waker task.
>
> That's a horrendously bad changelog :-/ Also, if you need it for
> UNINTERRUPTIBLE that's trivial to do ofc.
>
> > Ingo, Peter,
> >
> > What exactly does WF_SYNC do?   Does it try to give the waker's CPU
> > immediately to the waked?
> >
> > If that doesn't work (e.g. in this patch the wake up is done with a
> > spin lock held) does it do anything?
> >
> > Does it give a hint that the waked task should be scheduled on this
> > CPU at the next scheduling point?
>
> WF_SYNC is a hint to the scheduler that the waker is about to go sleep
> and as such it is reasonable to stack the woken thread on this CPU
> instead of going to find an idle CPU for it.
>
> Typically it also means that the waker and wakee share data, and thus
> having them share the CPU is beneficial for cache locality.

Okay, so it doesn't give up the CPU immediately to the woken task,
just marks the woken task as a "successor" when the current task goes
to sleep.  Right?

That may be good for fuse as the data is indeed shared.  It would be
even better if the woken task had already a known affinity to this
CPU, since that would mean thread local data wouldn't have to be
migrated each time...   So I'm not sure sync wakeup alone is worth it,
needs real life benchmarking.

Thanks,
Miklos
Peter Zijlstra Dec. 7, 2021, 10:41 a.m. UTC | #4
On Tue, Dec 07, 2021 at 11:20:59AM +0100, Miklos Szeredi wrote:
> On Tue, 7 Dec 2021 at 11:07, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Tue, Dec 07, 2021 at 10:07:45AM +0100, Miklos Szeredi wrote:
> > > On Mon, 6 Dec 2021 at 09:47, Pradeep P V K <quic_pragalla@quicinc.com> wrote:
> > > >
> > > > The synchronous wakeup interface is available only for the
> > > > interruptible wakeup. Add it for normal wakeup and use this
> > > > synchronous wakeup interface to wakeup the userspace daemon.
> > > > Scheduler can make use of this hint to find a better CPU for
> > > > the waker task.
> >
> > That's a horrendously bad changelog :-/ Also, if you need it for
> > UNINTERRUPTIBLE that's trivial to do ofc.
> >
> > > Ingo, Peter,
> > >
> > > What exactly does WF_SYNC do?   Does it try to give the waker's CPU
> > > immediately to the waked?
> > >
> > > If that doesn't work (e.g. in this patch the wake up is done with a
> > > spin lock held) does it do anything?
> > >
> > > Does it give a hint that the waked task should be scheduled on this
> > > CPU at the next scheduling point?
> >
> > WF_SYNC is a hint to the scheduler that the waker is about to go sleep
> > and as such it is reasonable to stack the woken thread on this CPU
> > instead of going to find an idle CPU for it.
> >
> > Typically it also means that the waker and wakee share data, and thus
> > having them share the CPU is beneficial for cache locality.
> 
> Okay, so it doesn't give up the CPU immediately to the woken task,
> just marks the woken task as a "successor" when the current task goes
> to sleep.  Right?

More or less.

> That may be good for fuse as the data is indeed shared.  It would be
> even better if the woken task had already a known affinity to this
> CPU, since that would mean thread local data wouldn't have to be
> migrated each time...   So I'm not sure sync wakeup alone is worth it,
> needs real life benchmarking.

Hard affinities have other down-sides.. occasional migrations shouldn't
be a problem, constant migrations are bad.
Miklos Szeredi Dec. 7, 2021, 12:44 p.m. UTC | #5
On Tue, 7 Dec 2021 at 11:42, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Dec 07, 2021 at 11:20:59AM +0100, Miklos Szeredi wrote:

> > That may be good for fuse as the data is indeed shared.  It would be
> > even better if the woken task had already a known affinity to this
> > CPU, since that would mean thread local data wouldn't have to be
> > migrated each time...   So I'm not sure sync wakeup alone is worth it,
> > needs real life benchmarking.
>
> Hard affinities have other down-sides.. occasional migrations shouldn't
> be a problem, constant migrations are bad.

Currently fuse readers are sleeping in
wait_event_interruptible_exclusive().  wake_up_interruptible_sync()
will pick the first thread in the queue and wake that up.  That will
likely result in migration, right?

What would be much nicer, is to look at all the threads on the waitq
and pick one that previously ran on the current CPU if there's one.
Could this be implemented?

Thanks,
Miklos
Peter Zijlstra Dec. 7, 2021, 1:45 p.m. UTC | #6
On Tue, Dec 07, 2021 at 01:44:49PM +0100, Miklos Szeredi wrote:
> On Tue, 7 Dec 2021 at 11:42, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Tue, Dec 07, 2021 at 11:20:59AM +0100, Miklos Szeredi wrote:
> 
> > > That may be good for fuse as the data is indeed shared.  It would be
> > > even better if the woken task had already a known affinity to this
> > > CPU, since that would mean thread local data wouldn't have to be
> > > migrated each time...   So I'm not sure sync wakeup alone is worth it,
> > > needs real life benchmarking.
> >
> > Hard affinities have other down-sides.. occasional migrations shouldn't
> > be a problem, constant migrations are bad.
> 
> Currently fuse readers are sleeping in
> wait_event_interruptible_exclusive().  wake_up_interruptible_sync()
> will pick the first thread in the queue and wake that up.  That will
> likely result in migration, right?

Per the _sync it will try harder to place the thread on the waking CPU.
Which is what you want right? For the worker to do the work on the CPU
that wakes it, since that CPU has the data cache-hot etc..

> What would be much nicer, is to look at all the threads on the waitq
> and pick one that previously ran on the current CPU if there's one.
> Could this be implemented?

It would violate the FIFO semantics of _exclusive.
Peter Zijlstra Dec. 7, 2021, 1:51 p.m. UTC | #7
On Tue, Dec 07, 2021 at 02:45:49PM +0100, Peter Zijlstra wrote:

> > What would be much nicer, is to look at all the threads on the waitq
> > and pick one that previously ran on the current CPU if there's one.
> > Could this be implemented?
> 
> It would violate the FIFO semantics of _exclusive.

That said, look at
kernel/locking/percpu-rwsem.c:percpu_rwsem_wake_function() for how to do
really terrible things with waitqueues, possibly including what you
suggest.
Miklos Szeredi Dec. 7, 2021, 2:03 p.m. UTC | #8
On Tue, 7 Dec 2021 at 14:51, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Dec 07, 2021 at 02:45:49PM +0100, Peter Zijlstra wrote:
>
> > > What would be much nicer, is to look at all the threads on the waitq
> > > and pick one that previously ran on the current CPU if there's one.
> > > Could this be implemented?
> >
> > It would violate the FIFO semantics of _exclusive.
>
> That said, look at
> kernel/locking/percpu-rwsem.c:percpu_rwsem_wake_function() for how to do
> really terrible things with waitqueues, possibly including what you
> suggest.

Okay, so it looks doable, but rather more involved than just sticking
that _sync onto the wake helper.

FIFO is used so that we always wake the most recently used thread, right?

That makes sense if it doesn't involve migration, but if the hot
thread is going to be moved to another CPU then we'd lost most of the
advantages.  Am I missing something?

Thanks,
Miklos
Peter Zijlstra Dec. 7, 2021, 2:25 p.m. UTC | #9
On Tue, Dec 07, 2021 at 03:03:01PM +0100, Miklos Szeredi wrote:
> On Tue, 7 Dec 2021 at 14:51, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Tue, Dec 07, 2021 at 02:45:49PM +0100, Peter Zijlstra wrote:
> >
> > > > What would be much nicer, is to look at all the threads on the waitq
> > > > and pick one that previously ran on the current CPU if there's one.
> > > > Could this be implemented?
> > >
> > > It would violate the FIFO semantics of _exclusive.
> >
> > That said, look at
> > kernel/locking/percpu-rwsem.c:percpu_rwsem_wake_function() for how to do
> > really terrible things with waitqueues, possibly including what you
> > suggest.
> 
> Okay, so it looks doable, but rather more involved than just sticking
> that _sync onto the wake helper.
> 
> FIFO is used so that we always wake the most recently used thread, right?
> 
> That makes sense if it doesn't involve migration, but if the hot
> thread is going to be moved to another CPU then we'd lost most of the
> advantages.  Am I missing something?

FIFO means the thread used longest ago gets to go first. If your threads
are an idempotent workers, FIFO might not be the best option. But I'm
not much familiar with the FUSE code or it's design.
Miklos Szeredi Dec. 8, 2021, 3:46 p.m. UTC | #10
On Tue, 7 Dec 2021 at 15:25, Peter Zijlstra <peterz@infradead.org> wrote:

> FIFO means the thread used longest ago gets to go first. If your threads
> are an idempotent workers, FIFO might not be the best option. But I'm
> not much familiar with the FUSE code or it's design.

Okay.  Did some experiments, but couldn't see
wake_up_interruptible_sync() actually migrate the woken task, the
behavior was identical to wake_up_interruptible().   I guess this is
the "less" part in "more or less", but it would be good to see more
clearly what is happening.

I'll try to describe the design to give more context:

- FUSE is similar to network filesystem in that there's a server and a
client, except both are on the same host. The client lives in the
kernel and the server lives in userspace.

- Communication between them is done with read and write syscalls.

- Usually the server has multiple threads.  When a server thread is
idle it is blocking in sys_read -> ... -> fuse_dev_do_read ->
wait_event_interruptible_exclusive(fiq->waitq,...).

- When a filesystem request comes in (e.g. mkdir) a request is
constructed, put on the input queue (fiq->pending) and fiq->waitq
woken up.  After this the client task goes to sleep in
request_wait_answer -> wait_event_interruptible(req->waitq, ...).

- The server thread takes the request off the pending list, copies the
data to the userspace buffer and puts the request on the processing
list.

- The userspace part interprets the read buffer, performs the fs
operation, and writes the reply.

- During the write(2) the reply is now copied to the kernel and the
request is looked up on the processing list.  The client is woken up
through req->waitq.  After returning from write(2) the server thread
again calls read(2) to get the next request.

- After being woken up, the client task now returns with the result of
the operation.

- The above example is for synchronous requests.  There are async
requests like readahead or buffered writes.  In that case the client
does not call request_wait_answer() but returns immediately and the
result is processed from the server thread using a callback function
(req->args->end()).

From a scheduling perspective it would be ideal if the server thread's
CPU was matched to the client thread's CPU, since that would make the
data stay local, and for synchronous requests a _sync type wakeup is
perfect, since the client goes to sleep just as the server starts
processing and vice versa.

Always migrating the woken server thread to the client's CPU is not
going to be good, since this would result in too many migrations and
would lose locality for the server's stack.

Another idea is to add per-cpu input queues.  The client then would
queue the request on the pending queue corresponding to its CPU and
wake up the server thread blocked on that queue.

What happens though if this particular queue has no servers?  Or if a
queue is starved because it's served by less threads than another?
Handling these cases seems really complicated.

Is there a simper way?

Thanks,
Miklos
Shachar Sharon Dec. 9, 2021, 1:23 p.m. UTC | #11
On Wed, Dec 08, 2021 at 04:46:06PM +0100, Miklos Szeredi wrote:
>On Tue, 7 Dec 2021 at 15:25, Peter Zijlstra <peterz@infradead.org> wrote:
>
>> FIFO means the thread used longest ago gets to go first. If your threads
>> are an idempotent workers, FIFO might not be the best option. But I'm
>> not much familiar with the FUSE code or it's design.
>
>Okay.  Did some experiments, but couldn't see
>wake_up_interruptible_sync() actually migrate the woken task, the
>behavior was identical to wake_up_interruptible().   I guess this is
>the "less" part in "more or less", but it would be good to see more
>clearly what is happening.
>
>I'll try to describe the design to give more context:
>
>- FUSE is similar to network filesystem in that there's a server and a
>client, except both are on the same host. The client lives in the
>kernel and the server lives in userspace.
>
>- Communication between them is done with read and write syscalls.
>
>- Usually the server has multiple threads.  When a server thread is
>idle it is blocking in sys_read -> ... -> fuse_dev_do_read ->
>wait_event_interruptible_exclusive(fiq->waitq,...).
>
>- When a filesystem request comes in (e.g. mkdir) a request is
>constructed, put on the input queue (fiq->pending) and fiq->waitq
>woken up.  After this the client task goes to sleep in
>request_wait_answer -> wait_event_interruptible(req->waitq, ...).
>
>- The server thread takes the request off the pending list, copies the
>data to the userspace buffer and puts the request on the processing
>list.
>
>- The userspace part interprets the read buffer, performs the fs
>operation, and writes the reply.
>
>- During the write(2) the reply is now copied to the kernel and the
>request is looked up on the processing list.  The client is woken up
>through req->waitq.  After returning from write(2) the server thread
>again calls read(2) to get the next request.
>
>- After being woken up, the client task now returns with the result of
>the operation.
>
>- The above example is for synchronous requests.  There are async
>requests like readahead or buffered writes.  In that case the client
>does not call request_wait_answer() but returns immediately and the
>result is processed from the server thread using a callback function
>(req->args->end()).
>
>From a scheduling perspective it would be ideal if the server thread's
>CPU was matched to the client thread's CPU, since that would make the
>data stay local, and for synchronous requests a _sync type wakeup is
>perfect, since the client goes to sleep just as the server starts
>processing and vice versa.
>
>Always migrating the woken server thread to the client's CPU is not
>going to be good, since this would result in too many migrations and
>would lose locality for the server's stack.
>
>Another idea is to add per-cpu input queues.  The client then would
>queue the request on the pending queue corresponding to its CPU and
>wake up the server thread blocked on that queue.
>
>What happens though if this particular queue has no servers?  Or if a
>queue is starved because it's served by less threads than another?
>Handling these cases seems really complicated.
>
Per-CPU input queue is a great idea. It was a key concept in ZUFS[1]
design. However, describing it as complicated would be an understatement,
to say the least.

>Is there a simper way?
Here is an idea: hint the userspace server on which cpu to execute the
_next_ request via the (unused) fuse_in_header.padding field in current
request. Naturally, this method would be effective only for cases where
queue-size > 1.

>
>Thanks,
>Miklos

[1] https://lwn.net/Articles/795996/
qixiaoyu1 Dec. 22, 2022, 9:34 a.m. UTC | #12
Hi Pradeep, Miklos,

We have found this patch benefits the decompress performance up to ~46%
on our Android smart devices.

Use-case details:
1. prepare a zip file with 2000 pictures into /sdcard path
2. use unzip command to decompress the zip file

-------------------------------------------------
| Default   | patch          | Improvement/Diff |
-------------------------------------------------
| 13 sec    | 7 sec          | 6 sec (46%)      |
-------------------------------------------------

On Android platform, the server thread runs in MediaProvider as a
background service, and it may runs on little cores. However, applications
usually run on big cores, for they are foreground. With this patch, the
server threads can run much faster and benefit the decompress performance.

------------------------------------------------------------------------
| Total Wall duration of server threads (ms)                           |
------------------------------------------------------------------------
|           | Wall duration  | Occurrences   | Occurrences on big core |
------------------------------------------------------------------------
| Default   | 3583  (ms)     | 13926         | 5%                      |
------------------------------------------------------------------------
| patch     | 1276  (ms)     | 12996         | 79%                     |
------------------------------------------------------------------------

Thanks,
Xiaoyu

On Mon, Dec 06, 2021 at 02:16:45PM +0530, Pradeep P V K wrote:
> The synchronous wakeup interface is available only for the
> interruptible wakeup. Add it for normal wakeup and use this
> synchronous wakeup interface to wakeup the userspace daemon.
> Scheduler can make use of this hint to find a better CPU for
> the waker task.
> 
> With this change the performance numbers for compress, decompress
> and copy use-cases on /sdcard path has improved by ~30%.
> 
> Use-case details:
> 1. copy 10000 files of each 4k size into /sdcard path
> 2. use any File explorer application that has compress/decompress
> support
> 3. start compress/decompress and capture the time.
> 
> -------------------------------------------------
> | Default   | wakeup support | Improvement/Diff |
> -------------------------------------------------
> | 13.8 sec  | 9.9 sec        | 3.9 sec (28.26%) |
> -------------------------------------------------
> 
> Co-developed-by: Pavankumar Kondeti <quic_pkondeti@quicinc.com>
> Signed-off-by: Pradeep P V K <quic_pragalla@quicinc.com>
> ---
>  fs/fuse/dev.c        | 22 +++++++++++++---------
>  fs/fuse/fuse_i.h     |  6 +++---
>  fs/fuse/virtio_fs.c  |  8 +++++---
>  include/linux/wait.h |  1 +
>  4 files changed, 22 insertions(+), 15 deletions(-)
> 
> diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
> index cd54a52..fef2e23 100644
> --- a/fs/fuse/dev.c
> +++ b/fs/fuse/dev.c
> @@ -207,10 +207,13 @@ static unsigned int fuse_req_hash(u64 unique)
>  /**
>   * A new request is available, wake fiq->waitq
>   */
> -static void fuse_dev_wake_and_unlock(struct fuse_iqueue *fiq)
> +static void fuse_dev_wake_and_unlock(struct fuse_iqueue *fiq, bool sync)
>  __releases(fiq->lock)
>  {
> -	wake_up(&fiq->waitq);
> +	if (sync)
> +		wake_up_sync(&fiq->waitq);
> +	else
> +		wake_up(&fiq->waitq);
>  	kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
>  	spin_unlock(&fiq->lock);
>  }
> @@ -223,14 +226,15 @@ const struct fuse_iqueue_ops fuse_dev_fiq_ops = {
>  EXPORT_SYMBOL_GPL(fuse_dev_fiq_ops);
>  
>  static void queue_request_and_unlock(struct fuse_iqueue *fiq,
> -				     struct fuse_req *req)
> +				     struct fuse_req *req,
> +				     bool sync)
>  __releases(fiq->lock)
>  {
>  	req->in.h.len = sizeof(struct fuse_in_header) +
>  		fuse_len_args(req->args->in_numargs,
>  			      (struct fuse_arg *) req->args->in_args);
>  	list_add_tail(&req->list, &fiq->pending);
> -	fiq->ops->wake_pending_and_unlock(fiq);
> +	fiq->ops->wake_pending_and_unlock(fiq, sync);
>  }
>  
>  void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
> @@ -245,7 +249,7 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
>  	if (fiq->connected) {
>  		fiq->forget_list_tail->next = forget;
>  		fiq->forget_list_tail = forget;
> -		fiq->ops->wake_forget_and_unlock(fiq);
> +		fiq->ops->wake_forget_and_unlock(fiq, 0);
>  	} else {
>  		kfree(forget);
>  		spin_unlock(&fiq->lock);
> @@ -265,7 +269,7 @@ static void flush_bg_queue(struct fuse_conn *fc)
>  		fc->active_background++;
>  		spin_lock(&fiq->lock);
>  		req->in.h.unique = fuse_get_unique(fiq);
> -		queue_request_and_unlock(fiq, req);
> +		queue_request_and_unlock(fiq, req, 0);
>  	}
>  }
>  
> @@ -358,7 +362,7 @@ static int queue_interrupt(struct fuse_req *req)
>  			spin_unlock(&fiq->lock);
>  			return 0;
>  		}
> -		fiq->ops->wake_interrupt_and_unlock(fiq);
> +		fiq->ops->wake_interrupt_and_unlock(fiq, 0);
>  	} else {
>  		spin_unlock(&fiq->lock);
>  	}
> @@ -425,7 +429,7 @@ static void __fuse_request_send(struct fuse_req *req)
>  		/* acquire extra reference, since request is still needed
>  		   after fuse_request_end() */
>  		__fuse_get_request(req);
> -		queue_request_and_unlock(fiq, req);
> +		queue_request_and_unlock(fiq, req, 1);
>  
>  		request_wait_answer(req);
>  		/* Pairs with smp_wmb() in fuse_request_end() */
> @@ -600,7 +604,7 @@ static int fuse_simple_notify_reply(struct fuse_mount *fm,
>  
>  	spin_lock(&fiq->lock);
>  	if (fiq->connected) {
> -		queue_request_and_unlock(fiq, req);
> +		queue_request_and_unlock(fiq, req, 0);
>  	} else {
>  		err = -ENODEV;
>  		spin_unlock(&fiq->lock);
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index c1a8b31..363e0ba 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -389,19 +389,19 @@ struct fuse_iqueue_ops {
>  	/**
>  	 * Signal that a forget has been queued
>  	 */
> -	void (*wake_forget_and_unlock)(struct fuse_iqueue *fiq)
> +	void (*wake_forget_and_unlock)(struct fuse_iqueue *fiq, bool sync)
>  		__releases(fiq->lock);
>  
>  	/**
>  	 * Signal that an INTERRUPT request has been queued
>  	 */
> -	void (*wake_interrupt_and_unlock)(struct fuse_iqueue *fiq)
> +	void (*wake_interrupt_and_unlock)(struct fuse_iqueue *fiq, bool sync)
>  		__releases(fiq->lock);
>  
>  	/**
>  	 * Signal that a request has been queued
>  	 */
> -	void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq)
> +	void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq, bool sync)
>  		__releases(fiq->lock);
>  
>  	/**
> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
> index 4cfa4bc..bdc3dcc 100644
> --- a/fs/fuse/virtio_fs.c
> +++ b/fs/fuse/virtio_fs.c
> @@ -972,7 +972,7 @@ static struct virtio_driver virtio_fs_driver = {
>  #endif
>  };
>  
> -static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq)
> +static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq, bool sync)
>  __releases(fiq->lock)
>  {
>  	struct fuse_forget_link *link;
> @@ -1007,7 +1007,8 @@ __releases(fiq->lock)
>  	kfree(link);
>  }
>  
> -static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq)
> +static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq,
> +						bool sync)
>  __releases(fiq->lock)
>  {
>  	/*
> @@ -1222,7 +1223,8 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
>  	return ret;
>  }
>  
> -static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq)
> +static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq,
> +					bool sync)
>  __releases(fiq->lock)
>  {
>  	unsigned int queue_id = VQ_REQUEST; /* TODO multiqueue */
> diff --git a/include/linux/wait.h b/include/linux/wait.h
> index 2d0df57..690572ee 100644
> --- a/include/linux/wait.h
> +++ b/include/linux/wait.h
> @@ -228,6 +228,7 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode);
>  #define wake_up_interruptible_nr(x, nr)	__wake_up(x, TASK_INTERRUPTIBLE, nr, NULL)
>  #define wake_up_interruptible_all(x)	__wake_up(x, TASK_INTERRUPTIBLE, 0, NULL)
>  #define wake_up_interruptible_sync(x)	__wake_up_sync((x), TASK_INTERRUPTIBLE)
> +#define wake_up_sync(x)			__wake_up_sync(x, TASK_NORMAL)
>  
>  /*
>   * Wakeup macros to be used to report events to the targets.
> -- 
> 2.7.4
>
diff mbox series

Patch

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index cd54a52..fef2e23 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -207,10 +207,13 @@  static unsigned int fuse_req_hash(u64 unique)
 /**
  * A new request is available, wake fiq->waitq
  */
-static void fuse_dev_wake_and_unlock(struct fuse_iqueue *fiq)
+static void fuse_dev_wake_and_unlock(struct fuse_iqueue *fiq, bool sync)
 __releases(fiq->lock)
 {
-	wake_up(&fiq->waitq);
+	if (sync)
+		wake_up_sync(&fiq->waitq);
+	else
+		wake_up(&fiq->waitq);
 	kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
 	spin_unlock(&fiq->lock);
 }
@@ -223,14 +226,15 @@  const struct fuse_iqueue_ops fuse_dev_fiq_ops = {
 EXPORT_SYMBOL_GPL(fuse_dev_fiq_ops);
 
 static void queue_request_and_unlock(struct fuse_iqueue *fiq,
-				     struct fuse_req *req)
+				     struct fuse_req *req,
+				     bool sync)
 __releases(fiq->lock)
 {
 	req->in.h.len = sizeof(struct fuse_in_header) +
 		fuse_len_args(req->args->in_numargs,
 			      (struct fuse_arg *) req->args->in_args);
 	list_add_tail(&req->list, &fiq->pending);
-	fiq->ops->wake_pending_and_unlock(fiq);
+	fiq->ops->wake_pending_and_unlock(fiq, sync);
 }
 
 void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
@@ -245,7 +249,7 @@  void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
 	if (fiq->connected) {
 		fiq->forget_list_tail->next = forget;
 		fiq->forget_list_tail = forget;
-		fiq->ops->wake_forget_and_unlock(fiq);
+		fiq->ops->wake_forget_and_unlock(fiq, 0);
 	} else {
 		kfree(forget);
 		spin_unlock(&fiq->lock);
@@ -265,7 +269,7 @@  static void flush_bg_queue(struct fuse_conn *fc)
 		fc->active_background++;
 		spin_lock(&fiq->lock);
 		req->in.h.unique = fuse_get_unique(fiq);
-		queue_request_and_unlock(fiq, req);
+		queue_request_and_unlock(fiq, req, 0);
 	}
 }
 
@@ -358,7 +362,7 @@  static int queue_interrupt(struct fuse_req *req)
 			spin_unlock(&fiq->lock);
 			return 0;
 		}
-		fiq->ops->wake_interrupt_and_unlock(fiq);
+		fiq->ops->wake_interrupt_and_unlock(fiq, 0);
 	} else {
 		spin_unlock(&fiq->lock);
 	}
@@ -425,7 +429,7 @@  static void __fuse_request_send(struct fuse_req *req)
 		/* acquire extra reference, since request is still needed
 		   after fuse_request_end() */
 		__fuse_get_request(req);
-		queue_request_and_unlock(fiq, req);
+		queue_request_and_unlock(fiq, req, 1);
 
 		request_wait_answer(req);
 		/* Pairs with smp_wmb() in fuse_request_end() */
@@ -600,7 +604,7 @@  static int fuse_simple_notify_reply(struct fuse_mount *fm,
 
 	spin_lock(&fiq->lock);
 	if (fiq->connected) {
-		queue_request_and_unlock(fiq, req);
+		queue_request_and_unlock(fiq, req, 0);
 	} else {
 		err = -ENODEV;
 		spin_unlock(&fiq->lock);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index c1a8b31..363e0ba 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -389,19 +389,19 @@  struct fuse_iqueue_ops {
 	/**
 	 * Signal that a forget has been queued
 	 */
-	void (*wake_forget_and_unlock)(struct fuse_iqueue *fiq)
+	void (*wake_forget_and_unlock)(struct fuse_iqueue *fiq, bool sync)
 		__releases(fiq->lock);
 
 	/**
 	 * Signal that an INTERRUPT request has been queued
 	 */
-	void (*wake_interrupt_and_unlock)(struct fuse_iqueue *fiq)
+	void (*wake_interrupt_and_unlock)(struct fuse_iqueue *fiq, bool sync)
 		__releases(fiq->lock);
 
 	/**
 	 * Signal that a request has been queued
 	 */
-	void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq)
+	void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq, bool sync)
 		__releases(fiq->lock);
 
 	/**
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 4cfa4bc..bdc3dcc 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -972,7 +972,7 @@  static struct virtio_driver virtio_fs_driver = {
 #endif
 };
 
-static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq)
+static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq, bool sync)
 __releases(fiq->lock)
 {
 	struct fuse_forget_link *link;
@@ -1007,7 +1007,8 @@  __releases(fiq->lock)
 	kfree(link);
 }
 
-static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq)
+static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq,
+						bool sync)
 __releases(fiq->lock)
 {
 	/*
@@ -1222,7 +1223,8 @@  static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
 	return ret;
 }
 
-static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq)
+static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq,
+					bool sync)
 __releases(fiq->lock)
 {
 	unsigned int queue_id = VQ_REQUEST; /* TODO multiqueue */
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 2d0df57..690572ee 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -228,6 +228,7 @@  void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode);
 #define wake_up_interruptible_nr(x, nr)	__wake_up(x, TASK_INTERRUPTIBLE, nr, NULL)
 #define wake_up_interruptible_all(x)	__wake_up(x, TASK_INTERRUPTIBLE, 0, NULL)
 #define wake_up_interruptible_sync(x)	__wake_up_sync((x), TASK_INTERRUPTIBLE)
+#define wake_up_sync(x)			__wake_up_sync(x, TASK_NORMAL)
 
 /*
  * Wakeup macros to be used to report events to the targets.