diff mbox series

[2/6] fuse: Optimize request_end() by not taking fiq->waitq.lock

Message ID 154149663862.17764.9649077325029198892.stgit@localhost.localdomain (mailing list archive)
State New, archived
Headers show
Series fuse: Interrupt-related optimizations and improvements | expand

Commit Message

Kirill Tkhai Nov. 6, 2018, 9:30 a.m. UTC
We take global fiq->waitq.lock every time, when we are
in this function, but interrupted requests are just small
subset of all requests. This patch optimizes request_end()
and makes it to take the lock when it's really needed.

queue_interrupt() needs small change for that. After req
is linked to interrupt list, we do smp_mb() and check
for FR_FINISHED again. In case of FR_FINISHED bit has
appeared, we remove req and leave the function:

CPU 0                                                CPU 1
queue_interrupt()                                    request_end()

  spin_lock(&fiq->waitq.lock)


  list_add_tail(&req->intr_entry, &fiq->interrupts)    test_and_set_bit(FR_FINISHED, &req->flags)
  smp_mb()                                             <memory barrier implied test_and_set_bit()>
  if (test_bit(FR_FINISHED, &req->flags))              if (!list_empty(&req->intr_entry))

    list_del_init(&req->intr_entry)                      spin_lock(&fiq->waitq.lock)
                                                         list_del_init(&req->intr_entry)

Check the change is visible in perf report:

1)Firstly mount fusexmp_fh:
  $fuse/example/.libs/fusexmp_fh mnt

2)Run test doing
    futimes(fd, tv1);
    futimes(fd, tv2);
  in many threads endlessly.

3)perf record -g (all the system load)

Without the patch in request_end() we spend 62.58% of do_write() time:
(= 12.58 / 20.10 * 100%)

   55,22% entry_SYSCALL_64
     20,10% do_writev
      ...
          18,08% fuse_dev_do_write
           12,58% request_end
            10,08% __wake_up_common_lock
            1,97% queued_spin_lock_slowpath
           1,31% fuse_copy_args
           1,04% fuse_copy_one
           0,85% queued_spin_lock_slowpath

With the patch, the perf report becomes better, and only 58.16%
of do_write() time we spend in request_end():

   54,15% entry_SYSCALL_64
     18,24% do_writev
      ...
          16,25% fuse_dev_do_write
           10,61% request_end
            10,25% __wake_up_common_lock
           1,34% fuse_copy_args
           1,06% fuse_copy_one
           0,88% queued_spin_lock_slowpath

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
---
 fs/fuse/dev.c |   30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

Comments

Miklos Szeredi Nov. 7, 2018, 1:09 p.m. UTC | #1
On Tue, Nov 6, 2018 at 10:30 AM, Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
> We take global fiq->waitq.lock every time, when we are
> in this function, but interrupted requests are just small
> subset of all requests. This patch optimizes request_end()
> and makes it to take the lock when it's really needed.
>
> queue_interrupt() needs small change for that. After req
> is linked to interrupt list, we do smp_mb() and check
> for FR_FINISHED again. In case of FR_FINISHED bit has
> appeared, we remove req and leave the function:
>
> CPU 0                                                CPU 1
> queue_interrupt()                                    request_end()
>
>   spin_lock(&fiq->waitq.lock)
>
>
>   list_add_tail(&req->intr_entry, &fiq->interrupts)    test_and_set_bit(FR_FINISHED, &req->flags)
>   smp_mb()                                             <memory barrier implied test_and_set_bit()>
>   if (test_bit(FR_FINISHED, &req->flags))              if (!list_empty(&req->intr_entry))
>
>     list_del_init(&req->intr_entry)                      spin_lock(&fiq->waitq.lock)
>                                                          list_del_init(&req->intr_entry)
>
> Check the change is visible in perf report:
>
> 1)Firstly mount fusexmp_fh:
>   $fuse/example/.libs/fusexmp_fh mnt
>
> 2)Run test doing
>     futimes(fd, tv1);
>     futimes(fd, tv2);
>   in many threads endlessly.
>
> 3)perf record -g (all the system load)
>
> Without the patch in request_end() we spend 62.58% of do_write() time:
> (= 12.58 / 20.10 * 100%)
>
>    55,22% entry_SYSCALL_64
>      20,10% do_writev
>       ...
>           18,08% fuse_dev_do_write
>            12,58% request_end
>             10,08% __wake_up_common_lock
>             1,97% queued_spin_lock_slowpath
>            1,31% fuse_copy_args
>            1,04% fuse_copy_one
>            0,85% queued_spin_lock_slowpath
>
> With the patch, the perf report becomes better, and only 58.16%
> of do_write() time we spend in request_end():
>
>    54,15% entry_SYSCALL_64
>      18,24% do_writev
>       ...
>           16,25% fuse_dev_do_write
>            10,61% request_end
>             10,25% __wake_up_common_lock
>            1,34% fuse_copy_args
>            1,06% fuse_copy_one
>            0,88% queued_spin_lock_slowpath
>
> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> ---
>  fs/fuse/dev.c |   30 ++++++++++++++++++++++--------
>  1 file changed, 22 insertions(+), 8 deletions(-)
>
> diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
> index 7705f75c77a3..391498e680ec 100644
> --- a/fs/fuse/dev.c
> +++ b/fs/fuse/dev.c
> @@ -427,10 +427,16 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
>
>         if (test_and_set_bit(FR_FINISHED, &req->flags))
>                 goto put_request;
> -
> -       spin_lock(&fiq->waitq.lock);
> -       list_del_init(&req->intr_entry);
> -       spin_unlock(&fiq->waitq.lock);
> +       /*
> +        * test_and_set_bit() implies smp_mb() between bit
> +        * changing and below intr_entry check. Pairs with
> +        * smp_mb() from queue_interrupt().
> +        */
> +       if (!list_empty(&req->intr_entry)) {
> +               spin_lock(&fiq->waitq.lock);
> +               list_del_init(&req->intr_entry);
> +               spin_unlock(&fiq->waitq.lock);
> +       }
>         WARN_ON(test_bit(FR_PENDING, &req->flags));
>         WARN_ON(test_bit(FR_SENT, &req->flags));
>         if (test_bit(FR_BACKGROUND, &req->flags)) {
> @@ -470,13 +476,21 @@ static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
>  {
>         bool kill = false;
>
> -       spin_lock(&fiq->waitq.lock);
> -       if (test_bit(FR_FINISHED, &req->flags)) {
> -               spin_unlock(&fiq->waitq.lock);
> +       if (test_bit(FR_FINISHED, &req->flags))

The only case this test would make sense if this was a performance
sensitive path, which it's not.

>                 return;
> -       }
> +       spin_lock(&fiq->waitq.lock);
>         if (list_empty(&req->intr_entry)) {
>                 list_add_tail(&req->intr_entry, &fiq->interrupts);
> +               /*
> +                * Pairs with smp_mb() implied by test_and_set_bit()
> +                * from request_end().
> +                */
> +               smp_mb();
> +               if (test_bit(FR_FINISHED, &req->flags)) {
> +                       list_del_init(&req->intr_entry);
> +                       spin_unlock(&fiq->waitq.lock);
> +                       return;
> +               }
>                 wake_up_locked(&fiq->waitq);
>                 kill = true;
>         }
>
diff mbox series

Patch

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 7705f75c77a3..391498e680ec 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -427,10 +427,16 @@  static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 
 	if (test_and_set_bit(FR_FINISHED, &req->flags))
 		goto put_request;
-
-	spin_lock(&fiq->waitq.lock);
-	list_del_init(&req->intr_entry);
-	spin_unlock(&fiq->waitq.lock);
+	/*
+	 * test_and_set_bit() implies smp_mb() between bit
+	 * changing and below intr_entry check. Pairs with
+	 * smp_mb() from queue_interrupt().
+	 */
+	if (!list_empty(&req->intr_entry)) {
+		spin_lock(&fiq->waitq.lock);
+		list_del_init(&req->intr_entry);
+		spin_unlock(&fiq->waitq.lock);
+	}
 	WARN_ON(test_bit(FR_PENDING, &req->flags));
 	WARN_ON(test_bit(FR_SENT, &req->flags));
 	if (test_bit(FR_BACKGROUND, &req->flags)) {
@@ -470,13 +476,21 @@  static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
 {
 	bool kill = false;
 
-	spin_lock(&fiq->waitq.lock);
-	if (test_bit(FR_FINISHED, &req->flags)) {
-		spin_unlock(&fiq->waitq.lock);
+	if (test_bit(FR_FINISHED, &req->flags))
 		return;
-	}
+	spin_lock(&fiq->waitq.lock);
 	if (list_empty(&req->intr_entry)) {
 		list_add_tail(&req->intr_entry, &fiq->interrupts);
+		/*
+		 * Pairs with smp_mb() implied by test_and_set_bit()
+		 * from request_end().
+		 */
+		smp_mb();
+		if (test_bit(FR_FINISHED, &req->flags)) {
+			list_del_init(&req->intr_entry);
+			spin_unlock(&fiq->waitq.lock);
+			return;
+		}
 		wake_up_locked(&fiq->waitq);
 		kill = true;
 	}