diff mbox series

[PATCHv3] md: Replace md_thread's wait queue with the swait variant

Message ID 20240307120835.87390-1-jinpu.wang@ionos.com (mailing list archive)
State Superseded, archived
Headers show
Series [PATCHv3] md: Replace md_thread's wait queue with the swait variant | expand

Commit Message

Jinpu Wang March 7, 2024, 12:08 p.m. UTC
From: Florian-Ewald Mueller <florian-ewald.mueller@ionos.com>

Replace md_thread's wait_event()/wake_up() related calls with their
simple swait~ variants to improve performance as well as memory and
stack footprint.

Use the IDLE state for the worker thread put to sleep instead of
misusing the INTERRUPTIBLE state combined with flushing signals
just for not contributing to system's cpu load-average stats.

Also check for sleeping thread before attempting its wake_up in
md_wakeup_thread() for avoiding unnecessary spinlock contention.

With this patch (backported) on a kernel 6.1, the IOPS improved
by around 4% with raid1 and/or raid5, in high IO load scenarios.

Fixes: 93588e2284b6 ("[PATCH] md: make md threads interruptible again")
Signed-off-by: Florian-Ewald Mueller <florian-ewald.mueller@ionos.com>
Signed-off-by: Jack Wang <jinpu.wang@ionos.com>
Reviewed-by: Paul Menzel <pmenzel@molgen.mpg.de>

---
v3: add reviewed by from Paul, add fixes tag.
v2: fix a type error for thread
 drivers/md/md.c | 23 +++++++----------------
 drivers/md/md.h |  2 +-
 2 files changed, 8 insertions(+), 17 deletions(-)

Comments

Jinpu Wang March 26, 2024, 5:22 a.m. UTC | #1
Hi Song, hi Kuai,

ping, Any comments?

Thx!

On Thu, Mar 7, 2024 at 1:08 PM Jack Wang <jinpu.wang@ionos.com> wrote:
>
> From: Florian-Ewald Mueller <florian-ewald.mueller@ionos.com>
>
> Replace md_thread's wait_event()/wake_up() related calls with their
> simple swait~ variants to improve performance as well as memory and
> stack footprint.
>
> Use the IDLE state for the worker thread put to sleep instead of
> misusing the INTERRUPTIBLE state combined with flushing signals
> just for not contributing to system's cpu load-average stats.
>
> Also check for sleeping thread before attempting its wake_up in
> md_wakeup_thread() for avoiding unnecessary spinlock contention.
>
> With this patch (backported) on a kernel 6.1, the IOPS improved
> by around 4% with raid1 and/or raid5, in high IO load scenarios.
>
> Fixes: 93588e2284b6 ("[PATCH] md: make md threads interruptible again")
> Signed-off-by: Florian-Ewald Mueller <florian-ewald.mueller@ionos.com>
> Signed-off-by: Jack Wang <jinpu.wang@ionos.com>
> Reviewed-by: Paul Menzel <pmenzel@molgen.mpg.de>
>
> ---
> v3: add reviewed by from Paul, add fixes tag.
> v2: fix a type error for thread
>  drivers/md/md.c | 23 +++++++----------------
>  drivers/md/md.h |  2 +-
>  2 files changed, 8 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 48ae2b1cb57a..14d12ee4b06b 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -7970,22 +7970,12 @@ static int md_thread(void *arg)
>          * many dirty RAID5 blocks.
>          */
>
> -       allow_signal(SIGKILL);
>         while (!kthread_should_stop()) {
>
> -               /* We need to wait INTERRUPTIBLE so that
> -                * we don't add to the load-average.
> -                * That means we need to be sure no signals are
> -                * pending
> -                */
> -               if (signal_pending(current))
> -                       flush_signals(current);
> -
> -               wait_event_interruptible_timeout
> -                       (thread->wqueue,
> -                        test_bit(THREAD_WAKEUP, &thread->flags)
> -                        || kthread_should_stop() || kthread_should_park(),
> -                        thread->timeout);
> +               swait_event_idle_timeout_exclusive(thread->wqueue,
> +                       test_bit(THREAD_WAKEUP, &thread->flags) ||
> +                       kthread_should_stop() || kthread_should_park(),
> +                       thread->timeout);
>
>                 clear_bit(THREAD_WAKEUP, &thread->flags);
>                 if (kthread_should_park())
> @@ -8017,7 +8007,8 @@ void md_wakeup_thread(struct md_thread __rcu *thread)
>         if (t) {
>                 pr_debug("md: waking up MD thread %s.\n", t->tsk->comm);
>                 set_bit(THREAD_WAKEUP, &t->flags);
> -               wake_up(&t->wqueue);
> +               if (swq_has_sleeper(&t->wqueue))
> +                       swake_up_one(&t->wqueue);
>         }
>         rcu_read_unlock();
>  }
> @@ -8032,7 +8023,7 @@ struct md_thread *md_register_thread(void (*run) (struct md_thread *),
>         if (!thread)
>                 return NULL;
>
> -       init_waitqueue_head(&thread->wqueue);
> +       init_swait_queue_head(&thread->wqueue);
>
>         thread->run = run;
>         thread->mddev = mddev;
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index b2076a165c10..1dc01bc5c038 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -716,7 +716,7 @@ static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
>  struct md_thread {
>         void                    (*run) (struct md_thread *thread);
>         struct mddev            *mddev;
> -       wait_queue_head_t       wqueue;
> +       struct swait_queue_head wqueue;
>         unsigned long           flags;
>         struct task_struct      *tsk;
>         unsigned long           timeout;
> --
> 2.34.1
>
Yu Kuai March 26, 2024, 2:09 p.m. UTC | #2
Hi,

在 2024/03/26 13:22, Jinpu Wang 写道:
> Hi Song, hi Kuai,
> 
> ping, Any comments?
> 
> Thx!
> 
> On Thu, Mar 7, 2024 at 1:08 PM Jack Wang <jinpu.wang@ionos.com> wrote:
>>
>> From: Florian-Ewald Mueller <florian-ewald.mueller@ionos.com>
>>
>> Replace md_thread's wait_event()/wake_up() related calls with their
>> simple swait~ variants to improve performance as well as memory and
>> stack footprint.
>>
>> Use the IDLE state for the worker thread put to sleep instead of
>> misusing the INTERRUPTIBLE state combined with flushing signals
>> just for not contributing to system's cpu load-average stats.
>>
>> Also check for sleeping thread before attempting its wake_up in
>> md_wakeup_thread() for avoiding unnecessary spinlock contention.

I think it'll be better to split this into a separate patch.
And can you check if we just add wq_has_sleeper(), will there be
performance improvement?

>>
>> With this patch (backported) on a kernel 6.1, the IOPS improved
>> by around 4% with raid1 and/or raid5, in high IO load scenarios.

Can you be more specific about your test? Because from what I know,
the IO fast path doesn't involve the daemon thread, and I don't yet
understand where the 4% improvement comes from.

Thanks,
Kuai
Florian-Ewald Müller March 26, 2024, 3:51 p.m. UTC | #3
Hello Kuai,

Thank you for your comments and questions.

About our tests:
- we used many (>= 100) logical volumes (each 100 GiB big) on top of 3
x md-raid5s (making up a raid50) on 8 SSDs each.
- further we started a fio instance doing random rd/wr (also of random
sizes) on each of those volumes,

And, yes indeed, as you suggested, we observed some performance change
already after adding (first) wq_has_sleeper().
The fio IOPS results improved by around 3-4% - but, in my experience,
every fluctuation <= 3% can also have other reasons (e.g. different
timings, temperature on SSDs, etc.).

Only afterwards, I've also changed the wait queue construct with the
swait variant (sufficient here), as its wake up path is simpler,
faster and with a smaller stack footprint.
Also used the (now since some years available) IDLE state for the
waiting thread to eliminate the (not anymore necessary) checking and
flushing of signals.
This second round of changes, although theoretically an improvement,
didn't bring any measurable performance increase, though.
This was more or less expected.

If you think, the simple adding of the wq_has_sleeper() is more
justifiable, please apply only this change.
Later today, I'll also send you a patch containing only this simple change.

Best regards,
Florian

On Tue, Mar 26, 2024 at 3:09 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>
> Hi,
>
> 在 2024/03/26 13:22, Jinpu Wang 写道:
> > Hi Song, hi Kuai,
> >
> > ping, Any comments?
> >
> > Thx!
> >
> > On Thu, Mar 7, 2024 at 1:08 PM Jack Wang <jinpu.wang@ionos.com> wrote:
> >>
> >> From: Florian-Ewald Mueller <florian-ewald.mueller@ionos.com>
> >>
> >> Replace md_thread's wait_event()/wake_up() related calls with their
> >> simple swait~ variants to improve performance as well as memory and
> >> stack footprint.
> >>
> >> Use the IDLE state for the worker thread put to sleep instead of
> >> misusing the INTERRUPTIBLE state combined with flushing signals
> >> just for not contributing to system's cpu load-average stats.
> >>
> >> Also check for sleeping thread before attempting its wake_up in
> >> md_wakeup_thread() for avoiding unnecessary spinlock contention.
>
> I think it'll be better to split this into a seperate patch.
> And can you check if we just add wq_has_sleeper(), will there be
> performance improvement?
>
> >>
> >> With this patch (backported) on a kernel 6.1, the IOPS improved
> >> by around 4% with raid1 and/or raid5, in high IO load scenarios.
>
> Can you be more specifical about your test? because from what I know,
> IO fast path doesn't involved with daemon thread, and I don't understand
> yet where the 4% improvement is from.
>
> Thanks,
> Kuai
>
Florian-Ewald Müller March 26, 2024, 7:49 p.m. UTC | #4
Hi Kuai,

here (attached) the promised simple patch.

Best,
Florian

On Tue, Mar 26, 2024 at 4:51 PM Florian-Ewald Müller
<florian-ewald.mueller@ionos.com> wrote:
>
> Hello Kuai,
>
> Thank you for your comments and questions.
>
> About our tests:
> - we used many (>= 100) logical volumes (each 100 GiB big) on top of 3
> x md-raid5s (making up a raid50) on 8 SSDs each.
> - further we started a fio instance doing random rd/wr (also of random
> sizes) on each of those volumes,
>
> And, yes indeed, as you suggested, we observed some performance change
> already after adding (first) wq_has_sleeper().
> The fio IOPS results improved by around 3-4% - but, in my experience,
> every fluctuation <= 3% can also have other reasons (e.g. different
> timings, temperature on SSDs, etc.).
>
> Only afterwards, I've also changed the wait queue construct with the
> swait variant (sufficient here), as its wake up path is simpler,
> faster and with a smaller stack footprint.
> Also used the (now since some years available) IDLE state for the
> waiting thread to eliminate the (not anymore necessary) checking and
> flushing of signals.
> This second round of changes, although theoretically an improvement,
> didn't bring any measurable performance increase, though.
> This was more or less expected.
>
> If you think, the simple adding of the wq_has_sleeper() is more
> justifiable, please apply only this change.
> Later today, I'll also send you a patch containing only this simple change.
>
> Best regards,
> Florian
>
> On Tue, Mar 26, 2024 at 3:09 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
> >
> > Hi,
> >
> > 在 2024/03/26 13:22, Jinpu Wang 写道:
> > > Hi Song, hi Kuai,
> > >
> > > ping, Any comments?
> > >
> > > Thx!
> > >
> > > On Thu, Mar 7, 2024 at 1:08 PM Jack Wang <jinpu.wang@ionos.com> wrote:
> > >>
> > >> From: Florian-Ewald Mueller <florian-ewald.mueller@ionos.com>
> > >>
> > >> Replace md_thread's wait_event()/wake_up() related calls with their
> > >> simple swait~ variants to improve performance as well as memory and
> > >> stack footprint.
> > >>
> > >> Use the IDLE state for the worker thread put to sleep instead of
> > >> misusing the INTERRUPTIBLE state combined with flushing signals
> > >> just for not contributing to system's cpu load-average stats.
> > >>
> > >> Also check for sleeping thread before attempting its wake_up in
> > >> md_wakeup_thread() for avoiding unnecessary spinlock contention.
> >
> > I think it'll be better to split this into a seperate patch.
> > And can you check if we just add wq_has_sleeper(), will there be
> > performance improvement?
> >
> > >>
> > >> With this patch (backported) on a kernel 6.1, the IOPS improved
> > >> by around 4% with raid1 and/or raid5, in high IO load scenarios.
> >
> > Can you be more specifical about your test? because from what I know,
> > IO fast path doesn't involved with daemon thread, and I don't understand
> > yet where the 4% improvement is from.
> >
> > Thanks,
> > Kuai
> >
Yu Kuai March 27, 2024, 1:07 a.m. UTC | #5
Hi,

在 2024/03/27 3:49, Florian-Ewald Müller 写道:
> Hi Kuai,
> 
> here (attached) the promised simple patch.
> 

You can send this patch, LGTM.

Thanks,
Kuai

> Best,
> Florian
> 
> On Tue, Mar 26, 2024 at 4:51 PM Florian-Ewald Müller
> <florian-ewald.mueller@ionos.com> wrote:
>>
>> Hello Kuai,
>>
>> Thank you for your comments and questions.
>>
>> About our tests:
>> - we used many (>= 100) logical volumes (each 100 GiB big) on top of 3
>> x md-raid5s (making up a raid50) on 8 SSDs each.
>> - further we started a fio instance doing random rd/wr (also of random
>> sizes) on each of those volumes,
>>
>> And, yes indeed, as you suggested, we observed some performance change
>> already after adding (first) wq_has_sleeper().
>> The fio IOPS results improved by around 3-4% - but, in my experience,
>> every fluctuation <= 3% can also have other reasons (e.g. different
>> timings, temperature on SSDs, etc.).
>>
>> Only afterwards, I've also changed the wait queue construct with the
>> swait variant (sufficient here), as its wake up path is simpler,
>> faster and with a smaller stack footprint.
>> Also used the (now since some years available) IDLE state for the
>> waiting thread to eliminate the (not anymore necessary) checking and
>> flushing of signals.
>> This second round of changes, although theoretically an improvement,
>> didn't bring any measurable performance increase, though.
>> This was more or less expected.
>>
>> If you think, the simple adding of the wq_has_sleeper() is more
>> justifiable, please apply only this change.
>> Later today, I'll also send you a patch containing only this simple change.
>>
>> Best regards,
>> Florian
>>
>> On Tue, Mar 26, 2024 at 3:09 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>>>
>>> Hi,
>>>
>>> 在 2024/03/26 13:22, Jinpu Wang 写道:
>>>> Hi Song, hi Kuai,
>>>>
>>>> ping, Any comments?
>>>>
>>>> Thx!
>>>>
>>>> On Thu, Mar 7, 2024 at 1:08 PM Jack Wang <jinpu.wang@ionos.com> wrote:
>>>>>
>>>>> From: Florian-Ewald Mueller <florian-ewald.mueller@ionos.com>
>>>>>
>>>>> Replace md_thread's wait_event()/wake_up() related calls with their
>>>>> simple swait~ variants to improve performance as well as memory and
>>>>> stack footprint.
>>>>>
>>>>> Use the IDLE state for the worker thread put to sleep instead of
>>>>> misusing the INTERRUPTIBLE state combined with flushing signals
>>>>> just for not contributing to system's cpu load-average stats.
>>>>>
>>>>> Also check for sleeping thread before attempting its wake_up in
>>>>> md_wakeup_thread() for avoiding unnecessary spinlock contention.
>>>
>>> I think it'll be better to split this into a seperate patch.
>>> And can you check if we just add wq_has_sleeper(), will there be
>>> performance improvement?
>>>
>>>>>
>>>>> With this patch (backported) on a kernel 6.1, the IOPS improved
>>>>> by around 4% with raid1 and/or raid5, in high IO load scenarios.
>>>
>>> Can you be more specifical about your test? because from what I know,
>>> IO fast path doesn't involved with daemon thread, and I don't understand
>>> yet where the 4% improvement is from.
>>>
>>> Thanks,
>>> Kuai
>>>
diff mbox series

Patch

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 48ae2b1cb57a..14d12ee4b06b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7970,22 +7970,12 @@  static int md_thread(void *arg)
 	 * many dirty RAID5 blocks.
 	 */
 
-	allow_signal(SIGKILL);
 	while (!kthread_should_stop()) {
 
-		/* We need to wait INTERRUPTIBLE so that
-		 * we don't add to the load-average.
-		 * That means we need to be sure no signals are
-		 * pending
-		 */
-		if (signal_pending(current))
-			flush_signals(current);
-
-		wait_event_interruptible_timeout
-			(thread->wqueue,
-			 test_bit(THREAD_WAKEUP, &thread->flags)
-			 || kthread_should_stop() || kthread_should_park(),
-			 thread->timeout);
+		swait_event_idle_timeout_exclusive(thread->wqueue,
+			test_bit(THREAD_WAKEUP, &thread->flags) ||
+			kthread_should_stop() || kthread_should_park(),
+			thread->timeout);
 
 		clear_bit(THREAD_WAKEUP, &thread->flags);
 		if (kthread_should_park())
@@ -8017,7 +8007,8 @@  void md_wakeup_thread(struct md_thread __rcu *thread)
 	if (t) {
 		pr_debug("md: waking up MD thread %s.\n", t->tsk->comm);
 		set_bit(THREAD_WAKEUP, &t->flags);
-		wake_up(&t->wqueue);
+		if (swq_has_sleeper(&t->wqueue))
+			swake_up_one(&t->wqueue);
 	}
 	rcu_read_unlock();
 }
@@ -8032,7 +8023,7 @@  struct md_thread *md_register_thread(void (*run) (struct md_thread *),
 	if (!thread)
 		return NULL;
 
-	init_waitqueue_head(&thread->wqueue);
+	init_swait_queue_head(&thread->wqueue);
 
 	thread->run = run;
 	thread->mddev = mddev;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index b2076a165c10..1dc01bc5c038 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -716,7 +716,7 @@  static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
 struct md_thread {
 	void			(*run) (struct md_thread *thread);
 	struct mddev		*mddev;
-	wait_queue_head_t	wqueue;
+	struct swait_queue_head	wqueue;
 	unsigned long		flags;
 	struct task_struct	*tsk;
 	unsigned long		timeout;