diff mbox series

[Resend,v3] psi: fix possible trigger missing in the window

Message ID 1642649516-15076-1-git-send-email-huangzhaoyang@gmail.com (mailing list archive)
State New
Headers show
Series [Resend,v3] psi: fix possible trigger missing in the window | expand

Commit Message

Zhaoyang Huang Jan. 20, 2022, 3:31 a.m. UTC
From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>

When a new threshold-breaching stall happens after a psi event was
generated and within the window duration, the new event is not
generated because events are rate-limited to one per window. If
no new stall is recorded after that, the event will not be
generated even after the rate-limiting duration has passed. This
happens because, with no new stall, window_update() will not be called
even though the threshold was previously breached. To fix this, record
the threshold breach and generate the event once the window
duration has passed.

Suggested-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
---
v2: modify the logic according to Suren's suggestion
v3: update commit message
---
---
 include/linux/psi_types.h |  2 ++
 kernel/sched/psi.c        | 38 +++++++++++++++++++++++---------------
 2 files changed, 25 insertions(+), 15 deletions(-)

Comments

Zhaoyang Huang Jan. 22, 2022, 9:50 a.m. UTC | #1
Sorry for broadcasting here. Is anyone aware of Johannes's and Peter's
status? The following patch has been waiting for their comments for a
long time. Have they been available recently? Thanks.

On Thu, Jan 20, 2022 at 11:32 AM Huangzhaoyang <huangzhaoyang@gmail.com> wrote:
>
> From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
>
> When a new threshold breaching stall happens after a psi event was
> generated and within the window duration, the new event is not
> generated because the events are rate-limited to one per window. If
> after that no new stall is recorded then the event will not be
> generated even after rate-limiting duration has passed. This is
> happening because with no new stall, window_update will not be called
> even though threshold was previously breached. To fix this, record
> threshold breaching occurrence and generate the event once window
> duration is passed.
>
> Suggested-by: Suren Baghdasaryan <surenb@google.com>
> Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> ---
> v2: modify the logic according to Suren's suggestion
> v3: update commit message
> ---
> ---
>  include/linux/psi_types.h |  2 ++
>  kernel/sched/psi.c        | 38 +++++++++++++++++++++++---------------
>  2 files changed, 25 insertions(+), 15 deletions(-)
>
> diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
> index 0a23300..87b694a 100644
> --- a/include/linux/psi_types.h
> +++ b/include/linux/psi_types.h
> @@ -132,6 +132,8 @@ struct psi_trigger {
>
>         /* Refcounting to prevent premature destruction */
>         struct kref refcount;
> +
> +       bool threshold_breach;
>  };
>
>  struct psi_group {
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index 1652f2b..5c67ab9 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -524,24 +524,29 @@ static u64 update_triggers(struct psi_group *group, u64 now)
>          */
>         list_for_each_entry(t, &group->triggers, node) {
>                 u64 growth;
> +               bool trigger_stalled =
> +                       group->polling_total[t->state] != total[t->state];
>
> -               /* Check for stall activity */
> -               if (group->polling_total[t->state] == total[t->state])
> -                       continue;
> -
> -               /*
> -                * Multiple triggers might be looking at the same state,
> -                * remember to update group->polling_total[] once we've
> -                * been through all of them. Also remember to extend the
> -                * polling time if we see new stall activity.
> -                */
> -               new_stall = true;
> -
> -               /* Calculate growth since last update */
> -               growth = window_update(&t->win, now, total[t->state]);
> -               if (growth < t->threshold)
> +               /* Check for stall activity or a previous threshold breach */
> +               if (!trigger_stalled && !t->threshold_breach)
>                         continue;
>
> +               if (trigger_stalled) {
> +                       /*
> +                        * Multiple triggers might be looking at the same state,
> +                        * remember to update group->polling_total[] once we've
> +                        * been through all of them. Also remember to extend the
> +                        * polling time if we see new stall activity.
> +                        */
> +                       new_stall = true;
> +
> +                       /* Calculate growth since last update */
> +                       growth = window_update(&t->win, now, total[t->state]);
> +                       if (growth < t->threshold)
> +                               continue;
> +
> +                       t->threshold_breach = true;
> +               }
>                 /* Limit event signaling to once per window */
>                 if (now < t->last_event_time + t->win.size)
>                         continue;
> @@ -550,6 +555,8 @@ static u64 update_triggers(struct psi_group *group, u64 now)
>                 if (cmpxchg(&t->event, 0, 1) == 0)
>                         wake_up_interruptible(&t->event_wait);
>                 t->last_event_time = now;
> +               /* Reset threshold breach flag once event got generated */
> +               t->threshold_breach = false;
>         }
>
>         if (new_stall)
> @@ -1152,6 +1159,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
>         t->last_event_time = 0;
>         init_waitqueue_head(&t->event_wait);
>         kref_init(&t->refcount);
> +       t->threshold_breach = false;
>
>         mutex_lock(&group->trigger_lock);
>
> --
> 1.9.1
>
Johannes Weiner Jan. 24, 2022, 3:53 p.m. UTC | #2
On Thu, Jan 20, 2022 at 11:31:56AM +0800, Huangzhaoyang wrote:
> From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> 
> When a new threshold breaching stall happens after a psi event was
> generated and within the window duration, the new event is not
> generated because the events are rate-limited to one per window. If
> after that no new stall is recorded then the event will not be
> generated even after rate-limiting duration has passed. This is
> happening because with no new stall, window_update will not be called
> even though threshold was previously breached. To fix this, record
> threshold breaching occurrence and generate the event once window
> duration is passed.
> 
> Suggested-by: Suren Baghdasaryan <surenb@google.com>
> Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>

Good catch. The change makes sense to me.

However, I had to re-read the discussion to understand *why*
triggering once per window can be a practical problem. Could you
please include the lmkd scenario you mentioned?

Suren, even though it's your suggested code, can you please also add
ack/review tags? Thanks!

Some minor inline comments:

> diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
> index 0a23300..87b694a 100644
> --- a/include/linux/psi_types.h
> +++ b/include/linux/psi_types.h
> @@ -132,6 +132,8 @@ struct psi_trigger {
>  
>  	/* Refcounting to prevent premature destruction */
>  	struct kref refcount;
> +
> +	bool threshold_breach;

Something like bool pending_event would be more descriptive, IMO.

Also please remember to add a short comment like we have for the other
struct members. For example:

	/* Deferred event(s) from previous ratelimit window */

> @@ -524,24 +524,29 @@ static u64 update_triggers(struct psi_group *group, u64 now)
>  	 */
>  	list_for_each_entry(t, &group->triggers, node) {
>  		u64 growth;
> +		bool trigger_stalled =
> +			group->polling_total[t->state] != total[t->state];

Triggers don't stall, they trigger on stalls. How about this:

		bool new_stall;
		u64 growth;

		new_stall = group->polling_total[t->state] != total[t->state];

(order local declarations by length, avoid line wraps where possible)

> -		/* Check for stall activity */
> -		if (group->polling_total[t->state] == total[t->state])
> -			continue;
> -
> -		/*
> -		 * Multiple triggers might be looking at the same state,
> -		 * remember to update group->polling_total[] once we've
> -		 * been through all of them. Also remember to extend the
> -		 * polling time if we see new stall activity.
> -		 */
> -		new_stall = true;
> -
> -		/* Calculate growth since last update */
> -		growth = window_update(&t->win, now, total[t->state]);
> -		if (growth < t->threshold)
> +		/* Check for stall activity or a previous threshold breach */
> +		if (!trigger_stalled && !t->threshold_breach)
>  			continue;

This could use a bit more explanation imo:

		/*
		 * Check for new stall activity, as well as deferred
		 * events that occurred in the last window after the
		 * trigger had already fired (we want to ratelimit
		 * events without dropping any).
		 */
		if (!new_stall && !t->pending_event)
			continue;

> +		if (trigger_stalled) {
> +			/*
> +			 * Multiple triggers might be looking at the same state,
> +			 * remember to update group->polling_total[] once we've
> +			 * been through all of them. Also remember to extend the
> +			 * polling time if we see new stall activity.
> +			 */
> +			new_stall = true;

and then rename this flag `update_total'.

> +			/* Calculate growth since last update */
> +			growth = window_update(&t->win, now, total[t->state]);
> +			if (growth < t->threshold)
> +				continue;
> +
> +			t->threshold_breach = true;
> +		}
>  		/* Limit event signaling to once per window */
>  		if (now < t->last_event_time + t->win.size)
>  			continue;
> @@ -550,6 +555,8 @@ static u64 update_triggers(struct psi_group *group, u64 now)
>  		if (cmpxchg(&t->event, 0, 1) == 0)
>  			wake_up_interruptible(&t->event_wait);
>  		t->last_event_time = now;
> +		/* Reset threshold breach flag once event got generated */
> +		t->threshold_breach = false;
>  	}
>  
>  	if (new_stall)
> @@ -1152,6 +1159,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
>  	t->last_event_time = 0;
>  	init_waitqueue_head(&t->event_wait);
>  	kref_init(&t->refcount);
> +	t->threshold_breach = false;
>  
>  	mutex_lock(&group->trigger_lock);

Thanks!
Suren Baghdasaryan Jan. 24, 2022, 4:37 p.m. UTC | #3
On Mon, Jan 24, 2022 at 7:53 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Thu, Jan 20, 2022 at 11:31:56AM +0800, Huangzhaoyang wrote:
> > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> >
> > When a new threshold breaching stall happens after a psi event was
> > generated and within the window duration, the new event is not
> > generated because the events are rate-limited to one per window. If
> > after that no new stall is recorded then the event will not be
> > generated even after rate-limiting duration has passed. This is
> > happening because with no new stall, window_update will not be called
> > even though threshold was previously breached. To fix this, record
> > threshold breaching occurrence and generate the event once window
> > duration is passed.
> >
> > Suggested-by: Suren Baghdasaryan <surenb@google.com>
> > Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
>
> Good catch. The change makes sense to me.
>
> However, I had to re-read the discussion to understand *why*
> triggering once per window can be a practical problem. Could you
> please include the lmkd scenario you mentioned?
>
> Suren, even though it's your suggested code, can you please also add
> ack/review tags? Thanks!

Will do as soon as the new version is posted and your comments are
addressed. Thanks!

>
> Some minor inline comments:
>
> > diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
> > index 0a23300..87b694a 100644
> > --- a/include/linux/psi_types.h
> > +++ b/include/linux/psi_types.h
> > @@ -132,6 +132,8 @@ struct psi_trigger {
> >
> >       /* Refcounting to prevent premature destruction */
> >       struct kref refcount;
> > +
> > +     bool threshold_breach;
>
> Something like bool pending_event would be more descriptive, IMO.
>
> Also please remember to add a short comment like we have for the other
> struct members. For example:
>
>         /* Deferred event(s) from previous ratelimit window */
>
> > @@ -524,24 +524,29 @@ static u64 update_triggers(struct psi_group *group, u64 now)
> >        */
> >       list_for_each_entry(t, &group->triggers, node) {
> >               u64 growth;
> > +             bool trigger_stalled =
> > +                     group->polling_total[t->state] != total[t->state];
>
> Triggers don't stall, they trigger on stalls. How about this:
>
>                 bool new_stall;
>                 u64 growth;
>
>                 new_stall = group->polling_total[t->state] != total[t->state];
>
> (order local declarations by length, avoid line wraps where possible)
>
> > -             /* Check for stall activity */
> > -             if (group->polling_total[t->state] == total[t->state])
> > -                     continue;
> > -
> > -             /*
> > -              * Multiple triggers might be looking at the same state,
> > -              * remember to update group->polling_total[] once we've
> > -              * been through all of them. Also remember to extend the
> > -              * polling time if we see new stall activity.
> > -              */
> > -             new_stall = true;
> > -
> > -             /* Calculate growth since last update */
> > -             growth = window_update(&t->win, now, total[t->state]);
> > -             if (growth < t->threshold)
> > +             /* Check for stall activity or a previous threshold breach */
> > +             if (!trigger_stalled && !t->threshold_breach)
> >                       continue;
>
> This could use a bit more explanation imo:
>
>                 /*
>                  * Check for new stall activity, as well as deferred
>                  * events that occurred in the last window after the
>                  * trigger had already fired (we want to ratelimit
>                  * events without dropping any).
>                  */
>                 if (!new_stall && !t->pending_event)
>                         continue;
>
> > +             if (trigger_stalled) {
> > +                     /*
> > +                      * Multiple triggers might be looking at the same state,
> > +                      * remember to update group->polling_total[] once we've
> > +                      * been through all of them. Also remember to extend the
> > +                      * polling time if we see new stall activity.
> > +                      */
> > +                     new_stall = true;
>
> and then rename this flag `update_total'.
>
> > +                     /* Calculate growth since last update */
> > +                     growth = window_update(&t->win, now, total[t->state]);
> > +                     if (growth < t->threshold)
> > +                             continue;
> > +
> > +                     t->threshold_breach = true;
> > +             }
> >               /* Limit event signaling to once per window */
> >               if (now < t->last_event_time + t->win.size)
> >                       continue;
> > @@ -550,6 +555,8 @@ static u64 update_triggers(struct psi_group *group, u64 now)
> >               if (cmpxchg(&t->event, 0, 1) == 0)
> >                       wake_up_interruptible(&t->event_wait);
> >               t->last_event_time = now;
> > +             /* Reset threshold breach flag once event got generated */
> > +             t->threshold_breach = false;
> >       }
> >
> >       if (new_stall)
> > @@ -1152,6 +1159,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
> >       t->last_event_time = 0;
> >       init_waitqueue_head(&t->event_wait);
> >       kref_init(&t->refcount);
> > +     t->threshold_breach = false;
> >
> >       mutex_lock(&group->trigger_lock);
>
> Thanks!
diff mbox series

Patch

diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 0a23300..87b694a 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -132,6 +132,8 @@  struct psi_trigger {
 
 	/* Refcounting to prevent premature destruction */
 	struct kref refcount;
+
+	bool threshold_breach;
 };
 
 struct psi_group {
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 1652f2b..5c67ab9 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -524,24 +524,29 @@  static u64 update_triggers(struct psi_group *group, u64 now)
 	 */
 	list_for_each_entry(t, &group->triggers, node) {
 		u64 growth;
+		bool trigger_stalled =
+			group->polling_total[t->state] != total[t->state];
 
-		/* Check for stall activity */
-		if (group->polling_total[t->state] == total[t->state])
-			continue;
-
-		/*
-		 * Multiple triggers might be looking at the same state,
-		 * remember to update group->polling_total[] once we've
-		 * been through all of them. Also remember to extend the
-		 * polling time if we see new stall activity.
-		 */
-		new_stall = true;
-
-		/* Calculate growth since last update */
-		growth = window_update(&t->win, now, total[t->state]);
-		if (growth < t->threshold)
+		/* Check for stall activity or a previous threshold breach */
+		if (!trigger_stalled && !t->threshold_breach)
 			continue;
 
+		if (trigger_stalled) {
+			/*
+			 * Multiple triggers might be looking at the same state,
+			 * remember to update group->polling_total[] once we've
+			 * been through all of them. Also remember to extend the
+			 * polling time if we see new stall activity.
+			 */
+			new_stall = true;
+
+			/* Calculate growth since last update */
+			growth = window_update(&t->win, now, total[t->state]);
+			if (growth < t->threshold)
+				continue;
+
+			t->threshold_breach = true;
+		}
 		/* Limit event signaling to once per window */
 		if (now < t->last_event_time + t->win.size)
 			continue;
@@ -550,6 +555,8 @@  static u64 update_triggers(struct psi_group *group, u64 now)
 		if (cmpxchg(&t->event, 0, 1) == 0)
 			wake_up_interruptible(&t->event_wait);
 		t->last_event_time = now;
+		/* Reset threshold breach flag once event got generated */
+		t->threshold_breach = false;
 	}
 
 	if (new_stall)
@@ -1152,6 +1159,7 @@  struct psi_trigger *psi_trigger_create(struct psi_group *group,
 	t->last_event_time = 0;
 	init_waitqueue_head(&t->event_wait);
 	kref_init(&t->refcount);
+	t->threshold_breach = false;
 
 	mutex_lock(&group->trigger_lock);