diff mbox series

psi: fix possible trigger missing in the window

Message ID 1639721264-12294-1-git-send-email-huangzhaoyang@gmail.com (mailing list archive)
State New
Headers show
Series psi: fix possible trigger missing in the window | expand

Commit Message

Zhaoyang Huang Dec. 17, 2021, 6:07 a.m. UTC
From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>

There could be missing wake up if the rest of the window remain the
same stall states as the polling_total updates for every polling_min_period.

Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
---
 include/linux/psi_types.h |  2 ++
 kernel/sched/psi.c        | 30 ++++++++++++++++++------------
 2 files changed, 20 insertions(+), 12 deletions(-)

Comments

Zhaoyang Huang Dec. 18, 2021, 6:03 a.m. UTC | #1
loop Suren

On Fri, Dec 17, 2021 at 2:08 PM Huangzhaoyang <huangzhaoyang@gmail.com> wrote:
>
> From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
>
> There could be missing wake up if the rest of the window remain the
> same stall states as the polling_total updates for every polling_min_period.
>
> Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> ---
>  include/linux/psi_types.h |  2 ++
>  kernel/sched/psi.c        | 30 ++++++++++++++++++------------
>  2 files changed, 20 insertions(+), 12 deletions(-)
>
> diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
> index 0a23300..9533d2e 100644
> --- a/include/linux/psi_types.h
> +++ b/include/linux/psi_types.h
> @@ -132,6 +132,8 @@ struct psi_trigger {
>
>         /* Refcounting to prevent premature destruction */
>         struct kref refcount;
> +
> +       bool new_stall;
>  };
>
>  struct psi_group {
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index 1652f2b..402718c 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -458,9 +458,12 @@ static void psi_avgs_work(struct work_struct *work)
>  static void window_reset(struct psi_window *win, u64 now, u64 value,
>                          u64 prev_growth)
>  {
> +       struct psi_trigger *t = container_of(win, struct psi_trigger, win);
> +
>         win->start_time = now;
>         win->start_value = value;
>         win->prev_growth = prev_growth;
> +       t->new_stall = false;
>  }
>
>  /*
> @@ -515,7 +518,6 @@ static void init_triggers(struct psi_group *group, u64 now)
>  static u64 update_triggers(struct psi_group *group, u64 now)
>  {
>         struct psi_trigger *t;
> -       bool new_stall = false;
>         u64 *total = group->total[PSI_POLL];
>
>         /*
> @@ -523,19 +525,26 @@ static u64 update_triggers(struct psi_group *group, u64 now)
>          * watchers know when their specified thresholds are exceeded.
>          */
>         list_for_each_entry(t, &group->triggers, node) {
> -               u64 growth;
> -
>                 /* Check for stall activity */
>                 if (group->polling_total[t->state] == total[t->state])
>                         continue;
>
>                 /*
> -                * Multiple triggers might be looking at the same state,
> -                * remember to update group->polling_total[] once we've
> -                * been through all of them. Also remember to extend the
> -                * polling time if we see new stall activity.
> +                * update the trigger if there is new stall which will be
> +                * reset when run out of the window
>                  */
> -               new_stall = true;
> +               t->new_stall = true;
> +
> +               memcpy(&group->polling_total[t->state], &total[t->state],
> +                               sizeof(group->polling_total[t->state]));
> +       }
> +
> +       list_for_each_entry(t, &group->triggers, node) {
> +               u64 growth;
> +
> +               /* check if new stall happened during this window*/
> +               if (!t->new_stall)
> +                       continue;
>
>                 /* Calculate growth since last update */
>                 growth = window_update(&t->win, now, total[t->state]);
> @@ -552,10 +561,6 @@ static u64 update_triggers(struct psi_group *group, u64 now)
>                 t->last_event_time = now;
>         }
>
> -       if (new_stall)
> -               memcpy(group->polling_total, total,
> -                               sizeof(group->polling_total));
> -
>         return now + group->poll_min_period;
>  }
>
> @@ -1152,6 +1157,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
>         t->last_event_time = 0;
>         init_waitqueue_head(&t->event_wait);
>         kref_init(&t->refcount);
> +       t->new_stall = false;
>
>         mutex_lock(&group->trigger_lock);
>
> --
> 1.9.1
>
Suren Baghdasaryan Dec. 20, 2021, 7:58 p.m. UTC | #2
On Fri, Dec 17, 2021 at 10:03 PM Zhaoyang Huang <huangzhaoyang@gmail.com> wrote:
>
> loop Suren

Thanks.


>
> On Fri, Dec 17, 2021 at 2:08 PM Huangzhaoyang <huangzhaoyang@gmail.com> wrote:
> >
> > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> >
> > There could be missing wake up if the rest of the window remain the
> > same stall states as the polling_total updates for every polling_min_period.

Could you please expand on this description? I'm unclear what the
problem is. I assume "polling_min_period" in this description refers
to the group->poll_min_period.

From the code, looks like the change results in update_triggers()
calling window_update() once there was a new stall recorded for the
trigger state and until the tracking window is complete. I don't see
the point of calling window_update() if there was no stall change
since the last call to window_update(). The resulting growth will not
increase if there is no new stall.
Maybe what you want to achieve here is more than one trigger per
window if the stall limit was breached? If so, then this goes against
the design for psi triggers in which we want to rate-limit the number
of generated triggers per tracking window (see:
https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L545).
Please clarify the issue and the intentions here.
Thanks!

> >
> > Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > ---
> >  include/linux/psi_types.h |  2 ++
> >  kernel/sched/psi.c        | 30 ++++++++++++++++++------------
> >  2 files changed, 20 insertions(+), 12 deletions(-)
> >
> > diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
> > index 0a23300..9533d2e 100644
> > --- a/include/linux/psi_types.h
> > +++ b/include/linux/psi_types.h
> > @@ -132,6 +132,8 @@ struct psi_trigger {
> >
> >         /* Refcounting to prevent premature destruction */
> >         struct kref refcount;
> > +
> > +       bool new_stall;
> >  };
> >
> >  struct psi_group {
> > diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> > index 1652f2b..402718c 100644
> > --- a/kernel/sched/psi.c
> > +++ b/kernel/sched/psi.c
> > @@ -458,9 +458,12 @@ static void psi_avgs_work(struct work_struct *work)
> >  static void window_reset(struct psi_window *win, u64 now, u64 value,
> >                          u64 prev_growth)
> >  {
> > +       struct psi_trigger *t = container_of(win, struct psi_trigger, win);
> > +
> >         win->start_time = now;
> >         win->start_value = value;
> >         win->prev_growth = prev_growth;
> > +       t->new_stall = false;
> >  }
> >
> >  /*
> > @@ -515,7 +518,6 @@ static void init_triggers(struct psi_group *group, u64 now)
> >  static u64 update_triggers(struct psi_group *group, u64 now)
> >  {
> >         struct psi_trigger *t;
> > -       bool new_stall = false;
> >         u64 *total = group->total[PSI_POLL];
> >
> >         /*
> > @@ -523,19 +525,26 @@ static u64 update_triggers(struct psi_group *group, u64 now)
> >          * watchers know when their specified thresholds are exceeded.
> >          */
> >         list_for_each_entry(t, &group->triggers, node) {
> > -               u64 growth;
> > -
> >                 /* Check for stall activity */
> >                 if (group->polling_total[t->state] == total[t->state])
> >                         continue;
> >
> >                 /*
> > -                * Multiple triggers might be looking at the same state,
> > -                * remember to update group->polling_total[] once we've
> > -                * been through all of them. Also remember to extend the
> > -                * polling time if we see new stall activity.
> > +                * update the trigger if there is new stall which will be
> > +                * reset when run out of the window
> >                  */
> > -               new_stall = true;
> > +               t->new_stall = true;
> > +
> > +               memcpy(&group->polling_total[t->state], &total[t->state],
> > +                               sizeof(group->polling_total[t->state]));
> > +       }
> > +
> > +       list_for_each_entry(t, &group->triggers, node) {
> > +               u64 growth;
> > +
> > +               /* check if new stall happened during this window*/
> > +               if (!t->new_stall)
> > +                       continue;
> >
> >                 /* Calculate growth since last update */
> >                 growth = window_update(&t->win, now, total[t->state]);
> > @@ -552,10 +561,6 @@ static u64 update_triggers(struct psi_group *group, u64 now)
> >                 t->last_event_time = now;
> >         }
> >
> > -       if (new_stall)
> > -               memcpy(group->polling_total, total,
> > -                               sizeof(group->polling_total));
> > -
> >         return now + group->poll_min_period;
> >  }
> >
> > @@ -1152,6 +1157,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
> >         t->last_event_time = 0;
> >         init_waitqueue_head(&t->event_wait);
> >         kref_init(&t->refcount);
> > +       t->new_stall = false;
> >
> >         mutex_lock(&group->trigger_lock);
> >
> > --
> > 1.9.1
> >
Zhaoyang Huang Dec. 21, 2021, 1:56 a.m. UTC | #3
On Tue, Dec 21, 2021 at 3:58 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Fri, Dec 17, 2021 at 10:03 PM Zhaoyang Huang <huangzhaoyang@gmail.com> wrote:
> >
> > loop Suren
>
> Thanks.
>
>
> >
> > On Fri, Dec 17, 2021 at 2:08 PM Huangzhaoyang <huangzhaoyang@gmail.com> wrote:
> > >
> > > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > >
> > > There could be missing wake up if the rest of the window remain the
> > > same stall states as the polling_total updates for every polling_min_period.
>
> Could you please expand on this description? I'm unclear what the
> problem is. I assume "polling_min_period" in this description refers
> to the group->poll_min_period.
>
> From the code, looks like the change results in update_triggers()
> calling window_update() once there was a new stall recorded for the
> trigger state and until the tracking window is complete. I don't see
> the point of calling window_update() if there was no stall change
> since the last call to window_update(). The resulting growth will not
> increase if there is no new stall.
> Maybe what you want to achieve here is more than one trigger per
> window if the stall limit was breached? If so, then this goes against
> the design for psi triggers in which we want to rate-limit the number
> of generated triggers per tracking window (see:
> https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L545).
> Please clarify the issue and the intentions here.
> Thanks!
Please correct me if I am wrong. Imagine that there is a new stall
during the 1st polling_min_period among 10 of them in the window and
group->polling_total will be updated to total without trigger. If the
rest of 9 polling_min_periods remain the same states, the trigger will
be missed when window timing is reached.
>
> > >
> > > Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > > ---
> > >  include/linux/psi_types.h |  2 ++
> > >  kernel/sched/psi.c        | 30 ++++++++++++++++++------------
> > >  2 files changed, 20 insertions(+), 12 deletions(-)
> > >
> > > diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
> > > index 0a23300..9533d2e 100644
> > > --- a/include/linux/psi_types.h
> > > +++ b/include/linux/psi_types.h
> > > @@ -132,6 +132,8 @@ struct psi_trigger {
> > >
> > >         /* Refcounting to prevent premature destruction */
> > >         struct kref refcount;
> > > +
> > > +       bool new_stall;
> > >  };
> > >
> > >  struct psi_group {
> > > diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> > > index 1652f2b..402718c 100644
> > > --- a/kernel/sched/psi.c
> > > +++ b/kernel/sched/psi.c
> > > @@ -458,9 +458,12 @@ static void psi_avgs_work(struct work_struct *work)
> > >  static void window_reset(struct psi_window *win, u64 now, u64 value,
> > >                          u64 prev_growth)
> > >  {
> > > +       struct psi_trigger *t = container_of(win, struct psi_trigger, win);
> > > +
> > >         win->start_time = now;
> > >         win->start_value = value;
> > >         win->prev_growth = prev_growth;
> > > +       t->new_stall = false;
> > >  }
> > >
> > >  /*
> > > @@ -515,7 +518,6 @@ static void init_triggers(struct psi_group *group, u64 now)
> > >  static u64 update_triggers(struct psi_group *group, u64 now)
> > >  {
> > >         struct psi_trigger *t;
> > > -       bool new_stall = false;
> > >         u64 *total = group->total[PSI_POLL];
> > >
> > >         /*
> > > @@ -523,19 +525,26 @@ static u64 update_triggers(struct psi_group *group, u64 now)
> > >          * watchers know when their specified thresholds are exceeded.
> > >          */
> > >         list_for_each_entry(t, &group->triggers, node) {
> > > -               u64 growth;
> > > -
> > >                 /* Check for stall activity */
> > >                 if (group->polling_total[t->state] == total[t->state])
> > >                         continue;
> > >
> > >                 /*
> > > -                * Multiple triggers might be looking at the same state,
> > > -                * remember to update group->polling_total[] once we've
> > > -                * been through all of them. Also remember to extend the
> > > -                * polling time if we see new stall activity.
> > > +                * update the trigger if there is new stall which will be
> > > +                * reset when run out of the window
> > >                  */
> > > -               new_stall = true;
> > > +               t->new_stall = true;
> > > +
> > > +               memcpy(&group->polling_total[t->state], &total[t->state],
> > > +                               sizeof(group->polling_total[t->state]));
> > > +       }
> > > +
> > > +       list_for_each_entry(t, &group->triggers, node) {
> > > +               u64 growth;
> > > +
> > > +               /* check if new stall happened during this window*/
> > > +               if (!t->new_stall)
> > > +                       continue;
> > >
> > >                 /* Calculate growth since last update */
> > >                 growth = window_update(&t->win, now, total[t->state]);
> > > @@ -552,10 +561,6 @@ static u64 update_triggers(struct psi_group *group, u64 now)
> > >                 t->last_event_time = now;
> > >         }
> > >
> > > -       if (new_stall)
> > > -               memcpy(group->polling_total, total,
> > > -                               sizeof(group->polling_total));
> > > -
> > >         return now + group->poll_min_period;
> > >  }
> > >
> > > @@ -1152,6 +1157,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
> > >         t->last_event_time = 0;
> > >         init_waitqueue_head(&t->event_wait);
> > >         kref_init(&t->refcount);
> > > +       t->new_stall = false;
> > >
> > >         mutex_lock(&group->trigger_lock);
> > >
> > > --
> > > 1.9.1
> > >
Suren Baghdasaryan Dec. 21, 2021, 2:30 a.m. UTC | #4
On Mon, Dec 20, 2021 at 5:57 PM Zhaoyang Huang <huangzhaoyang@gmail.com> wrote:
>
> On Tue, Dec 21, 2021 at 3:58 AM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > On Fri, Dec 17, 2021 at 10:03 PM Zhaoyang Huang <huangzhaoyang@gmail.com> wrote:
> > >
> > > loop Suren
> >
> > Thanks.
> >
> >
> > >
> > > On Fri, Dec 17, 2021 at 2:08 PM Huangzhaoyang <huangzhaoyang@gmail.com> wrote:
> > > >
> > > > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > > >
> > > > There could be missing wake up if the rest of the window remain the
> > > > same stall states as the polling_total updates for every polling_min_period.
> >
> > Could you please expand on this description? I'm unclear what the
> > problem is. I assume "polling_min_period" in this description refers
> > to the group->poll_min_period.
> >
> > From the code, looks like the change results in update_triggers()
> > calling window_update() once there was a new stall recorded for the
> > trigger state and until the tracking window is complete. I don't see
> > the point of calling window_update() if there was no stall change
> > since the last call to window_update(). The resulting growth will not
> > increase if there is no new stall.
> > Maybe what you want to achieve here is more than one trigger per
> > window if the stall limit was breached? If so, then this goes against
> > the design for psi triggers in which we want to rate-limit the number
> > of generated triggers per tracking window (see:
> > https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L545).
> > Please clarify the issue and the intentions here.
> > Thanks!
> Please correct me if I am wrong. Imagine that there is a new stall
> during the 1st polling_min_period among 10 of them in the window and
> group->polling_total will be updated to total without trigger. If the
> rest of 9 polling_min_periods remain the same states, the trigger will
> be missed when window timing is reached.

I don't see why updating group->polling_total after the first
registered stall is an issue here. window_update() calculates growth
using current group->total[] and win->start_value (see:
https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L483)
which is set at the beginning of the window (see:
https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L462).
If the calculated growth did not reach t->threshold then the trigger
should not be fired (see:
https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L542).
We fire the trigger only if growth within a given window is higher
than the threshold.

In your scenario if the stall recorded in the 1st polling_min_period
was less than the threshold and in the other 9 polling_min_periods no
new stalls were registered then there should be no triggers fired in
that window. This is intended behavior. Trigger is fired only when the
recorded stall within the window breaches the threshold. And there
will be only one trigger generated per window, no matter how much
stall is being recorded after the threshold was breached.
Hopefully this clarifies the behavior?

> >
> > > >
> > > > Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > > > ---
> > > >  include/linux/psi_types.h |  2 ++
> > > >  kernel/sched/psi.c        | 30 ++++++++++++++++++------------
> > > >  2 files changed, 20 insertions(+), 12 deletions(-)
> > > >
> > > > diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
> > > > index 0a23300..9533d2e 100644
> > > > --- a/include/linux/psi_types.h
> > > > +++ b/include/linux/psi_types.h
> > > > @@ -132,6 +132,8 @@ struct psi_trigger {
> > > >
> > > >         /* Refcounting to prevent premature destruction */
> > > >         struct kref refcount;
> > > > +
> > > > +       bool new_stall;
> > > >  };
> > > >
> > > >  struct psi_group {
> > > > diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> > > > index 1652f2b..402718c 100644
> > > > --- a/kernel/sched/psi.c
> > > > +++ b/kernel/sched/psi.c
> > > > @@ -458,9 +458,12 @@ static void psi_avgs_work(struct work_struct *work)
> > > >  static void window_reset(struct psi_window *win, u64 now, u64 value,
> > > >                          u64 prev_growth)
> > > >  {
> > > > +       struct psi_trigger *t = container_of(win, struct psi_trigger, win);
> > > > +
> > > >         win->start_time = now;
> > > >         win->start_value = value;
> > > >         win->prev_growth = prev_growth;
> > > > +       t->new_stall = false;
> > > >  }
> > > >
> > > >  /*
> > > > @@ -515,7 +518,6 @@ static void init_triggers(struct psi_group *group, u64 now)
> > > >  static u64 update_triggers(struct psi_group *group, u64 now)
> > > >  {
> > > >         struct psi_trigger *t;
> > > > -       bool new_stall = false;
> > > >         u64 *total = group->total[PSI_POLL];
> > > >
> > > >         /*
> > > > @@ -523,19 +525,26 @@ static u64 update_triggers(struct psi_group *group, u64 now)
> > > >          * watchers know when their specified thresholds are exceeded.
> > > >          */
> > > >         list_for_each_entry(t, &group->triggers, node) {
> > > > -               u64 growth;
> > > > -
> > > >                 /* Check for stall activity */
> > > >                 if (group->polling_total[t->state] == total[t->state])
> > > >                         continue;
> > > >
> > > >                 /*
> > > > -                * Multiple triggers might be looking at the same state,
> > > > -                * remember to update group->polling_total[] once we've
> > > > -                * been through all of them. Also remember to extend the
> > > > -                * polling time if we see new stall activity.
> > > > +                * update the trigger if there is new stall which will be
> > > > +                * reset when run out of the window
> > > >                  */
> > > > -               new_stall = true;
> > > > +               t->new_stall = true;
> > > > +
> > > > +               memcpy(&group->polling_total[t->state], &total[t->state],
> > > > +                               sizeof(group->polling_total[t->state]));
> > > > +       }
> > > > +
> > > > +       list_for_each_entry(t, &group->triggers, node) {
> > > > +               u64 growth;
> > > > +
> > > > +               /* check if new stall happened during this window*/
> > > > +               if (!t->new_stall)
> > > > +                       continue;
> > > >
> > > >                 /* Calculate growth since last update */
> > > >                 growth = window_update(&t->win, now, total[t->state]);
> > > > @@ -552,10 +561,6 @@ static u64 update_triggers(struct psi_group *group, u64 now)
> > > >                 t->last_event_time = now;
> > > >         }
> > > >
> > > > -       if (new_stall)
> > > > -               memcpy(group->polling_total, total,
> > > > -                               sizeof(group->polling_total));
> > > > -
> > > >         return now + group->poll_min_period;
> > > >  }
> > > >
> > > > @@ -1152,6 +1157,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
> > > >         t->last_event_time = 0;
> > > >         init_waitqueue_head(&t->event_wait);
> > > >         kref_init(&t->refcount);
> > > > +       t->new_stall = false;
> > > >
> > > >         mutex_lock(&group->trigger_lock);
> > > >
> > > > --
> > > > 1.9.1
> > > >
Zhaoyang Huang Dec. 21, 2021, 2:51 a.m. UTC | #5
On Tue, Dec 21, 2021 at 10:30 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Mon, Dec 20, 2021 at 5:57 PM Zhaoyang Huang <huangzhaoyang@gmail.com> wrote:
> >
> > On Tue, Dec 21, 2021 at 3:58 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > >
> > > On Fri, Dec 17, 2021 at 10:03 PM Zhaoyang Huang <huangzhaoyang@gmail.com> wrote:
> > > >
> > > > loop Suren
> > >
> > > Thanks.
> > >
> > >
> > > >
> > > > On Fri, Dec 17, 2021 at 2:08 PM Huangzhaoyang <huangzhaoyang@gmail.com> wrote:
> > > > >
> > > > > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > > > >
> > > > > There could be missing wake up if the rest of the window remain the
> > > > > same stall states as the polling_total updates for every polling_min_period.
> > >
> > > Could you please expand on this description? I'm unclear what the
> > > problem is. I assume "polling_min_period" in this description refers
> > > to the group->poll_min_period.
> > >
> > > From the code, looks like the change results in update_triggers()
> > > calling window_update() once there was a new stall recorded for the
> > > trigger state and until the tracking window is complete. I don't see
> > > the point of calling window_update() if there was no stall change
> > > since the last call to window_update(). The resulting growth will not
> > > increase if there is no new stall.
> > > Maybe what you want to achieve here is more than one trigger per
> > > window if the stall limit was breached? If so, then this goes against
> > > the design for psi triggers in which we want to rate-limit the number
> > > of generated triggers per tracking window (see:
> > > https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L545).
> > > Please clarify the issue and the intentions here.
> > > Thanks!
> > Please correct me if I am wrong. Imagine that there is a new stall
> > during the 1st polling_min_period among 10 of them in the window and
> > group->polling_total will be updated to total without trigger. If the
> > rest of 9 polling_min_periods remain the same states, the trigger will
> > be missed when window timing is reached.
>
> I don't see why updating group->polling_total after the first
> registered stall is an issue here. window_update() calculates growth
> using current group->total[] and win->start_value (see:
> https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L483)
> which is set at the beginning of the window (see:
> https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L462).
> If the calculated growth did not reach t->threshold then the trigger
> should not be fired (see:
> https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L542).
> We fire the trigger only if growth within a given window is higher
> than the threshold.
>
> In your scenario if the stall recorded in the 1st polling_min_period
> was less than the threshold and in the other 9 polling_min_periods no
> new stalls were registered then there should be no triggers fired in
> that window. This is intended behavior. Trigger is fired only when the
The stall in the 1st polling_min_period was *LARGE* then the threshold
will also be ignored here.

> recorded stall within the window breaches the threshold. And there
> will be only one trigger generated per window, no matter how much
> stall is being recorded after the threshold was breached.
> Hopefully this clarifies the behavior?

I don't think so. According to your opinion, if the total keeps no
change in the last polling_min_period, then the growth during the 1-9
min_periods which is much larger than the threshold will also be
ignored. It does not make sense.

https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L529
>
> > >
> > > > >
> > > > > Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > > > > ---
> > > > >  include/linux/psi_types.h |  2 ++
> > > > >  kernel/sched/psi.c        | 30 ++++++++++++++++++------------
> > > > >  2 files changed, 20 insertions(+), 12 deletions(-)
> > > > >
> > > > > diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
> > > > > index 0a23300..9533d2e 100644
> > > > > --- a/include/linux/psi_types.h
> > > > > +++ b/include/linux/psi_types.h
> > > > > @@ -132,6 +132,8 @@ struct psi_trigger {
> > > > >
> > > > >         /* Refcounting to prevent premature destruction */
> > > > >         struct kref refcount;
> > > > > +
> > > > > +       bool new_stall;
> > > > >  };
> > > > >
> > > > >  struct psi_group {
> > > > > diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> > > > > index 1652f2b..402718c 100644
> > > > > --- a/kernel/sched/psi.c
> > > > > +++ b/kernel/sched/psi.c
> > > > > @@ -458,9 +458,12 @@ static void psi_avgs_work(struct work_struct *work)
> > > > >  static void window_reset(struct psi_window *win, u64 now, u64 value,
> > > > >                          u64 prev_growth)
> > > > >  {
> > > > > +       struct psi_trigger *t = container_of(win, struct psi_trigger, win);
> > > > > +
> > > > >         win->start_time = now;
> > > > >         win->start_value = value;
> > > > >         win->prev_growth = prev_growth;
> > > > > +       t->new_stall = false;
> > > > >  }
> > > > >
> > > > >  /*
> > > > > @@ -515,7 +518,6 @@ static void init_triggers(struct psi_group *group, u64 now)
> > > > >  static u64 update_triggers(struct psi_group *group, u64 now)
> > > > >  {
> > > > >         struct psi_trigger *t;
> > > > > -       bool new_stall = false;
> > > > >         u64 *total = group->total[PSI_POLL];
> > > > >
> > > > >         /*
> > > > > @@ -523,19 +525,26 @@ static u64 update_triggers(struct psi_group *group, u64 now)
> > > > >          * watchers know when their specified thresholds are exceeded.
> > > > >          */
> > > > >         list_for_each_entry(t, &group->triggers, node) {
> > > > > -               u64 growth;
> > > > > -
> > > > >                 /* Check for stall activity */
> > > > >                 if (group->polling_total[t->state] == total[t->state])
> > > > >                         continue;
> > > > >
> > > > >                 /*
> > > > > -                * Multiple triggers might be looking at the same state,
> > > > > -                * remember to update group->polling_total[] once we've
> > > > > -                * been through all of them. Also remember to extend the
> > > > > -                * polling time if we see new stall activity.
> > > > > +                * update the trigger if there is new stall which will be
> > > > > +                * reset when run out of the window
> > > > >                  */
> > > > > -               new_stall = true;
> > > > > +               t->new_stall = true;
> > > > > +
> > > > > +               memcpy(&group->polling_total[t->state], &total[t->state],
> > > > > +                               sizeof(group->polling_total[t->state]));
> > > > > +       }
> > > > > +
> > > > > +       list_for_each_entry(t, &group->triggers, node) {
> > > > > +               u64 growth;
> > > > > +
> > > > > +               /* check if new stall happened during this window*/
> > > > > +               if (!t->new_stall)
> > > > > +                       continue;
> > > > >
> > > > >                 /* Calculate growth since last update */
> > > > >                 growth = window_update(&t->win, now, total[t->state]);
> > > > > @@ -552,10 +561,6 @@ static u64 update_triggers(struct psi_group *group, u64 now)
> > > > >                 t->last_event_time = now;
> > > > >         }
> > > > >
> > > > > -       if (new_stall)
> > > > > -               memcpy(group->polling_total, total,
> > > > > -                               sizeof(group->polling_total));
> > > > > -
> > > > >         return now + group->poll_min_period;
> > > > >  }
> > > > >
> > > > > @@ -1152,6 +1157,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
> > > > >         t->last_event_time = 0;
> > > > >         init_waitqueue_head(&t->event_wait);
> > > > >         kref_init(&t->refcount);
> > > > > +       t->new_stall = false;
> > > > >
> > > > >         mutex_lock(&group->trigger_lock);
> > > > >
> > > > > --
> > > > > 1.9.1
> > > > >
Suren Baghdasaryan Dec. 21, 2021, 3 a.m. UTC | #6
On Mon, Dec 20, 2021 at 6:51 PM Zhaoyang Huang <huangzhaoyang@gmail.com> wrote:
>
> On Tue, Dec 21, 2021 at 10:30 AM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > On Mon, Dec 20, 2021 at 5:57 PM Zhaoyang Huang <huangzhaoyang@gmail.com> wrote:
> > >
> > > On Tue, Dec 21, 2021 at 3:58 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > > >
> > > > On Fri, Dec 17, 2021 at 10:03 PM Zhaoyang Huang <huangzhaoyang@gmail.com> wrote:
> > > > >
> > > > > loop Suren
> > > >
> > > > Thanks.
> > > >
> > > >
> > > > >
> > > > > On Fri, Dec 17, 2021 at 2:08 PM Huangzhaoyang <huangzhaoyang@gmail.com> wrote:
> > > > > >
> > > > > > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > > > > >
> > > > > > There could be missing wake up if the rest of the window remain the
> > > > > > same stall states as the polling_total updates for every polling_min_period.
> > > >
> > > > Could you please expand on this description? I'm unclear what the
> > > > problem is. I assume "polling_min_period" in this description refers
> > > > to the group->poll_min_period.
> > > >
> > > > From the code, looks like the change results in update_triggers()
> > > > calling window_update() once there was a new stall recorded for the
> > > > trigger state and until the tracking window is complete. I don't see
> > > > the point of calling window_update() if there was no stall change
> > > > since the last call to window_update(). The resulting growth will not
> > > > increase if there is no new stall.
> > > > Maybe what you want to achieve here is more than one trigger per
> > > > window if the stall limit was breached? If so, then this goes against
> > > > the design for psi triggers in which we want to rate-limit the number
> > > > of generated triggers per tracking window (see:
> > > > https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L545).
> > > > Please clarify the issue and the intentions here.
> > > > Thanks!
> > > Please correct me if I am wrong. Imagine that there is a new stall
> > > during the 1st polling_min_period among 10 of them in the window and
> > > group->polling_total will be updated to total without trigger. If the
> > > rest of 9 polling_min_periods remain the same states, the trigger will
> > > be missed when window timing is reached.
> >
> > I don't see why updating group->polling_total after the first
> > registered stall is an issue here. window_update() calculates growth
> > using current group->total[] and win->start_value (see:
> > https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L483)
> > which is set at the beginning of the window (see:
> > https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L462).
> > If the calculated growth did not reach t->threshold then the trigger
> > should not be fired (see:
> > https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L542).
> > We fire the trigger only if growth within a given window is higher
> > than the threshold.
> >
> > In your scenario if the stall recorded in the 1st polling_min_period
> > was less than the threshold and in the other 9 polling_min_periods no
> > new stalls were registered then there should be no triggers fired in
> > that window. This is intended behavior. Trigger is fired only when the
> The stall in the 1st polling_min_period was *LARGE* then the threshold
> will also be ignored here.

Ok, in that case is it ignored due to this condition:
https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L546
?
If so then I see what the problem might be. Please confirm.

>
> > recorded stall within the window breaches the threshold. And there
> > will be only one trigger generated per window, no matter how much
> > stall is being recorded after the threshold was breached.
> > Hopefully this clarifies the behavior?
>
> I don't think so. According to your opinion, if the total keeps no
> change in the last polling_min_period, then the growth during the 1-9
> min_periods which is much larger than the threshold will also be
> ignored. It does not make sense.
>
> https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L529
> >
> > > >
> > > > > >
> > > > > > Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > > > > > ---
> > > > > >  include/linux/psi_types.h |  2 ++
> > > > > >  kernel/sched/psi.c        | 30 ++++++++++++++++++------------
> > > > > >  2 files changed, 20 insertions(+), 12 deletions(-)
> > > > > >
> > > > > > diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
> > > > > > index 0a23300..9533d2e 100644
> > > > > > --- a/include/linux/psi_types.h
> > > > > > +++ b/include/linux/psi_types.h
> > > > > > @@ -132,6 +132,8 @@ struct psi_trigger {
> > > > > >
> > > > > >         /* Refcounting to prevent premature destruction */
> > > > > >         struct kref refcount;
> > > > > > +
> > > > > > +       bool new_stall;
> > > > > >  };
> > > > > >
> > > > > >  struct psi_group {
> > > > > > diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> > > > > > index 1652f2b..402718c 100644
> > > > > > --- a/kernel/sched/psi.c
> > > > > > +++ b/kernel/sched/psi.c
> > > > > > @@ -458,9 +458,12 @@ static void psi_avgs_work(struct work_struct *work)
> > > > > >  static void window_reset(struct psi_window *win, u64 now, u64 value,
> > > > > >                          u64 prev_growth)
> > > > > >  {
> > > > > > +       struct psi_trigger *t = container_of(win, struct psi_trigger, win);
> > > > > > +
> > > > > >         win->start_time = now;
> > > > > >         win->start_value = value;
> > > > > >         win->prev_growth = prev_growth;
> > > > > > +       t->new_stall = false;
> > > > > >  }
> > > > > >
> > > > > >  /*
> > > > > > @@ -515,7 +518,6 @@ static void init_triggers(struct psi_group *group, u64 now)
> > > > > >  static u64 update_triggers(struct psi_group *group, u64 now)
> > > > > >  {
> > > > > >         struct psi_trigger *t;
> > > > > > -       bool new_stall = false;
> > > > > >         u64 *total = group->total[PSI_POLL];
> > > > > >
> > > > > >         /*
> > > > > > @@ -523,19 +525,26 @@ static u64 update_triggers(struct psi_group *group, u64 now)
> > > > > >          * watchers know when their specified thresholds are exceeded.
> > > > > >          */
> > > > > >         list_for_each_entry(t, &group->triggers, node) {
> > > > > > -               u64 growth;
> > > > > > -
> > > > > >                 /* Check for stall activity */
> > > > > >                 if (group->polling_total[t->state] == total[t->state])
> > > > > >                         continue;
> > > > > >
> > > > > >                 /*
> > > > > > -                * Multiple triggers might be looking at the same state,
> > > > > > -                * remember to update group->polling_total[] once we've
> > > > > > -                * been through all of them. Also remember to extend the
> > > > > > -                * polling time if we see new stall activity.
> > > > > > +                * update the trigger if there is new stall which will be
> > > > > > +                * reset when run out of the window
> > > > > >                  */
> > > > > > -               new_stall = true;
> > > > > > +               t->new_stall = true;
> > > > > > +
> > > > > > +               memcpy(&group->polling_total[t->state], &total[t->state],
> > > > > > +                               sizeof(group->polling_total[t->state]));
> > > > > > +       }
> > > > > > +
> > > > > > +       list_for_each_entry(t, &group->triggers, node) {
> > > > > > +               u64 growth;
> > > > > > +
> > > > > > +               /* check if new stall happened during this window*/
> > > > > > +               if (!t->new_stall)
> > > > > > +                       continue;
> > > > > >
> > > > > >                 /* Calculate growth since last update */
> > > > > >                 growth = window_update(&t->win, now, total[t->state]);
> > > > > > @@ -552,10 +561,6 @@ static u64 update_triggers(struct psi_group *group, u64 now)
> > > > > >                 t->last_event_time = now;
> > > > > >         }
> > > > > >
> > > > > > -       if (new_stall)
> > > > > > -               memcpy(group->polling_total, total,
> > > > > > -                               sizeof(group->polling_total));
> > > > > > -
> > > > > >         return now + group->poll_min_period;
> > > > > >  }
> > > > > >
> > > > > > @@ -1152,6 +1157,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
> > > > > >         t->last_event_time = 0;
> > > > > >         init_waitqueue_head(&t->event_wait);
> > > > > >         kref_init(&t->refcount);
> > > > > > +       t->new_stall = false;
> > > > > >
> > > > > >         mutex_lock(&group->trigger_lock);
> > > > > >
> > > > > > --
> > > > > > 1.9.1
> > > > > >
Zhaoyang Huang Dec. 21, 2021, 3:07 a.m. UTC | #7
On Tue, Dec 21, 2021 at 11:00 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Mon, Dec 20, 2021 at 6:51 PM Zhaoyang Huang <huangzhaoyang@gmail.com> wrote:
> >
> > On Tue, Dec 21, 2021 at 10:30 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > >
> > > On Mon, Dec 20, 2021 at 5:57 PM Zhaoyang Huang <huangzhaoyang@gmail.com> wrote:
> > > >
> > > > On Tue, Dec 21, 2021 at 3:58 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > >
> > > > > On Fri, Dec 17, 2021 at 10:03 PM Zhaoyang Huang <huangzhaoyang@gmail.com> wrote:
> > > > > >
> > > > > > loop Suren
> > > > >
> > > > > Thanks.
> > > > >
> > > > >
> > > > > >
> > > > > > On Fri, Dec 17, 2021 at 2:08 PM Huangzhaoyang <huangzhaoyang@gmail.com> wrote:
> > > > > > >
> > > > > > > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > > > > > >
> > > > > > > There could be missing wake up if the rest of the window remain the
> > > > > > > same stall states as the polling_total updates for every polling_min_period.
> > > > >
> > > > > Could you please expand on this description? I'm unclear what the
> > > > > problem is. I assume "polling_min_period" in this description refers
> > > > > to the group->poll_min_period.
> > > > >
> > > > > From the code, looks like the change results in update_triggers()
> > > > > calling window_update() once there was a new stall recorded for the
> > > > > trigger state and until the tracking window is complete. I don't see
> > > > > the point of calling window_update() if there was no stall change
> > > > > since the last call to window_update(). The resulting growth will not
> > > > > increase if there is no new stall.
> > > > > Maybe what you want to achieve here is more than one trigger per
> > > > > window if the stall limit was breached? If so, then this goes against
> > > > > the design for psi triggers in which we want to rate-limit the number
> > > > > of generated triggers per tracking window (see:
> > > > > https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L545).
> > > > > Please clarify the issue and the intentions here.
> > > > > Thanks!
> > > > Please correct me if I am wrong. Imagine that there is a new stall
> > > > during the 1st polling_min_period among 10 of them in the window and
> > > > group->polling_total will be updated to total without trigger. If the
> > > > rest of 9 polling_min_periods remain the same states, the trigger will
> > > > be missed when window timing is reached.
> > >
> > > I don't see why updating group->polling_total after the first
> > > registered stall is an issue here. window_update() calculates growth
> > > using current group->total[] and win->start_value (see:
> > > https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L483)
> > > which is set at the beginning of the window (see:
> > > https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L462).
> > > If the calculated growth did not reach t->threshold then the trigger
> > > should not be fired (see:
> > > https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L542).
> > > We fire the trigger only if growth within a given window is higher
> > > than the threshold.
> > >
> > > In your scenario if the stall recorded in the 1st polling_min_period
> > > was less than the threshold and in the other 9 polling_min_periods no
> > > new stalls were registered then there should be no triggers fired in
> > > that window. This is intended behavior. Trigger is fired only when the
> > The stall in the 1st polling_min_period was *LARGE* then the threshold
> > will also be ignored here.
>
> Ok, in that case is it ignored due to this condition:
> https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L546
> ?
> If so then I see what the problem might be. Please confirm.
Yes.
Actually, we found LMKD is driven by the inside polling_intervals
which lose the trigger from PSI.
>
> >
> > > recorded stall within the window breaches the threshold. And there
> > > will be only one trigger generated per window, no matter how much
> > > stall is being recorded after the threshold was breached.
> > > Hopefully this clarifies the behavior?
> >
> > I don't think so. According to your opinion, if the total keeps no
> > change in the last polling_min_period, then the growth during the 1-9
> > min_periods which is much larger than the threshold will also be
> > ignored. It does not make sense.
> >
> > https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L529
> > >
> > > > >
> > > > > > >
> > > > > > > Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > > > > > > ---
> > > > > > >  include/linux/psi_types.h |  2 ++
> > > > > > >  kernel/sched/psi.c        | 30 ++++++++++++++++++------------
> > > > > > >  2 files changed, 20 insertions(+), 12 deletions(-)
> > > > > > >
> > > > > > > diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
> > > > > > > index 0a23300..9533d2e 100644
> > > > > > > --- a/include/linux/psi_types.h
> > > > > > > +++ b/include/linux/psi_types.h
> > > > > > > @@ -132,6 +132,8 @@ struct psi_trigger {
> > > > > > >
> > > > > > >         /* Refcounting to prevent premature destruction */
> > > > > > >         struct kref refcount;
> > > > > > > +
> > > > > > > +       bool new_stall;
> > > > > > >  };
> > > > > > >
> > > > > > >  struct psi_group {
> > > > > > > diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> > > > > > > index 1652f2b..402718c 100644
> > > > > > > --- a/kernel/sched/psi.c
> > > > > > > +++ b/kernel/sched/psi.c
> > > > > > > @@ -458,9 +458,12 @@ static void psi_avgs_work(struct work_struct *work)
> > > > > > >  static void window_reset(struct psi_window *win, u64 now, u64 value,
> > > > > > >                          u64 prev_growth)
> > > > > > >  {
> > > > > > > +       struct psi_trigger *t = container_of(win, struct psi_trigger, win);
> > > > > > > +
> > > > > > >         win->start_time = now;
> > > > > > >         win->start_value = value;
> > > > > > >         win->prev_growth = prev_growth;
> > > > > > > +       t->new_stall = false;
> > > > > > >  }
> > > > > > >
> > > > > > >  /*
> > > > > > > @@ -515,7 +518,6 @@ static void init_triggers(struct psi_group *group, u64 now)
> > > > > > >  static u64 update_triggers(struct psi_group *group, u64 now)
> > > > > > >  {
> > > > > > >         struct psi_trigger *t;
> > > > > > > -       bool new_stall = false;
> > > > > > >         u64 *total = group->total[PSI_POLL];
> > > > > > >
> > > > > > >         /*
> > > > > > > @@ -523,19 +525,26 @@ static u64 update_triggers(struct psi_group *group, u64 now)
> > > > > > >          * watchers know when their specified thresholds are exceeded.
> > > > > > >          */
> > > > > > >         list_for_each_entry(t, &group->triggers, node) {
> > > > > > > -               u64 growth;
> > > > > > > -
> > > > > > >                 /* Check for stall activity */
> > > > > > >                 if (group->polling_total[t->state] == total[t->state])
> > > > > > >                         continue;
> > > > > > >
> > > > > > >                 /*
> > > > > > > -                * Multiple triggers might be looking at the same state,
> > > > > > > -                * remember to update group->polling_total[] once we've
> > > > > > > -                * been through all of them. Also remember to extend the
> > > > > > > -                * polling time if we see new stall activity.
> > > > > > > +                * update the trigger if there is new stall which will be
> > > > > > > +                * reset when run out of the window
> > > > > > >                  */
> > > > > > > -               new_stall = true;
> > > > > > > +               t->new_stall = true;
> > > > > > > +
> > > > > > > +               memcpy(&group->polling_total[t->state], &total[t->state],
> > > > > > > +                               sizeof(group->polling_total[t->state]));
> > > > > > > +       }
> > > > > > > +
> > > > > > > +       list_for_each_entry(t, &group->triggers, node) {
> > > > > > > +               u64 growth;
> > > > > > > +
> > > > > > > +               /* check if new stall happened during this window*/
> > > > > > > +               if (!t->new_stall)
> > > > > > > +                       continue;
> > > > > > >
> > > > > > >                 /* Calculate growth since last update */
> > > > > > >                 growth = window_update(&t->win, now, total[t->state]);
> > > > > > > @@ -552,10 +561,6 @@ static u64 update_triggers(struct psi_group *group, u64 now)
> > > > > > >                 t->last_event_time = now;
> > > > > > >         }
> > > > > > >
> > > > > > > -       if (new_stall)
> > > > > > > -               memcpy(group->polling_total, total,
> > > > > > > -                               sizeof(group->polling_total));
> > > > > > > -
> > > > > > >         return now + group->poll_min_period;
> > > > > > >  }
> > > > > > >
> > > > > > > @@ -1152,6 +1157,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
> > > > > > >         t->last_event_time = 0;
> > > > > > >         init_waitqueue_head(&t->event_wait);
> > > > > > >         kref_init(&t->refcount);
> > > > > > > +       t->new_stall = false;
> > > > > > >
> > > > > > >         mutex_lock(&group->trigger_lock);
> > > > > > >
> > > > > > > --
> > > > > > > 1.9.1
> > > > > > >
Suren Baghdasaryan Dec. 21, 2021, 6:37 a.m. UTC | #8
On Mon, Dec 20, 2021 at 7:08 PM Zhaoyang Huang <huangzhaoyang@gmail.com> wrote:
>
> On Tue, Dec 21, 2021 at 11:00 AM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > On Mon, Dec 20, 2021 at 6:51 PM Zhaoyang Huang <huangzhaoyang@gmail.com> wrote:
> > >
> > > On Tue, Dec 21, 2021 at 10:30 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > > >
> > > > On Mon, Dec 20, 2021 at 5:57 PM Zhaoyang Huang <huangzhaoyang@gmail.com> wrote:
> > > > >
> > > > > On Tue, Dec 21, 2021 at 3:58 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > > >
> > > > > > On Fri, Dec 17, 2021 at 10:03 PM Zhaoyang Huang <huangzhaoyang@gmail.com> wrote:
> > > > > > >
> > > > > > > loop Suren
> > > > > >
> > > > > > Thanks.
> > > > > >
> > > > > >
> > > > > > >
> > > > > > > On Fri, Dec 17, 2021 at 2:08 PM Huangzhaoyang <huangzhaoyang@gmail.com> wrote:
> > > > > > > >
> > > > > > > > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > > > > > > >
> > > > > > > > There could be missing wake up if the rest of the window remain the
> > > > > > > > same stall states as the polling_total updates for every polling_min_period.
> > > > > >
> > > > > > Could you please expand on this description? I'm unclear what the
> > > > > > problem is. I assume "polling_min_period" in this description refers
> > > > > > to the group->poll_min_period.
> > > > > >
> > > > > > From the code, looks like the change results in update_triggers()
> > > > > > calling window_update() once there was a new stall recorded for the
> > > > > > trigger state and until the tracking window is complete. I don't see
> > > > > > the point of calling window_update() if there was no stall change
> > > > > > since the last call to window_update(). The resulting growth will not
> > > > > > increase if there is no new stall.
> > > > > > Maybe what you want to achieve here is more than one trigger per
> > > > > > window if the stall limit was breached? If so, then this goes against
> > > > > > the design for psi triggers in which we want to rate-limit the number
> > > > > > of generated triggers per tracking window (see:
> > > > > > https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L545).
> > > > > > Please clarify the issue and the intentions here.
> > > > > > Thanks!
> > > > > Please correct me if I am wrong. Imagine that there is a new stall
> > > > > during the 1st polling_min_period among 10 of them in the window and
> > > > > group->polling_total will be updated to total without trigger. If the
> > > > > rest of 9 polling_min_periods remain the same states, the trigger will
> > > > > be missed when window timing is reached.
> > > >
> > > > I don't see why updating group->polling_total after the first
> > > > registered stall is an issue here. window_update() calculates growth
> > > > using current group->total[] and win->start_value (see:
> > > > https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L483)
> > > > which is set at the beginning of the window (see:
> > > > https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L462).
> > > > If the calculated growth did not reach t->threshold then the trigger
> > > > should not be fired (see:
> > > > https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L542).
> > > > We fire the trigger only if growth within a given window is higher
> > > > than the threshold.
> > > >
> > > > In your scenario if the stall recorded in the 1st polling_min_period
> > > > was less than the threshold and in the other 9 polling_min_periods no
> > > > new stalls were registered then there should be no triggers fired in
> > > > that window. This is intended behavior. Trigger is fired only when the
> > > The stall in the 1st polling_min_period was *LARGE* then the threshold
> > > will also be ignored here.
> >
> > Ok, in that case is it ignored due to this condition:
> > https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L546
> > ?
> > If so then I see what the problem might be. Please confirm.
> Yes.

Ok, I understand the issue now. Couple problems with your approach, see inline:

> Actually, we found LMKD is driven by the inside polling_intervals
> which lose the trigger from PSI.
> >
> > >
> > > > recorded stall within the window breaches the threshold. And there
> > > > will be only one trigger generated per window, no matter how much
> > > > stall is being recorded after the threshold was breached.
> > > > Hopefully this clarifies the behavior?
> > >
> > > I don't think so. According to your opinion, if the total keeps no
> > > change in the last polling_min_period, then the growth during the 1-9
> > > min_periods which is much larger than the threshold will also be
> > > ignored. It does not make sense.
> > >
> > > https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L529
> > > >
> > > > > >
> > > > > > > >
> > > > > > > > Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > > > > > > > ---
> > > > > > > >  include/linux/psi_types.h |  2 ++
> > > > > > > >  kernel/sched/psi.c        | 30 ++++++++++++++++++------------
> > > > > > > >  2 files changed, 20 insertions(+), 12 deletions(-)
> > > > > > > >
> > > > > > > > diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
> > > > > > > > index 0a23300..9533d2e 100644
> > > > > > > > --- a/include/linux/psi_types.h
> > > > > > > > +++ b/include/linux/psi_types.h
> > > > > > > > @@ -132,6 +132,8 @@ struct psi_trigger {
> > > > > > > >
> > > > > > > >         /* Refcounting to prevent premature destruction */
> > > > > > > >         struct kref refcount;
> > > > > > > > +
> > > > > > > > +       bool new_stall;
> > > > > > > >  };
> > > > > > > >
> > > > > > > >  struct psi_group {
> > > > > > > > diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> > > > > > > > index 1652f2b..402718c 100644
> > > > > > > > --- a/kernel/sched/psi.c
> > > > > > > > +++ b/kernel/sched/psi.c
> > > > > > > > @@ -458,9 +458,12 @@ static void psi_avgs_work(struct work_struct *work)
> > > > > > > >  static void window_reset(struct psi_window *win, u64 now, u64 value,
> > > > > > > >                          u64 prev_growth)
> > > > > > > >  {
> > > > > > > > +       struct psi_trigger *t = container_of(win, struct psi_trigger, win);
> > > > > > > > +
> > > > > > > >         win->start_time = now;
> > > > > > > >         win->start_value = value;
> > > > > > > >         win->prev_growth = prev_growth;
> > > > > > > > +       t->new_stall = false;
> > > > > > > >  }
> > > > > > > >
> > > > > > > >  /*
> > > > > > > > @@ -515,7 +518,6 @@ static void init_triggers(struct psi_group *group, u64 now)
> > > > > > > >  static u64 update_triggers(struct psi_group *group, u64 now)
> > > > > > > >  {
> > > > > > > >         struct psi_trigger *t;
> > > > > > > > -       bool new_stall = false;
> > > > > > > >         u64 *total = group->total[PSI_POLL];
> > > > > > > >
> > > > > > > >         /*
> > > > > > > > @@ -523,19 +525,26 @@ static u64 update_triggers(struct psi_group *group, u64 now)
> > > > > > > >          * watchers know when their specified thresholds are exceeded.
> > > > > > > >          */
> > > > > > > >         list_for_each_entry(t, &group->triggers, node) {
> > > > > > > > -               u64 growth;
> > > > > > > > -
> > > > > > > >                 /* Check for stall activity */
> > > > > > > >                 if (group->polling_total[t->state] == total[t->state])
> > > > > > > >                         continue;
> > > > > > > >
> > > > > > > >                 /*
> > > > > > > > -                * Multiple triggers might be looking at the same state,
> > > > > > > > -                * remember to update group->polling_total[] once we've
> > > > > > > > -                * been through all of them. Also remember to extend the
> > > > > > > > -                * polling time if we see new stall activity.
> > > > > > > > +                * update the trigger if there is new stall which will be
> > > > > > > > +                * reset when run out of the window
> > > > > > > >                  */
> > > > > > > > -               new_stall = true;
> > > > > > > > +               t->new_stall = true;
> > > > > > > > +
> > > > > > > > +               memcpy(&group->polling_total[t->state], &total [t->state],
> > > > > > > > +                               sizeof(group->polling_total[t->state]));

If you reset group->polling_total[t->state] here then if there is
another trigger t2 in the group->triggers so that t->state ==
t2->state then t2->new_stall will never be set. That's what the
"Multiple triggers..." comment which you deleted was warning about.
BTW, the above memcpy() can be replaced with a simple assignment:
               group->polling_total[t->state] = total[t->state];


> > > > > > > > +       }
> > > > > > > > +
> > > > > > > > +       list_for_each_entry(t, &group->triggers, node) {
> > > > > > > > +               u64 growth;
> > > > > > > > +
> > > > > > > > +               /* check if new stall happened during this window*/
> > > > > > > > +               if (!t->new_stall)
> > > > > > > > +                       continue;

With this check, once t->new_stall was set to true, window_update()
will be called every time update_triggers() is called, even if there
is no new stall for multiple periods. This is suboptimal. I would
rather remember that we skipped event generation here
https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L546
and catch up later. Something like this (untested):

--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -130,6 +130,9 @@ struct psi_trigger {
  */
  u64 last_event_time;

+ /* Flag set when threshold is breached but event was rate-limited */
+ bool threshold_breach;
+
  /* Refcounting to prevent premature destruction */
  struct kref refcount;
 };
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 1652f2bb54b7..301b31e860ef 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -524,23 +524,29 @@ static u64 update_triggers(struct psi_group
*group, u64 now)
  */
  list_for_each_entry(t, &group->triggers, node) {
  u64 growth;
+ bool trigger_stalled =
+ group->polling_total[t->state] != total[t->state];

- /* Check for stall activity */
- if (group->polling_total[t->state] == total[t->state])
+ /* Check for stall activity or a previous threshold breach */
+ if (!trigger_stalled && !t->threshold_breach)
  continue;

- /*
- * Multiple triggers might be looking at the same state,
- * remember to update group->polling_total[] once we've
- * been through all of them. Also remember to extend the
- * polling time if we see new stall activity.
- */
- new_stall = true;
-
- /* Calculate growth since last update */
- growth = window_update(&t->win, now, total[t->state]);
- if (growth < t->threshold)
- continue;
+ if (trigger_stalled) {
+ /*
+ * Multiple triggers might be looking at the same state,
+ * remember to update group->polling_total[] once we've
+ * been through all of them. Also remember to extend the
+ * polling time if we see new stall activity.
+ */
+ new_stall = true;
+
+ /* Calculate growth since last update */
+ growth = window_update(&t->win, now, total[t->state]);
+ if (growth < t->threshold)
+ continue;
+
+ t->threshold_breach = true;
+ }

  /* Limit event signaling to once per window */
  if (now < t->last_event_time + t->win.size)
@@ -550,6 +556,8 @@ static u64 update_triggers(struct psi_group *group, u64 now)
  if (cmpxchg(&t->event, 0, 1) == 0)
  wake_up_interruptible(&t->event_wait);
  t->last_event_time = now;
+ /* Reset threshold breach flag once event got generated */
+ t->threshold_breach = false;
  }

  if (new_stall)
@@ -1150,6 +1158,7 @@ struct psi_trigger *psi_trigger_create(struct
psi_group *group,

  t->event = 0;
  t->last_event_time = 0;
+ t->threshold_breach = false;
  init_waitqueue_head(&t->event_wait);
  kref_init(&t->refcount);


> > > > > > > >
> > > > > > > >                 /* Calculate growth since last update */
> > > > > > > >                 growth = window_update(&t->win, now, total[t->state]);
> > > > > > > > @@ -552,10 +561,6 @@ static u64 update_triggers(struct psi_group *group, u64 now)
> > > > > > > >                 t->last_event_time = now;
> > > > > > > >         }
> > > > > > > >
> > > > > > > > -       if (new_stall)
> > > > > > > > -               memcpy(group->polling_total, total,
> > > > > > > > -                               sizeof(group->polling_total));
> > > > > > > > -
> > > > > > > >         return now + group->poll_min_period;
> > > > > > > >  }
> > > > > > > >
> > > > > > > > @@ -1152,6 +1157,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
> > > > > > > >         t->last_event_time = 0;
> > > > > > > >         init_waitqueue_head(&t->event_wait);
> > > > > > > >         kref_init(&t->refcount);
> > > > > > > > +       t->new_stall = false;
> > > > > > > >
> > > > > > > >         mutex_lock(&group->trigger_lock);
> > > > > > > >
> > > > > > > > --
> > > > > > > > 1.9.1
> > > > > > > >
Suren Baghdasaryan Dec. 21, 2021, 6:40 a.m. UTC | #9
On Mon, Dec 20, 2021 at 10:37 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Mon, Dec 20, 2021 at 7:08 PM Zhaoyang Huang <huangzhaoyang@gmail.com> wrote:
> >
> > On Tue, Dec 21, 2021 at 11:00 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > >
> > > On Mon, Dec 20, 2021 at 6:51 PM Zhaoyang Huang <huangzhaoyang@gmail.com> wrote:
> > > >
> > > > On Tue, Dec 21, 2021 at 10:30 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > >
> > > > > On Mon, Dec 20, 2021 at 5:57 PM Zhaoyang Huang <huangzhaoyang@gmail.com> wrote:
> > > > > >
> > > > > > On Tue, Dec 21, 2021 at 3:58 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > > > >
> > > > > > > On Fri, Dec 17, 2021 at 10:03 PM Zhaoyang Huang <huangzhaoyang@gmail.com> wrote:
> > > > > > > >
> > > > > > > > loop Suren
> > > > > > >
> > > > > > > Thanks.
> > > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > On Fri, Dec 17, 2021 at 2:08 PM Huangzhaoyang <huangzhaoyang@gmail.com> wrote:
> > > > > > > > >
> > > > > > > > > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > > > > > > > >
> > > > > > > > > There could be missing wake up if the rest of the window remain the
> > > > > > > > > same stall states as the polling_total updates for every polling_min_period.
> > > > > > >
> > > > > > > Could you please expand on this description? I'm unclear what the
> > > > > > > problem is. I assume "polling_min_period" in this description refers
> > > > > > > to the group->poll_min_period.
> > > > > > >
> > > > > > > From the code, looks like the change results in update_triggers()
> > > > > > > calling window_update() once there was a new stall recorded for the
> > > > > > > trigger state and until the tracking window is complete. I don't see
> > > > > > > the point of calling window_update() if there was no stall change
> > > > > > > since the last call to window_update(). The resulting growth will not
> > > > > > > increase if there is no new stall.
> > > > > > > Maybe what you want to achieve here is more than one trigger per
> > > > > > > window if the stall limit was breached? If so, then this goes against
> > > > > > > the design for psi triggers in which we want to rate-limit the number
> > > > > > > of generated triggers per tracking window (see:
> > > > > > > https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L545).
> > > > > > > Please clarify the issue and the intentions here.
> > > > > > > Thanks!
> > > > > > Please correct me if I am wrong. Imagine that there is a new stall
> > > > > > during the 1st polling_min_period among 10 of them in the window and
> > > > > > group->polling_total will be updated to total without trigger. If the
> > > > > > rest of 9 polling_min_periods remain the same states, the trigger will
> > > > > > be missed when window timing is reached.
> > > > >
> > > > > I don't see why updating group->polling_total after the first
> > > > > registered stall is an issue here. window_update() calculates growth
> > > > > using current group->total[] and win->start_value (see:
> > > > > https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L483)
> > > > > which is set at the beginning of the window (see:
> > > > > https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L462).
> > > > > If the calculated growth did not reach t->threshold then the trigger
> > > > > should not be fired (see:
> > > > > https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L542).
> > > > > We fire the trigger only if growth within a given window is higher
> > > > > than the threshold.
> > > > >
> > > > > In your scenario if the stall recorded in the 1st polling_min_period
> > > > > was less than the threshold and in the other 9 polling_min_periods no
> > > > > new stalls were registered then there should be no triggers fired in
> > > > > that window. This is intended behavior. Trigger is fired only when the
> > > > The stall in the 1st polling_min_period was *LARGE* then the threshold
> > > > will also be ignored here.
> > >
> > > Ok, in that case is it ignored due to this condition:
> > > https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L546
> > > ?
> > > If so then I see what the problem might be. Please confirm.
> > Yes.
>
> Ok, I understand the issue now. Couple problems with your approach, see inline:
>
> > Actually, we found LMKD is driven by the inside polling_intervals
> > which lose the trigger from PSI.
> > >
> > > >
> > > > > recorded stall within the window breaches the threshold. And there
> > > > > will be only one trigger generated per window, no matter how much
> > > > > stall is being recorded after the threshold was breached.
> > > > > Hopefully this clarifies the behavior?
> > > >
> > > > I don't think so. According to your opinion, if the total keeps no
> > > > change in the last polling_min_period, then the growth during the 1-9
> > > > min_periods which is much larger than the threshold will also be
> > > > ignored. It does not make sense.
> > > >
> > > > https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L529
> > > > >
> > > > > > >
> > > > > > > > >
> > > > > > > > > Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > > > > > > > > ---
> > > > > > > > >  include/linux/psi_types.h |  2 ++
> > > > > > > > >  kernel/sched/psi.c        | 30 ++++++++++++++++++------------
> > > > > > > > >  2 files changed, 20 insertions(+), 12 deletions(-)
> > > > > > > > >
> > > > > > > > > diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
> > > > > > > > > index 0a23300..9533d2e 100644
> > > > > > > > > --- a/include/linux/psi_types.h
> > > > > > > > > +++ b/include/linux/psi_types.h
> > > > > > > > > @@ -132,6 +132,8 @@ struct psi_trigger {
> > > > > > > > >
> > > > > > > > >         /* Refcounting to prevent premature destruction */
> > > > > > > > >         struct kref refcount;
> > > > > > > > > +
> > > > > > > > > +       bool new_stall;
> > > > > > > > >  };
> > > > > > > > >
> > > > > > > > >  struct psi_group {
> > > > > > > > > diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> > > > > > > > > index 1652f2b..402718c 100644
> > > > > > > > > --- a/kernel/sched/psi.c
> > > > > > > > > +++ b/kernel/sched/psi.c
> > > > > > > > > @@ -458,9 +458,12 @@ static void psi_avgs_work(struct work_struct *work)
> > > > > > > > >  static void window_reset(struct psi_window *win, u64 now, u64 value,
> > > > > > > > >                          u64 prev_growth)
> > > > > > > > >  {
> > > > > > > > > +       struct psi_trigger *t = container_of(win, struct psi_trigger, win);
> > > > > > > > > +
> > > > > > > > >         win->start_time = now;
> > > > > > > > >         win->start_value = value;
> > > > > > > > >         win->prev_growth = prev_growth;
> > > > > > > > > +       t->new_stall = false;
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > >  /*
> > > > > > > > > @@ -515,7 +518,6 @@ static void init_triggers(struct psi_group *group, u64 now)
> > > > > > > > >  static u64 update_triggers(struct psi_group *group, u64 now)
> > > > > > > > >  {
> > > > > > > > >         struct psi_trigger *t;
> > > > > > > > > -       bool new_stall = false;
> > > > > > > > >         u64 *total = group->total[PSI_POLL];
> > > > > > > > >
> > > > > > > > >         /*
> > > > > > > > > @@ -523,19 +525,26 @@ static u64 update_triggers(struct psi_group *group, u64 now)
> > > > > > > > >          * watchers know when their specified thresholds are exceeded.
> > > > > > > > >          */
> > > > > > > > >         list_for_each_entry(t, &group->triggers, node) {
> > > > > > > > > -               u64 growth;
> > > > > > > > > -
> > > > > > > > >                 /* Check for stall activity */
> > > > > > > > >                 if (group->polling_total[t->state] == total[t->state])
> > > > > > > > >                         continue;
> > > > > > > > >
> > > > > > > > >                 /*
> > > > > > > > > -                * Multiple triggers might be looking at the same state,
> > > > > > > > > -                * remember to update group->polling_total[] once we've
> > > > > > > > > -                * been through all of them. Also remember to extend the
> > > > > > > > > -                * polling time if we see new stall activity.
> > > > > > > > > +                * update the trigger if there is new stall which will be
> > > > > > > > > +                * reset when run out of the window
> > > > > > > > >                  */
> > > > > > > > > -               new_stall = true;
> > > > > > > > > +               t->new_stall = true;
> > > > > > > > > +
> > > > > > > > > +               memcpy(&group->polling_total[t->state], &total [t->state],
> > > > > > > > > +                               sizeof(group->polling_total[t->state]));
>
> If you reset group->polling_total[t->state] here then if there is
> another trigger t2 in the group->triggers so that t->state ==
> t2->state then t2->new_stall will never be set. That's what the
> "Multiple triggers..." comment which you deleted was warning about.
> BTW, the above memcpy() can be replaced with a simple assignment:
>                group->polling_total[t->state] = total[t->state];
>
>
> > > > > > > > > +       }
> > > > > > > > > +
> > > > > > > > > +       list_for_each_entry(t, &group->triggers, node) {
> > > > > > > > > +               u64 growth;
> > > > > > > > > +
> > > > > > > > > +               /* check if new stall happened during this window*/
> > > > > > > > > +               if (!t->new_stall)
> > > > > > > > > +                       continue;
>
> With this check, once t->new_stall was set to true, window_update()
> will be called every time update_triggers() is called, even if there
> is no new stall for multiple periods. This is suboptimal. I would
> rather remember that we skipped event generation here
> https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L546
> and catch up later. Something like this (untested):
>
> --- a/include/linux/psi_types.h
> +++ b/include/linux/psi_types.h
> @@ -130,6 +130,9 @@ struct psi_trigger {
>   */
>   u64 last_event_time;
>
> + /* Flag set when threshold is breached but event was rate-limited */
> + bool threshold_breach;
> +
>   /* Refcounting to prevent premature destruction */
>   struct kref refcount;
>  };
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index 1652f2bb54b7..301b31e860ef 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -524,23 +524,29 @@ static u64 update_triggers(struct psi_group
> *group, u64 now)
>   */
>   list_for_each_entry(t, &group->triggers, node) {
>   u64 growth;
> + bool trigger_stalled =
> + group->polling_total[t->state] != total[t->state];
>
> - /* Check for stall activity */
> - if (group->polling_total[t->state] == total[t->state])
> + /* Check for stall activity or a previous threshold breach */
> + if (!trigger_stalled && !t->threshold_breach)
>   continue;
>
> - /*
> - * Multiple triggers might be looking at the same state,
> - * remember to update group->polling_total[] once we've
> - * been through all of them. Also remember to extend the
> - * polling time if we see new stall activity.
> - */
> - new_stall = true;
> -
> - /* Calculate growth since last update */
> - growth = window_update(&t->win, now, total[t->state]);
> - if (growth < t->threshold)
> - continue;
> + if (trigger_stalled) {
> + /*
> + * Multiple triggers might be looking at the same state,
> + * remember to update group->polling_total[] once we've
> + * been through all of them. Also remember to extend the
> + * polling time if we see new stall activity.
> + */
> + new_stall = true;
> +
> + /* Calculate growth since last update */
> + growth = window_update(&t->win, now, total[t->state]);
> + if (growth < t->threshold)
> + continue;
> +
> + t->threshold_breach = true;
> + }
>
>   /* Limit event signaling to once per window */
>   if (now < t->last_event_time + t->win.size)
> @@ -550,6 +556,8 @@ static u64 update_triggers(struct psi_group *group, u64 now)
>   if (cmpxchg(&t->event, 0, 1) == 0)
>   wake_up_interruptible(&t->event_wait);
>   t->last_event_time = now;
> + /* Reset threshold breach flag once event got generated */
> + t->threshold_breach = false;
>   }
>
>   if (new_stall)
> @@ -1150,6 +1158,7 @@ struct psi_trigger *psi_trigger_create(struct
> psi_group *group,
>
>   t->event = 0;
>   t->last_event_time = 0;
> + t->threshold_breach = false;
>   init_waitqueue_head(&t->event_wait);
>   kref_init(&t->refcount);
>

Damn! I forgot that gmail loses all the tabs. Hopefully this
illustrates the idea but if not I can post it as a separate patch.

>
> > > > > > > > >
> > > > > > > > >                 /* Calculate growth since last update */
> > > > > > > > >                 growth = window_update(&t->win, now, total[t->state]);
> > > > > > > > > @@ -552,10 +561,6 @@ static u64 update_triggers(struct psi_group *group, u64 now)
> > > > > > > > >                 t->last_event_time = now;
> > > > > > > > >         }
> > > > > > > > >
> > > > > > > > > -       if (new_stall)
> > > > > > > > > -               memcpy(group->polling_total, total,
> > > > > > > > > -                               sizeof(group->polling_total));
> > > > > > > > > -
> > > > > > > > >         return now + group->poll_min_period;
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > > @@ -1152,6 +1157,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
> > > > > > > > >         t->last_event_time = 0;
> > > > > > > > >         init_waitqueue_head(&t->event_wait);
> > > > > > > > >         kref_init(&t->refcount);
> > > > > > > > > +       t->new_stall = false;
> > > > > > > > >
> > > > > > > > >         mutex_lock(&group->trigger_lock);
> > > > > > > > >
> > > > > > > > > --
> > > > > > > > > 1.9.1
> > > > > > > > >
diff mbox series

Patch

diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 0a23300..9533d2e 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -132,6 +132,8 @@  struct psi_trigger {
 
 	/* Refcounting to prevent premature destruction */
 	struct kref refcount;
+
+	bool new_stall;
 };
 
 struct psi_group {
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 1652f2b..402718c 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -458,9 +458,12 @@  static void psi_avgs_work(struct work_struct *work)
 static void window_reset(struct psi_window *win, u64 now, u64 value,
 			 u64 prev_growth)
 {
+	struct psi_trigger *t = container_of(win, struct psi_trigger, win);
+
 	win->start_time = now;
 	win->start_value = value;
 	win->prev_growth = prev_growth;
+	t->new_stall = false;
 }
 
 /*
@@ -515,7 +518,6 @@  static void init_triggers(struct psi_group *group, u64 now)
 static u64 update_triggers(struct psi_group *group, u64 now)
 {
 	struct psi_trigger *t;
-	bool new_stall = false;
 	u64 *total = group->total[PSI_POLL];
 
 	/*
@@ -523,19 +525,26 @@  static u64 update_triggers(struct psi_group *group, u64 now)
 	 * watchers know when their specified thresholds are exceeded.
 	 */
 	list_for_each_entry(t, &group->triggers, node) {
-		u64 growth;
-
 		/* Check for stall activity */
 		if (group->polling_total[t->state] == total[t->state])
 			continue;
 
 		/*
-		 * Multiple triggers might be looking at the same state,
-		 * remember to update group->polling_total[] once we've
-		 * been through all of them. Also remember to extend the
-		 * polling time if we see new stall activity.
+		 * update the trigger if there is new stall which will be
+		 * reset when run out of the window
 		 */
-		new_stall = true;
+		t->new_stall = true;
+
+		memcpy(&group->polling_total[t->state], &total[t->state],
+				sizeof(group->polling_total[t->state]));
+	}
+
+	list_for_each_entry(t, &group->triggers, node) {
+		u64 growth;
+
+		/* check if new stall happened during this window*/
+		if (!t->new_stall)
+			continue;
 
 		/* Calculate growth since last update */
 		growth = window_update(&t->win, now, total[t->state]);
@@ -552,10 +561,6 @@  static u64 update_triggers(struct psi_group *group, u64 now)
 		t->last_event_time = now;
 	}
 
-	if (new_stall)
-		memcpy(group->polling_total, total,
-				sizeof(group->polling_total));
-
 	return now + group->poll_min_period;
 }
 
@@ -1152,6 +1157,7 @@  struct psi_trigger *psi_trigger_create(struct psi_group *group,
 	t->last_event_time = 0;
 	init_waitqueue_head(&t->event_wait);
 	kref_init(&t->refcount);
+	t->new_stall = false;
 
 	mutex_lock(&group->trigger_lock);