[v1,3/9] x86/resctrl: Add resctrl_mbm_flush_cpu() to collect CPUs' MBM events

Message ID 20230421141723.2405942-4-peternewman@google.com (mailing list archive)
State New
Series x86/resctrl: Use soft RMIDs for reliable MBM on AMD

Commit Message

Peter Newman April 21, 2023, 2:17 p.m. UTC
AMD implementations so far are only guaranteed to provide MBM event
counts for RMIDs which are currently assigned in CPUs' PQR_ASSOC MSRs.
Hardware can reallocate the counter resources of RMIDs which are not
currently assigned, giving them to RMIDs which are, and zeroing the
event counts of the unassigned RMIDs.

In practice, this makes it impossible to simultaneously calculate the
memory bandwidth speed of all RMIDs on a busy system where all RMIDs are
in use. Over a multiple-second measurement window, the RMID would need
to remain assigned in all of the L3 cache domains where it has been
assigned for the duration of the measurement, otherwise portions of the
final count will be zero. In general, it is not possible to bound the
number of RMIDs which will be assigned in an L3 domain over any interval
of time.

To provide reliable MBM counts on such systems, introduce "soft" RMIDs:
when enabled, each CPU is permanently assigned a hardware RMID whose
event counts are flushed to the current soft RMID during context
switches which result in a change in soft RMID as well as whenever
userspace requests the current event count for a domain.

Implement resctrl_mbm_flush_cpu(), which collects a domain's current MBM
event counts into its current software RMID. The delta for each CPU is
determined by tracking the previous event counts in per-CPU data.  The
software byte counts reside in the arch-independent mbm_state
structures.

Co-developed-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Newman <peternewman@google.com>
---
 arch/x86/include/asm/resctrl.h         |  2 +
 arch/x86/kernel/cpu/resctrl/internal.h | 10 ++--
 arch/x86/kernel/cpu/resctrl/monitor.c  | 78 ++++++++++++++++++++++++++
 3 files changed, 86 insertions(+), 4 deletions(-)

Comments

Reinette Chatre May 11, 2023, 9:37 p.m. UTC | #1
Hi Peter,

On 4/21/2023 7:17 AM, Peter Newman wrote:
> AMD implementations so far are only guaranteed to provide MBM event
> counts for RMIDs which are currently assigned in CPUs' PQR_ASSOC MSRs.
> Hardware can reallocate the counter resources for all other RMIDs' which
> are not currently assigned to those which are, zeroing the event counts
> of the unassigned RMIDs.
> 
> In practice, this makes it impossible to simultaneously calculate the
> memory bandwidth speed of all RMIDs on a busy system where all RMIDs are
> in use. Over a multiple-second measurement window, the RMID would need
> to remain assigned in all of the L3 cache domains where it has been
> assigned for the duration of the measurement, otherwise portions of the
> final count will be zero. In general, it is not possible to bound the
> number of RMIDs which will be assigned in an L3 domain over any interval
> of time.
> 
> To provide reliable MBM counts on such systems, introduce "soft" RMIDs:
> when enabled, each CPU is permanently assigned a hardware RMID whose
> event counts are flushed to the current soft RMID during context
> switches which result in a change in soft RMID as well as whenever
> userspace requests the current event count for a domain.
> 
> Implement resctrl_mbm_flush_cpu(), which collects a domain's current MBM
> event counts into its current software RMID. The delta for each CPU is
> determined by tracking the previous event counts in per-CPU data.  The
> software byte counts reside in the arch-independent mbm_state
> structures.

Could you elaborate why the arch-independent mbm_state was chosen? 

> 
> Co-developed-by: Stephane Eranian <eranian@google.com>
> Signed-off-by: Stephane Eranian <eranian@google.com>
> Signed-off-by: Peter Newman <peternewman@google.com>
> ---
>  arch/x86/include/asm/resctrl.h         |  2 +
>  arch/x86/kernel/cpu/resctrl/internal.h | 10 ++--
>  arch/x86/kernel/cpu/resctrl/monitor.c  | 78 ++++++++++++++++++++++++++
>  3 files changed, 86 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h
> index 255a78d9d906..e7acf118d770 100644
> --- a/arch/x86/include/asm/resctrl.h
> +++ b/arch/x86/include/asm/resctrl.h
> @@ -13,6 +13,7 @@
>   * @cur_closid:	The cached Class Of Service ID
>   * @default_rmid:	The user assigned Resource Monitoring ID
>   * @default_closid:	The user assigned cached Class Of Service ID
> + * @hw_rmid:	The permanently-assigned RMID when soft RMIDs are in use
>   *
>   * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
>   * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
> @@ -27,6 +28,7 @@ struct resctrl_pqr_state {
>  	u32			cur_closid;
>  	u32			default_rmid;
>  	u32			default_closid;
> +	u32			hw_rmid;
>  };
>  
>  DECLARE_PER_CPU(struct resctrl_pqr_state, pqr_state);
> diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
> index 02a062558c67..256eee05d447 100644
> --- a/arch/x86/kernel/cpu/resctrl/internal.h
> +++ b/arch/x86/kernel/cpu/resctrl/internal.h
> @@ -298,12 +298,14 @@ struct rftype {
>   * @prev_bw:	The most recent bandwidth in MBps
>   * @delta_bw:	Difference between the current and previous bandwidth
>   * @delta_comp:	Indicates whether to compute the delta_bw
> + * @soft_rmid_bytes: Recent bandwidth count in bytes when using soft RMIDs
>   */
>  struct mbm_state {
> -	u64	prev_bw_bytes;
> -	u32	prev_bw;
> -	u32	delta_bw;
> -	bool	delta_comp;
> +	u64		prev_bw_bytes;
> +	u32		prev_bw;
> +	u32		delta_bw;
> +	bool		delta_comp;
> +	atomic64_t	soft_rmid_bytes;
>  };
>  
>  /**
> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
> index 2de8397f91cd..3671100d3cc7 100644
> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
> @@ -404,6 +404,84 @@ static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 rmid,
>  	}
>  }
>  
> +struct mbm_soft_counter {
> +	u64	prev_bytes;
> +	bool	initialized;
> +};
> +
> +struct mbm_flush_state {
> +	struct mbm_soft_counter local;
> +	struct mbm_soft_counter total;
> +};
> +
> +DEFINE_PER_CPU(struct mbm_flush_state, flush_state);
> +

Why not use the existing MBM state? 

> +/*
> + * flushes the value of the cpu_rmid to the current soft rmid
> + */
> +static void __mbm_flush(int evtid, struct rdt_resource *r, struct rdt_domain *d)
> +{
> +	struct mbm_flush_state *state = this_cpu_ptr(&flush_state);
> +	u32 soft_rmid = this_cpu_ptr(&pqr_state)->cur_rmid;
> +	u32 hw_rmid = this_cpu_ptr(&pqr_state)->hw_rmid;
> +	struct mbm_soft_counter *counter;
> +	struct mbm_state *m;
> +	u64 val;
> +
> +	/* cache occupancy events are disabled in this mode */
> +	WARN_ON(!is_mbm_event(evtid));

If this is hit it would trigger a lot, perhaps WARN_ON_ONCE?

> +
> +	if (evtid == QOS_L3_MBM_LOCAL_EVENT_ID) {
> +		counter = &state->local;
> +	} else {
> +		WARN_ON(evtid != QOS_L3_MBM_TOTAL_EVENT_ID);
> +		counter = &state->total;
> +	}
> +
> +	/*
> +	 * Propagate the value read from the hw_rmid assigned to the current CPU
> +	 * into the "soft" rmid associated with the current task or CPU.
> +	 */
> +	m = get_mbm_state(d, soft_rmid, evtid);
> +	if (!m)
> +		return;
> +
> +	if (resctrl_arch_rmid_read(r, d, hw_rmid, evtid, &val))
> +		return;
> +

This all seems unsafe to run without protection. The code relies on
the rdt_domain but a CPU hotplug event could result in the domain
disappearing underneath this code. The accesses to the data structures
also appear unsafe to me. Note that resctrl_arch_rmid_read() updates
the architectural MBM state and this same state can be updated concurrently
in other code paths without appropriate locking.

> +	/* Count bandwidth after the first successful counter read. */
> +	if (counter->initialized) {
> +		/* Assume that mbm_update() will prevent double-overflows. */
> +		if (val != counter->prev_bytes)
> +			atomic64_add(val - counter->prev_bytes,
> +				     &m->soft_rmid_bytes);
> +	} else {
> +		counter->initialized = true;
> +	}
> +
> +	counter->prev_bytes = val;

I notice a lot of similarities between the above and the software controller,
see mbm_bw_count(). 

> +}
> +
> +/*
> + * Called from context switch code __resctrl_sched_in() when the current soft
> + * RMID is changing or before reporting event counts to user space.
> + */
> +void resctrl_mbm_flush_cpu(void)
> +{
> +	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
> +	int cpu = smp_processor_id();
> +	struct rdt_domain *d;
> +
> +	d = get_domain_from_cpu(cpu, r);
> +	if (!d)
> +		return;
> +
> +	if (is_mbm_local_enabled())
> +		__mbm_flush(QOS_L3_MBM_LOCAL_EVENT_ID, r, d);
> +	if (is_mbm_total_enabled())
> +		__mbm_flush(QOS_L3_MBM_TOTAL_EVENT_ID, r, d);
> +}

This (potentially) adds two MSR writes and two MSR reads to context switch,
using MSRs that could be quite slow since they were not designed to be used
in context switch. Do you perhaps have data on how long these MSR reads/writes
take on these systems to get an idea about the impact on context switch? I
think this data should feature prominently in the changelog.

> +
>  static int __mon_event_count(u32 rmid, struct rmid_read *rr)
>  {
>  	struct mbm_state *m;


Reinette
Peter Newman May 12, 2023, 1:25 p.m. UTC | #2
Hi Reinette,

On Thu, May 11, 2023 at 11:37 PM Reinette Chatre
<reinette.chatre@intel.com> wrote:
> On 4/21/2023 7:17 AM, Peter Newman wrote:
> > Implement resctrl_mbm_flush_cpu(), which collects a domain's current MBM
> > event counts into its current software RMID. The delta for each CPU is
> > determined by tracking the previous event counts in per-CPU data.  The
> > software byte counts reside in the arch-independent mbm_state
> > structures.
>
> Could you elaborate why the arch-independent mbm_state was chosen?

It largely had to do with how many soft RMIDs to implement. For our
own needs, we were mainly concerned with getting back to the number of
monitoring groups the hardware claimed to support, so there wasn't
much internal motivation to support an unbounded number of soft RMIDs.

However, breaking this artificial connection between supported HW and
SW RMIDs to support arbitrarily-many monitoring groups could make the
implementation conceptually cleaner. If you agree, I would be happy
to give it a try in the next series.


> > +     /* cache occupancy events are disabled in this mode */
> > +     WARN_ON(!is_mbm_event(evtid));
>
> If this is hit it would trigger a lot, perhaps WARN_ON_ONCE?

Ok

>
> > +
> > +     if (evtid == QOS_L3_MBM_LOCAL_EVENT_ID) {
> > +             counter = &state->local;
> > +     } else {
> > +             WARN_ON(evtid != QOS_L3_MBM_TOTAL_EVENT_ID);
> > +             counter = &state->total;
> > +     }
> > +
> > +     /*
> > +      * Propagate the value read from the hw_rmid assigned to the current CPU
> > +      * into the "soft" rmid associated with the current task or CPU.
> > +      */
> > +     m = get_mbm_state(d, soft_rmid, evtid);
> > +     if (!m)
> > +             return;
> > +
> > +     if (resctrl_arch_rmid_read(r, d, hw_rmid, evtid, &val))
> > +             return;
> > +
>
> This all seems unsafe to run without protection. The code relies on
> the rdt_domain but a CPU hotplug event could result in the domain
> disappearing underneath this code. The accesses to the data structures
> also appear unsafe to me. Note that resctrl_arch_rmid_read() updates
> the architectural MBM state and this same state can be updated concurrently
> in other code paths without appropriate locking.

The domain is supposed to always be the current one, but I see that
even a get_domain_from_cpu(smp_processor_id(), ...) call needs to walk
a resource's domain list to find a matching entry, which could be
concurrently modified when other domains are added/removed.

Similarly, when soft RMIDs are enabled, it should not be possible to
call resctrl_arch_rmid_read() on anything other than the current CPU's HW RMID.

I'll need to confirm whether it's safe to access the current CPU's
rdt_domain in an atomic context. If it isn't, I assume I would have to
arrange all of the state used during flush to be per-CPU.

I expect the constraints on what data can be safely accessed, and where,
are going to dictate how the state is ultimately arranged, so I will
need to settle this before I can come back to the other questions
about mbm_state.

>
> > +     /* Count bandwidth after the first successful counter read. */
> > +     if (counter->initialized) {
> > +             /* Assume that mbm_update() will prevent double-overflows. */
> > +             if (val != counter->prev_bytes)
> > +                     atomic64_add(val - counter->prev_bytes,
> > +                                  &m->soft_rmid_bytes);
> > +     } else {
> > +             counter->initialized = true;
> > +     }
> > +
> > +     counter->prev_bytes = val;
>
> I notice a lot of similarities between the above and the software controller,
> see mbm_bw_count().

Thanks for pointing this out, I'll take a look.

>
> > +}
> > +
> > +/*
> > + * Called from context switch code __resctrl_sched_in() when the current soft
> > + * RMID is changing or before reporting event counts to user space.
> > + */
> > +void resctrl_mbm_flush_cpu(void)
> > +{
> > +     struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
> > +     int cpu = smp_processor_id();
> > +     struct rdt_domain *d;
> > +
> > +     d = get_domain_from_cpu(cpu, r);
> > +     if (!d)
> > +             return;
> > +
> > +     if (is_mbm_local_enabled())
> > +             __mbm_flush(QOS_L3_MBM_LOCAL_EVENT_ID, r, d);
> > +     if (is_mbm_total_enabled())
> > +             __mbm_flush(QOS_L3_MBM_TOTAL_EVENT_ID, r, d);
> > +}
>
> This (potentially) adds two MSR writes and two MSR reads to what could possibly
> be quite slow MSRs if it was not designed to be used in context switch. Do you
> perhaps have data on how long these MSR reads/writes take on these systems to get
> an idea about the impact on context switch? I think this data should feature
> prominently in the changelog.

I can probably use ftrace to determine the cost of an __rmid_read()
call on a few implementations.

To understand the overall impact to context switch, I can put together
a scenario where I can control whether the context switches being
measured result in change of soft RMID to prevent the data from being
diluted by non-flushing switches.


Thank you for looking over these changes!

-Peter
Reinette Chatre May 12, 2023, 3:26 p.m. UTC | #3
Hi Peter,

On 5/12/2023 6:25 AM, Peter Newman wrote:
> Hi Reinette,
> 
> On Thu, May 11, 2023 at 11:37 PM Reinette Chatre
> <reinette.chatre@intel.com> wrote:
>> On 4/21/2023 7:17 AM, Peter Newman wrote:
>>> Implement resctrl_mbm_flush_cpu(), which collects a domain's current MBM
>>> event counts into its current software RMID. The delta for each CPU is
>>> determined by tracking the previous event counts in per-CPU data.  The
>>> software byte counts reside in the arch-independent mbm_state
>>> structures.
>>
>> Could you elaborate why the arch-independent mbm_state was chosen?
> 
> It largely had to do with how many soft RMIDs to implement. For our
> own needs, we were mainly concerned with getting back to the number of
> monitoring groups the hardware claimed to support, so there wasn't
> much internal motivation to support an unbounded number of soft RMIDs.

Apologies for not being explicit, I was actually curious why the
arch-independent mbm_state, as opposed to the arch-dependent state, was
chosen.

I think the lines are getting a bit blurry here with the software RMID
feature added as a resctrl filesystem feature (and thus non-architectural),
but it is specific to the AMD architecture.

> However, breaking this artificial connection between supported HW and
> SW RMIDs to support arbitrarily-many monitoring groups could make the
> implementation conceptually cleaner. If you agree,  I would be happy
> to give it a try in the next series.

I have not actually considered this. At first glance I think this would
add more tentacles into the core, where currently the number of RMIDs
supported is queried from the device, and supporting an arbitrary number
would impact that. At this time the RMID state is also pre-allocated,
so it is not possible to support "arbitrarily-many".

>>> +/*
>>> + * Called from context switch code __resctrl_sched_in() when the current soft
>>> + * RMID is changing or before reporting event counts to user space.
>>> + */
>>> +void resctrl_mbm_flush_cpu(void)
>>> +{
>>> +     struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
>>> +     int cpu = smp_processor_id();
>>> +     struct rdt_domain *d;
>>> +
>>> +     d = get_domain_from_cpu(cpu, r);
>>> +     if (!d)
>>> +             return;
>>> +
>>> +     if (is_mbm_local_enabled())
>>> +             __mbm_flush(QOS_L3_MBM_LOCAL_EVENT_ID, r, d);
>>> +     if (is_mbm_total_enabled())
>>> +             __mbm_flush(QOS_L3_MBM_TOTAL_EVENT_ID, r, d);
>>> +}
>>
>> This (potentially) adds two MSR writes and two MSR reads to what could possibly
>> be quite slow MSRs if it was not designed to be used in context switch. Do you
>> perhaps have data on how long these MSR reads/writes take on these systems to get
>> an idea about the impact on context switch? I think this data should feature
>> prominently in the changelog.
> 
> I can probably use ftrace to determine the cost of an __rmid_read()
> call on a few implementations.

On a lower level I think it may be interesting to measure more closely
just how long a wrmsr and rdmsr take on these registers. It may be interesting
if you, for example, use rdtsc_ordered() before and after these calls, and then
compare it to how long it takes to write the PQR register that has been
designed to be used in context switch.
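
For example, something along these lines (an untested sketch meant to sit
next to __rmid_read() in arch/x86/kernel/cpu/resctrl/monitor.c; the MSR
write arguments mirror __rmid_read() and __resctrl_sched_in(), and
time_rmid_read() is a made-up name):

/*
 * Untested sketch: time one QM_EVTSEL write + QM_CTR read and compare it
 * to a PQR_ASSOC write of the same RMID (CLOSID 0 here for simplicity).
 */
static void time_rmid_read(u32 rmid, u32 evtid)
{
	u64 t0, t1, t2, val;

	t0 = rdtsc_ordered();
	wrmsr(MSR_IA32_QM_EVTSEL, evtid, rmid);	/* select event + RMID */
	rdmsrl(MSR_IA32_QM_CTR, val);		/* read the count */
	t1 = rdtsc_ordered();
	wrmsr(MSR_IA32_PQR_ASSOC, rmid, 0);	/* the context-switch MSR */
	t2 = rdtsc_ordered();

	pr_info("qm_ctr=%llu rmid_read=%llu cycles pqr_write=%llu cycles\n",
		val, t1 - t0, t2 - t1);
}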

> To understand the overall impact to context switch, I can put together
> a scenario where I can control whether the context switches being
> measured result in change of soft RMID to prevent the data from being
> diluted by non-flushing switches.

This sounds great. Thank you very much.

Reinette
Peter Newman May 15, 2023, 2:42 p.m. UTC | #4
Hi Reinette,

On Fri, May 12, 2023 at 5:26 PM Reinette Chatre
<reinette.chatre@intel.com> wrote:
> On 5/12/2023 6:25 AM, Peter Newman wrote:
> > On Thu, May 11, 2023 at 11:37 PM Reinette Chatre
> > <reinette.chatre@intel.com> wrote:
> >> On 4/21/2023 7:17 AM, Peter Newman wrote:
> >>> Implement resctrl_mbm_flush_cpu(), which collects a domain's current MBM
> >>> event counts into its current software RMID. The delta for each CPU is
> >>> determined by tracking the previous event counts in per-CPU data.  The
> >>> software byte counts reside in the arch-independent mbm_state
> >>> structures.
> >>
> >> Could you elaborate why the arch-independent mbm_state was chosen?
> >
> > It largely had to do with how many soft RMIDs to implement. For our
> > own needs, we were mainly concerned with getting back to the number of
> > monitoring groups the hardware claimed to support, so there wasn't
> > much internal motivation to support an unbounded number of soft RMIDs.
>
> Apologies for not being explicit, I was actually curious why the
> arch-independent mbm_state, as opposed to the arch-dependent state, was
> chosen.
>
> I think the lines are getting a bit blurry here with the software RMID
> feature added as a resctrl filesystem feature (and thus non architectural),
> but it is specific to AMD architecture.

The soft RMID solution applies conceptually to any system where the
number of hardware counters is smaller than the number of desired
monitoring groups, but at least as large as the number of CPUs. It's a
solution we may need to rely on more in the future as it's easier for
monitoring hardware to scale to the number of CPUs than to (CPUs *
mbm_domains). I believed the counts in bytes would apply to the user
interface universally.

However, I did recently rebase these changes onto one of James's MPAM
snapshot branches and __mbm_flush() did end up fitting better on the
arch-dependent side, so I was forced to move the counters over to
arch_mbm_state because on the snapshot branch the arch-dependent code
cannot see the arch-independent mbm_state structure. I then created
resctrl_arch-() helpers for __mon_event_count() to read the counts
from the arch_mbm_state.

In hindsight, despite generic-looking code being able to retrieve the
CPU counts with resctrl_arch_rmid_read(), the permanent assignment of
a HW RMID to a CPU is an implementation detail specific to the
RDT/PQoS interface and would probably not align with a theoretical MPAM
implementation.

>
> > However, breaking this artificial connection between supported HW and
> > SW RMIDs to support arbitrarily-many monitoring groups could make the
> > implementation conceptually cleaner. If you agree,  I would be happy
> > to give it a try in the next series.
>
> I have not actually considered this. At first glance I think this would
> add more tentacles into the core where currently the number of RMIDs
> supported are queried from the device and supporting an arbitrary number
> would impact that. At this time the RMID state is also pre-allocated
> and thus not possible to support an "arbitrarily-many".

Yes, this was the part that made me want to just leave the RMID count alone.


>
> >>> +/*
> >>> + * Called from context switch code __resctrl_sched_in() when the current soft
> >>> + * RMID is changing or before reporting event counts to user space.
> >>> + */
> >>> +void resctrl_mbm_flush_cpu(void)
> >>> +{
> >>> +     struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
> >>> +     int cpu = smp_processor_id();
> >>> +     struct rdt_domain *d;
> >>> +
> >>> +     d = get_domain_from_cpu(cpu, r);
> >>> +     if (!d)
> >>> +             return;
> >>> +
> >>> +     if (is_mbm_local_enabled())
> >>> +             __mbm_flush(QOS_L3_MBM_LOCAL_EVENT_ID, r, d);
> >>> +     if (is_mbm_total_enabled())
> >>> +             __mbm_flush(QOS_L3_MBM_TOTAL_EVENT_ID, r, d);
> >>> +}
> >>
> >> This (potentially) adds two MSR writes and two MSR reads to what could possibly
> >> be quite slow MSRs if it was not designed to be used in context switch. Do you
> >> perhaps have data on how long these MSR reads/writes take on these systems to get
> >> an idea about the impact on context switch? I think this data should feature
> >> prominently in the changelog.
> >
> > I can probably use ftrace to determine the cost of an __rmid_read()
> > call on a few implementations.
>
> On a lower level I think it may be interesting to measure more closely
> just how long a wrmsr and rdmsr take on these registers. It may be interesting
> if you, for example, use rdtsc_ordered() before and after these calls, and then
> compare it to how long it takes to write the PQR register that has been
> designed to be used in context switch.
>
> > To understand the overall impact to context switch, I can put together
> > a scenario where I can control whether the context switches being
> > measured result in change of soft RMID to prevent the data from being
> > diluted by non-flushing switches.
>
> This sounds great. Thank you very much.

I used a simple parent-child pipe loop benchmark with the parent in
one monitoring group and the child in another to trigger 2M
context-switches on the same CPU and compared the sample-based
profiles on an AMD and Intel implementation. I used perf diff to
compare the samples between hard and soft RMID switches.

Intel(R) Xeon(R) Platinum 8173M CPU @ 2.00GHz:

              +44.80%  [kernel.kallsyms]  [k] __rmid_read
    10.43%     -9.52%  [kernel.kallsyms]  [k] __switch_to

AMD EPYC 7B12 64-Core Processor:

              +28.27%  [kernel.kallsyms]  [k] __rmid_read
    13.45%    -13.44%  [kernel.kallsyms]  [k] __switch_to

Note that a soft RMID switch that doesn't change CLOSID skips the
PQR_ASSOC write completely, so from this data I can roughly say that
__rmid_read() is a little over 2x the length of a PQR_ASSOC write that
changes the current RMID on the AMD implementation and about 4.5x
longer on Intel.
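
For reference, the pipe loop has roughly the following shape (a
reconstruction of the idea rather than the exact benchmark; both processes
are pinned to one CPU, e.g. with taskset, and placed into separate
monitoring groups before the loop starts):

/* parent<->child ping-pong: one context switch per pipe transfer */
#include <unistd.h>

int main(void)
{
	int ptoc[2], ctop[2];
	long i, iters = 2 * 1000 * 1000;	/* ~2M switches */
	char b = 0;

	if (pipe(ptoc) || pipe(ctop))
		return 1;

	if (fork() == 0) {			/* child: echo each byte back */
		for (i = 0; i < iters; i++) {
			if (read(ptoc[0], &b, 1) != 1 ||
			    write(ctop[1], &b, 1) != 1)
				return 1;
		}
		return 0;
	}

	for (i = 0; i < iters; i++) {		/* parent: send, wait for echo */
		if (write(ptoc[1], &b, 1) != 1 ||
		    read(ctop[0], &b, 1) != 1)
			return 1;
	}
	return 0;
}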

Let me know if this clarifies the cost enough or if you'd like to also
see instrumented measurements on the individual WRMSR/RDMSR
instructions.

Thanks!
-Peter
Peter Newman May 16, 2023, 2:18 p.m. UTC | #5
Hi Reinette,

On Fri, May 12, 2023 at 3:25 PM Peter Newman <peternewman@google.com> wrote:
> On Thu, May 11, 2023 at 11:37 PM Reinette Chatre
> <reinette.chatre@intel.com> wrote:
> > On 4/21/2023 7:17 AM, Peter Newman wrote:
> > > +
> > > +     if (evtid == QOS_L3_MBM_LOCAL_EVENT_ID) {
> > > +             counter = &state->local;
> > > +     } else {
> > > +             WARN_ON(evtid != QOS_L3_MBM_TOTAL_EVENT_ID);
> > > +             counter = &state->total;
> > > +     }
> > > +
> > > +     /*
> > > +      * Propagate the value read from the hw_rmid assigned to the current CPU
> > > +      * into the "soft" rmid associated with the current task or CPU.
> > > +      */
> > > +     m = get_mbm_state(d, soft_rmid, evtid);
> > > +     if (!m)
> > > +             return;
> > > +
> > > +     if (resctrl_arch_rmid_read(r, d, hw_rmid, evtid, &val))
> > > +             return;
> > > +
> >
> > This all seems unsafe to run without protection. The code relies on
> > the rdt_domain but a CPU hotplug event could result in the domain
> > disappearing underneath this code. The accesses to the data structures
> > also appear unsafe to me. Note that resctrl_arch_rmid_read() updates
> > the architectural MBM state and this same state can be updated concurrently
> > in other code paths without appropriate locking.
>
> The domain is supposed to always be the current one, but I see that
> even a get_domain_from_cpu(smp_processor_id(), ...) call needs to walk
> a resource's domain list to find a matching entry, which could be
> concurrently modified when other domains are added/removed.
>
> Similarly, when soft RMIDs are enabled, it should not be possible to
> call resctrl_arch_rmid_read() outside of on the current CPU's HW RMID.
>
> I'll need to confirm whether it's safe to access the current CPU's
> rdt_domain in an atomic context. If it isn't, I assume I would have to
> arrange all of the state used during flush to be per-CPU.
>
> I expect the constraints on what data can be safely accessed where is
> going to constrain how the state is ultimately arranged, so I will
> need to settle this before I can come back to the other questions
> about mbm_state.

According to cpu_hotplug.rst, the startup callbacks are called before
a CPU is started and the teardown callbacks are called after the CPU
has become dysfunctional, so it should always be safe for a CPU to
access its own data. All I need to do here, then, is avoid walking
domain lists in resctrl_mbm_flush_cpu().

However, this also means that resctrl_{on,off}line_cpu() call
clear_closid_rmid() on a different CPU, so whichever CPU executes
these will zap its own pqr_state struct and PQR_ASSOC MSR.

-Peter
Peter Newman May 16, 2023, 2:27 p.m. UTC | #6
On Tue, May 16, 2023 at 4:18 PM Peter Newman <peternewman@google.com> wrote:
> According to cpu_hotplug.rst, the startup callbacks are called before
> a CPU is started and the teardown callbacks are called after the CPU
> has become dysfunctional, so it should always be safe for a CPU to
> access its own data, so all I need to do here is avoid walking domain
> lists in resctrl_mbm_flush_cpu().
>
> However, this also means that resctrl_{on,off}line_cpu() call
> clear_closid_rmid() on a different CPU, so whichever CPU executes
> these will zap its own pqr_state struct and PQR_ASSOC MSR.

Sorry, I read the wrong section. I was looking at PREPARE section
callbacks. ONLINE callbacks are called on the CPU, so calling
clear_closid_rmid() is fine.

It says the offline callback is called from the per-cpu hotplug
thread, so I'm not sure if that means another context switch is
possible after the teardown handler has run. To be safe, I can make
resctrl_mbm_flush_cpu() check to see if it still has its domain state.

-Peter
Reinette Chatre May 17, 2023, 12:05 a.m. UTC | #7
Hi Peter,

On 5/15/2023 7:42 AM, Peter Newman wrote:
> Hi Reinette,
> 
> On Fri, May 12, 2023 at 5:26 PM Reinette Chatre
> <reinette.chatre@intel.com> wrote:
>> On 5/12/2023 6:25 AM, Peter Newman wrote:
>>> On Thu, May 11, 2023 at 11:37 PM Reinette Chatre
>>> <reinette.chatre@intel.com> wrote:
>>>> On 4/21/2023 7:17 AM, Peter Newman wrote:
>>>>> Implement resctrl_mbm_flush_cpu(), which collects a domain's current MBM
>>>>> event counts into its current software RMID. The delta for each CPU is
>>>>> determined by tracking the previous event counts in per-CPU data.  The
>>>>> software byte counts reside in the arch-independent mbm_state
>>>>> structures.
>>>>
>>>> Could you elaborate why the arch-independent mbm_state was chosen?
>>>
>>> It largely had to do with how many soft RMIDs to implement. For our
>>> own needs, we were mainly concerned with getting back to the number of
>>> monitoring groups the hardware claimed to support, so there wasn't
>>> much internal motivation to support an unbounded number of soft RMIDs.
>>
>> Apologies for not being explicit, I was actually curious why the
>> arch-independent mbm_state, as opposed to the arch-dependent state, was
>> chosen.
>>
>> I think the lines are getting a bit blurry here with the software RMID
>> feature added as a resctrl filesystem feature (and thus non architectural),
>> but it is specific to AMD architecture.
> 
> The soft RMID solution applies conceptually to any system where the
> number of hardware counters is smaller than the number of desired
> monitoring groups, but at least as large as the number of CPUs. It's a
> solution we may need to rely on more in the future as it's easier for
> monitoring hardware to scale to the number of CPUs than (CPUs *
> mbm_domains). I believed the counts in bytes would apply to the user
> interface universally.
> 
> However, I did recently rebase these changes onto one of James's MPAM
> snapshot branches and __mbm_flush() did end up fitting better on the
> arch-dependent side, so I was forced to move the counters over to
> arch_mbm_state because on the snapshot branch the arch-dependent code
> cannot see the arch-independent mbm_state structure. I then created
> resctrl_arch-() helpers for __mon_event_count() to read the counts
> from the arch_mbm_state.
> 
> In hindsight, despite generic-looking code being able to retrieve the
> CPU counts with resctrl_arch_rmid_read(), the permanent assignment of
> a HW RMID to a CPU is an implementation-detail specific to the
> RDT/PQoS interface and would probably not align to a theoretical MPAM
> implementation.

Indeed. There are a couple of points in this work that blur the clear
boundary that the MPAM enabling is trying to establish. We should keep
this in mind when looking at how this should be solved so that it does
not create another item that needs to be untangled. A small but significant
first step may be to start referring to this as "soft mon id" or something
else that is generic, especially since this is proposed as a mount
option and thus tied to the filesystem.


>>>>> +/*
>>>>> + * Called from context switch code __resctrl_sched_in() when the current soft
>>>>> + * RMID is changing or before reporting event counts to user space.
>>>>> + */
>>>>> +void resctrl_mbm_flush_cpu(void)
>>>>> +{
>>>>> +     struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
>>>>> +     int cpu = smp_processor_id();
>>>>> +     struct rdt_domain *d;
>>>>> +
>>>>> +     d = get_domain_from_cpu(cpu, r);
>>>>> +     if (!d)
>>>>> +             return;
>>>>> +
>>>>> +     if (is_mbm_local_enabled())
>>>>> +             __mbm_flush(QOS_L3_MBM_LOCAL_EVENT_ID, r, d);
>>>>> +     if (is_mbm_total_enabled())
>>>>> +             __mbm_flush(QOS_L3_MBM_TOTAL_EVENT_ID, r, d);
>>>>> +}
>>>>
>>>> This (potentially) adds two MSR writes and two MSR reads to what could possibly
>>>> be quite slow MSRs if it was not designed to be used in context switch. Do you
>>>> perhaps have data on how long these MSR reads/writes take on these systems to get
>>>> an idea about the impact on context switch? I think this data should feature
>>>> prominently in the changelog.
>>>
>>> I can probably use ftrace to determine the cost of an __rmid_read()
>>> call on a few implementations.
>>
>> On a lower level I think it may be interesting to measure more closely
>> just how long a wrmsr and rdmsr take on these registers. It may be interesting
>> if you, for example, use rdtsc_ordered() before and after these calls, and then
>> compare it to how long it takes to write the PQR register that has been
>> designed to be used in context switch.
>>
>>> To understand the overall impact to context switch, I can put together
>>> a scenario where I can control whether the context switches being
>>> measured result in change of soft RMID to prevent the data from being
>>> diluted by non-flushing switches.
>>
>> This sounds great. Thank you very much.
> 
> I used a simple parent-child pipe loop benchmark with the parent in
> one monitoring group and the child in another to trigger 2M
> context-switches on the same CPU and compared the sample-based
> profiles on an AMD and Intel implementation. I used perf diff to
> compare the samples between hard and soft RMID switches.
> 
> Intel(R) Xeon(R) Platinum 8173M CPU @ 2.00GHz:
> 
>               +44.80%  [kernel.kallsyms]  [k] __rmid_read
>     10.43%     -9.52%  [kernel.kallsyms]  [k] __switch_to
> 
> AMD EPYC 7B12 64-Core Processor:
> 
>               +28.27%  [kernel.kallsyms]  [k] __rmid_read
>     13.45%    -13.44%  [kernel.kallsyms]  [k] __switch_to
> 
> Note that a soft RMID switch that doesn't change CLOSID skips the
> PQR_ASSOC write completely, so from this data I can roughly say that
> __rmid_read() is a little over 2x the length of a PQR_ASSOC write that
> changes the current RMID on the AMD implementation and about 4.5x
> longer on Intel.
> 
> Let me know if this clarifies the cost enough or if you'd like to also
> see instrumented measurements on the individual WRMSR/RDMSR
> instructions.

I can see from the data the portion of total time spent in __rmid_read().
It is not clear to me what the impact on a context switch is. Is it
possible to say with this data that this solution makes a context switch
x% slower?

I think it may be optimistic to view this as a replacement of a PQR write.
As you point out, that requires that a CPU switches between tasks with the
same CLOSID. You demonstrate that resctrl already contributes a significant
delay to __switch_to - this work will increase that much more, it has to
be clear about this impact and motivate that it is acceptable.

Reinette
Peter Newman June 1, 2023, 2:45 p.m. UTC | #8
Hi Reinette,

On Thu, May 11, 2023 at 11:37 PM Reinette Chatre
<reinette.chatre@intel.com> wrote:
> On 4/21/2023 7:17 AM, Peter Newman wrote:
> > +     /* Count bandwidth after the first successful counter read. */
> > +     if (counter->initialized) {
> > +             /* Assume that mbm_update() will prevent double-overflows. */
> > +             if (val != counter->prev_bytes)
> > +                     atomic64_add(val - counter->prev_bytes,
> > +                                  &m->soft_rmid_bytes);
> > +     } else {
> > +             counter->initialized = true;
> > +     }
> > +
> > +     counter->prev_bytes = val;
>
> I notice a lot of similarities between the above and the software controller,
> see mbm_bw_count().

I see the "a=now(); a-b; b=a;" and the not handling overflow parts
being similar, but the use of the initialized flag seems quite
different from delta_comp.

Also mbm_state is on the arch-independent side and the new code is
going to the arch-dependent side, so it wouldn't be convenient to try
to use the mbm_bw structures for this.

From this, I don't think trying to reuse this is worth it unless you
have other suggestions.

-Peter
Reinette Chatre June 1, 2023, 5:14 p.m. UTC | #9
Hi Peter,

On 6/1/2023 7:45 AM, Peter Newman wrote:
> Hi Reinette,
> 
> On Thu, May 11, 2023 at 11:37 PM Reinette Chatre
> <reinette.chatre@intel.com> wrote:
>> On 4/21/2023 7:17 AM, Peter Newman wrote:
>>> +     /* Count bandwidth after the first successful counter read. */
>>> +     if (counter->initialized) {
>>> +             /* Assume that mbm_update() will prevent double-overflows. */
>>> +             if (val != counter->prev_bytes)
>>> +                     atomic64_add(val - counter->prev_bytes,
>>> +                                  &m->soft_rmid_bytes);
>>> +     } else {
>>> +             counter->initialized = true;
>>> +     }
>>> +
>>> +     counter->prev_bytes = val;
>>
>> I notice a lot of similarities between the above and the software controller,
>> see mbm_bw_count().
> 
> I see the "a=now(); a-b; b=a;" and the not handling overflow parts
> being similar, but the use of the initialized flag seems quite
> different from delta_comp.
> 
> Also mbm_state is on the arch-independent side and the new code is
> going to the arch-dependent side, so it wouldn't be convenient to try
> to use the mbm_bw structures for this.
> 
> From this, I don't think trying to reuse this is worth it unless you
> have other suggestions.

At this time I am staring at mbm_state->prev_bw_bytes and mbm_soft_counter->prev_bytes
and am concerned about how much confusion this would generate. Considering the
pending changes to the data structures, I hope this will be clearer then.

Reinette
Peter Newman Dec. 1, 2023, 8:56 p.m. UTC | #10
Hi Reinette,

On Tue, May 16, 2023 at 5:06 PM Reinette Chatre
<reinette.chatre@intel.com> wrote:
> On 5/15/2023 7:42 AM, Peter Newman wrote:
> >
> > I used a simple parent-child pipe loop benchmark with the parent in
> > one monitoring group and the child in another to trigger 2M
> > context-switches on the same CPU and compared the sample-based
> > profiles on an AMD and Intel implementation. I used perf diff to
> > compare the samples between hard and soft RMID switches.
> >
> > Intel(R) Xeon(R) Platinum 8173M CPU @ 2.00GHz:
> >
> >               +44.80%  [kernel.kallsyms]  [k] __rmid_read
> >     10.43%     -9.52%  [kernel.kallsyms]  [k] __switch_to
> >
> > AMD EPYC 7B12 64-Core Processor:
> >
> >               +28.27%  [kernel.kallsyms]  [k] __rmid_read
> >     13.45%    -13.44%  [kernel.kallsyms]  [k] __switch_to
> >
> > Note that a soft RMID switch that doesn't change CLOSID skips the
> > PQR_ASSOC write completely, so from this data I can roughly say that
> > __rmid_read() is a little over 2x the length of a PQR_ASSOC write that
> > changes the current RMID on the AMD implementation and about 4.5x
> > longer on Intel.
> >
> > Let me know if this clarifies the cost enough or if you'd like to also
> > see instrumented measurements on the individual WRMSR/RDMSR
> > instructions.
>
> I can see from the data the portion of total time spent in __rmid_read().
> It is not clear to me what the impact on a context switch is. Is it
> possible to say with this data that: this solution makes a context switch
> x% slower?
>
> I think it may be optimistic to view this as a replacement of a PQR write.
> As you point out, that requires that a CPU switches between tasks with the
> same CLOSID. You demonstrate that resctrl already contributes a significant
> delay to __switch_to - this work will increase that much more, it has to
> be clear about this impact and motivate that it is acceptable.

We were operating under the assumption that if the overhead wasn't
acceptable, we would have heard complaints about it by now, but we
ultimately learned that this feature wasn't deployed as much as we had
originally thought on AMD hardware and that the overhead does need to
be addressed.

I am interested in your opinion on two options I'm exploring to
mitigate the overhead, both of which depend on an API like the one
Babu recently proposed for the AMD ABMC feature [1], where a new file
interface will allow the user to indicate which mon_groups are
actively being measured. I will refer to this as "assigned" for now,
as that's the current proposal.

The first is likely the simpler approach: only read MBM event counters
which have been marked as "assigned" in the filesystem to avoid paying
the context switch cost on tasks in groups which are not actively
being measured. In our use case, we calculate memory bandwidth on
every group every few minutes by reading the counters twice, 5 seconds
apart. We would just need counters read during this 5-second window.
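
(For reference, the arithmetic over that window is just bandwidth =
(bytes_at_t1 - bytes_at_t0) / 5 seconds per group; nothing beyond the two
raw byte counts per group is needed from the kernel during the window.)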

The second involves avoiding the situation where a hardware counter
could be deallocated: Determine the number of simultaneous RMIDs
supported, reduce the effective number of RMIDs available to that
number. Use the default RMID (0) for all "unassigned" monitoring
groups and report "Unavailable" on all counter reads (and address the
default monitoring group's counts being unreliable). When assigned,
attempt to allocate one of the remaining, usable RMIDs to that group.
It would only be possible to assign all event counters (local, total,
occupancy) at the same time. Using this approach, we would no longer
be able to measure all groups at the same time, but this is something
we would already be accepting when using the AMD ABMC feature.

While the second feature is a lot more disruptive at the filesystem
layer, it does eliminate the added context switch overhead. Also, it
may be helpful in the long run for the filesystem code to start taking
a more abstract view of hardware monitoring resources, given that few
implementations can afford to assign hardware to all monitoring IDs
all the time. In both cases, the meaning of "assigned" could vary
greatly, even among AMD implementations.

Thanks!
-Peter

[1] https://lore.kernel.org/lkml/20231201005720.235639-1-babu.moger@amd.com/
Reinette Chatre Dec. 5, 2023, 9:57 p.m. UTC | #11
Hi Peter,

On 12/1/2023 12:56 PM, Peter Newman wrote:
> Hi Reinette,
> 
> On Tue, May 16, 2023 at 5:06 PM Reinette Chatre
> <reinette.chatre@intel.com> wrote:
>> On 5/15/2023 7:42 AM, Peter Newman wrote:
>>>
>>> I used a simple parent-child pipe loop benchmark with the parent in
>>> one monitoring group and the child in another to trigger 2M
>>> context-switches on the same CPU and compared the sample-based
>>> profiles on an AMD and Intel implementation. I used perf diff to
>>> compare the samples between hard and soft RMID switches.
>>>
>>> Intel(R) Xeon(R) Platinum 8173M CPU @ 2.00GHz:
>>>
>>>               +44.80%  [kernel.kallsyms]  [k] __rmid_read
>>>     10.43%     -9.52%  [kernel.kallsyms]  [k] __switch_to
>>>
>>> AMD EPYC 7B12 64-Core Processor:
>>>
>>>               +28.27%  [kernel.kallsyms]  [k] __rmid_read
>>>     13.45%    -13.44%  [kernel.kallsyms]  [k] __switch_to
>>>
>>> Note that a soft RMID switch that doesn't change CLOSID skips the
>>> PQR_ASSOC write completely, so from this data I can roughly say that
>>> __rmid_read() is a little over 2x the length of a PQR_ASSOC write that
>>> changes the current RMID on the AMD implementation and about 4.5x
>>> longer on Intel.
>>>
>>> Let me know if this clarifies the cost enough or if you'd like to also
>>> see instrumented measurements on the individual WRMSR/RDMSR
>>> instructions.
>>
>> I can see from the data the portion of total time spent in __rmid_read().
>> It is not clear to me what the impact on a context switch is. Is it
>> possible to say with this data that: this solution makes a context switch
>> x% slower?
>>
>> I think it may be optimistic to view this as a replacement of a PQR write.
>> As you point out, that requires that a CPU switches between tasks with the
>> same CLOSID. You demonstrate that resctrl already contributes a significant
>> delay to __switch_to - this work will increase that much more, it has to
>> be clear about this impact and motivate that it is acceptable.
> 
> We were operating under the assumption that if the overhead wasn't
> acceptable, we would have heard complaints about it by now, but we
> ultimately learned that this feature wasn't deployed as much as we had
> originally thought on AMD hardware and that the overhead does need to
> be addressed.
> 
> I am interested in your opinion on two options I'm exploring to
> mitigate the overhead, both of which depend on an API like the one
> Babu recently proposed for the AMD ABMC feature [1], where a new file
> interface will allow the user to indicate which mon_groups are
> actively being measured. I will refer to this as "assigned" for now,
> as that's the current proposal.
> 
> The first is likely the simpler approach: only read MBM event counters
> which have been marked as "assigned" in the filesystem to avoid paying
> the context switch cost on tasks in groups which are not actively
> being measured. In our use case, we calculate memory bandwidth on
> every group every few minutes by reading the counters twice, 5 seconds
> apart. We would just need counters read during this 5-second window.

I assume that tasks within a monitoring group can be scheduled on any
CPU and from the cover letter of this work I understand that only an
RMID assigned to a processor can be guaranteed to be tracked by hardware.

Are you proposing for this option that you keep this "soft RMID" approach
with CPUs permanently assigned a "hard RMID" but only update the counts for a
"soft RMID" that is "assigned"? I think that means that the context
switch cost for the monitored group would increase even more than with the
implementation in this series since the counters need to be read on context
switch in as well as context switch out.

If I understand correctly then only one monitoring group can be measured
at a time. If such a measurement takes 5 seconds then theoretically 12 groups
can be measured in one minute. It may be possible to create many more
monitoring groups than this. Would it be possible to reach monitoring
goals in your environment?

> 
> The second involves avoiding the situation where a hardware counter
> could be deallocated: Determine the number of simultaneous RMIDs
> supported, reduce the effective number of RMIDs available to that
> number. Use the default RMID (0) for all "unassigned" monitoring

hmmm ... so on the one side there is "only the RMID within the PQR
register can be guaranteed to be tracked by hardware" and on the 
other side there is "A given implementation may have insufficient
hardware to simultaneously track the bandwidth for all RMID values
that the hardware supports."

From the above there seems to be something in the middle where
some subset of the RMID values supported by hardware can be used
to simultaneously track bandwidth? How can it be determined
what this number of RMID values is?

> groups and report "Unavailable" on all counter reads (and address the
> default monitoring group's counts being unreliable). When assigned,
> attempt to allocate one of the remaining, usable RMIDs to that group.
> It would only be possible to assign all event counters (local, total,
> occupancy) at the same time. Using this approach, we would no longer
> be able to measure all groups at the same time, but this is something
> we would already be accepting when using the AMD ABMC feature.

It may be possible to turn this into a "fake"/"software" ABMC feature,
which I expect needs to be renamed to move it away from a hardware-specific
feature to something that better reflects how the user interacts
with the system and how the system responds.

> 
> While the second feature is a lot more disruptive at the filesystem
> layer, it does eliminate the added context switch overhead. Also, it

Which changes to the filesystem layer are you anticipating?

> may be helpful in the long run for the filesystem code to start taking
> a more abstract view of hardware monitoring resources, given that few
> implementations can afford to assign hardware to all monitoring IDs
> all the time. In both cases, the meaning of "assigned" could vary
> greatly, even among AMD implementations.
> 
> Thanks!
> -Peter
> 
> [1] https://lore.kernel.org/lkml/20231201005720.235639-1-babu.moger@amd.com/


Reinette
Peter Newman Dec. 6, 2023, 12:33 a.m. UTC | #12
Hi Reinette,

On Tue, Dec 5, 2023 at 1:57 PM Reinette Chatre
<reinette.chatre@intel.com> wrote:
> On 12/1/2023 12:56 PM, Peter Newman wrote:
> > On Tue, May 16, 2023 at 5:06 PM Reinette Chatre
> >> I think it may be optimistic to view this as a replacement of a PQR write.
> >> As you point out, that requires that a CPU switches between tasks with the
> >> same CLOSID. You demonstrate that resctrl already contributes a significant
> >> delay to __switch_to - this work will increase that much more, it has to
> >> be clear about this impact and motivate that it is acceptable.
> >
> > We were operating under the assumption that if the overhead wasn't
> > acceptable, we would have heard complaints about it by now, but we
> > ultimately learned that this feature wasn't deployed as much as we had
> > originally thought on AMD hardware and that the overhead does need to
> > be addressed.
> >
> > I am interested in your opinion on two options I'm exploring to
> > mitigate the overhead, both of which depend on an API like the one
> > Babu recently proposed for the AMD ABMC feature [1], where a new file
> > interface will allow the user to indicate which mon_groups are
> > actively being measured. I will refer to this as "assigned" for now,
> > as that's the current proposal.
> >
> > The first is likely the simpler approach: only read MBM event counters
> > which have been marked as "assigned" in the filesystem to avoid paying
> > the context switch cost on tasks in groups which are not actively
> > being measured. In our use case, we calculate memory bandwidth on
> > every group every few minutes by reading the counters twice, 5 seconds
> > apart. We would just need counters read during this 5-second window.
>
> I assume that tasks within a monitoring group can be scheduled on any
> CPU and from the cover letter of this work I understand that only an
> RMID assigned to a processor can be guaranteed to be tracked by hardware.
>
> Are you proposing for this option that you keep this "soft RMID" approach
> with CPUs  permanently assigned a "hard RMID" but only update the counts for a
> "soft RMID" that is "assigned"?

Yes

> I think that means that the context
> switch cost for the monitored group would increase even more than with the
> implementation in this series since the counters need to be read on context
> switch in as well as context switch out.
>
> If I understand correctly then only one monitoring group can be measured
> at a time. If such a measurement takes 5 seconds then theoretically 12 groups
> can be measured in one minute. It may be possible to create many more
> monitoring groups than this. Would it be possible to reach monitoring
> goals in your environment?

We actually measure all of the groups at the same time, so thinking
about this more, the proposed ABMC fix isn't actually a great fit: the
user would have to assign all groups individually when a global
setting would have been fine.

Ignoring any present-day resctrl interfaces, what we minimally need is...

1. global "start measurement", which enables a
read-counters-on-context switch flag, and broadcasts an IPI to all
CPUs to read their current count
2. wait 5 seconds
3. global "end measurement", to IPI all CPUs again for final counts
and clear the flag from step 1

Then the user could read at their leisure all the (frozen) event
counts from memory until the next measurement begins.
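
In rough kernel-code terms, something like the following (hypothetical
names just to illustrate the shape; resctrl_mbm_flush_cpu() is the per-CPU
flush from this series):

static bool mbm_flush_on_switch;	/* checked in __resctrl_sched_in() */

static void mbm_measure_ipi(void *unused)
{
	resctrl_mbm_flush_cpu();	/* snapshot this CPU's hardware counts */
}

void resctrl_mbm_start_measurement(void)
{
	WRITE_ONCE(mbm_flush_on_switch, true);
	on_each_cpu(mbm_measure_ipi, NULL, 1);	/* step 1: baseline counts */
}

void resctrl_mbm_end_measurement(void)
{
	on_each_cpu(mbm_measure_ipi, NULL, 1);	/* step 3: final counts */
	WRITE_ONCE(mbm_flush_on_switch, false);
}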

In our case, if we're measuring as often as 5 seconds for every
minute, that will already be a 12x aggregate reduction in overhead,
which would be worthwhile enough.

>
> >
> > The second involves avoiding the situation where a hardware counter
> > could be deallocated: Determine the number of simultaneous RMIDs
> > supported, reduce the effective number of RMIDs available to that
> > number. Use the default RMID (0) for all "unassigned" monitoring
>
> hmmm ... so on the one side there is "only the RMID within the PQR
> register can be guaranteed to be tracked by hardware" and on the
> other side there is "A given implementation may have insufficient
> hardware to simultaneously track the bandwidth for all RMID values
> that the hardware supports."
>
> From the above there seems to be something in the middle where
> some subset of the RMID values supported by hardware can be used
> to simultaneously track bandwidth? How can it be determined
> what this number of RMID values is?

In the context of AMD, we could use the smallest number of CPUs in any
L3 domain as a lower bound of the number of counters.
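
Deriving that lower bound could look something like this (a sketch against
the current rdt_resource/rdt_domain layout, assuming the domain list is
stable, e.g. rdtgroup_mutex held; min_cpus_per_l3_domain() is a made-up
name):

static unsigned int min_cpus_per_l3_domain(struct rdt_resource *r)
{
	unsigned int min_cpus = UINT_MAX;
	struct rdt_domain *d;

	/* the smallest CPU count in any L3 domain bounds the counters */
	list_for_each_entry(d, &r->domains, list)
		min_cpus = min(min_cpus, cpumask_weight(&d->cpu_mask));

	return min_cpus;
}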

If the number is actually higher, it's not too difficult to probe at
runtime. The technique used by the test script[1] reliably identifies
the number of counters, but some experimentation would be needed to
see how quickly the hardware will repurpose a counter, as the script
today is using way too long of a workload for the kernel to be
invoking.

Maybe a reasonable compromise would be to initialize the HW counter
estimate at the CPUs-per-domain value and add a file node to let the
user increase it if they have better information. The worst that can
happen is the present-day behavior.

>
> > groups and report "Unavailable" on all counter reads (and address the
> > default monitoring group's counts being unreliable). When assigned,
> > attempt to allocate one of the remaining, usable RMIDs to that group.
> > It would only be possible to assign all event counters (local, total,
> > occupancy) at the same time. Using this approach, we would no longer
> > be able to measure all groups at the same time, but this is something
> > we would already be accepting when using the AMD ABMC feature.
>
> It may be possible to turn this into a "fake"/"software" ABMC feature,
> which I expect needs to be renamed to move it away from a hardware
> specific feature to something that better reflects how user interacts
> with system and how the system responds.

Given the similarities in monitoring with ABMC and MPAM, I would want
to see the interface generalized anyways.


>
> >
> > While the second feature is a lot more disruptive at the filesystem
> > layer, it does eliminate the added context switch overhead. Also, it
>
> Which changes to filesystem layer are you anticipating?

Roughly speaking...

1. The proposed "assign" interface would have to become more indirect
so that the filesystem does not need to understand how assign could be
implemented on various platforms.
2. RMID management would have to change, because this would introduce
the option where creating monitoring groups no longer allocates an
RMID. It may be cleaner for the
filesystem to just track whether a group has allocated monitoring
resources or not and let a lower layer understand what the resources
actually are. (and in the default mode, groups can only be created
with pre-allocated resources)

If I get the impression that this is the better approach, I'll build a
prototype on top of the ABMC patches to see how it would go.

So far it seems only the second approach (software ABMC) really ties
in with Babu's work.

Thanks!
-Peter

[1] https://lore.kernel.org/all/20230421141723.2405942-2-peternewman@google.com/
Reinette Chatre Dec. 6, 2023, 1:46 a.m. UTC | #13
Hi Peter,

On 12/5/2023 4:33 PM, Peter Newman wrote:
> On Tue, Dec 5, 2023 at 1:57 PM Reinette Chatre
> <reinette.chatre@intel.com> wrote:
>> On 12/1/2023 12:56 PM, Peter Newman wrote:
>>> On Tue, May 16, 2023 at 5:06 PM Reinette Chatre
>>>> I think it may be optimistic to view this as a replacement of a PQR write.
>>>> As you point out, that requires that a CPU switches between tasks with the
>>>> same CLOSID. You demonstrate that resctrl already contributes a significant
>>>> delay to __switch_to - this work will increase that much more, it has to
>>>> be clear about this impact and motivate that it is acceptable.
>>>
>>> We were operating under the assumption that if the overhead wasn't
>>> acceptable, we would have heard complaints about it by now, but we
>>> ultimately learned that this feature wasn't deployed as much as we had
>>> originally thought on AMD hardware and that the overhead does need to
>>> be addressed.
>>>
>>> I am interested in your opinion on two options I'm exploring to
>>> mitigate the overhead, both of which depend on an API like the one
>>> Babu recently proposed for the AMD ABMC feature [1], where a new file
>>> interface will allow the user to indicate which mon_groups are
>>> actively being measured. I will refer to this as "assigned" for now,
>>> as that's the current proposal.
>>>
>>> The first is likely the simpler approach: only read MBM event counters
>>> which have been marked as "assigned" in the filesystem to avoid paying
>>> the context switch cost on tasks in groups which are not actively
>>> being measured. In our use case, we calculate memory bandwidth on
>>> every group every few minutes by reading the counters twice, 5 seconds
>>> apart. We would just need counters read during this 5-second window.
>>
>> I assume that tasks within a monitoring group can be scheduled on any
>> CPU and from the cover letter of this work I understand that only an
>> RMID assigned to a processor can be guaranteed to be tracked by hardware.
>>
>> Are you proposing for this option that you keep this "soft RMID" approach
>> with CPUs  permanently assigned a "hard RMID" but only update the counts for a
>> "soft RMID" that is "assigned"?
> 
> Yes
> 
>> I think that means that the context
>> switch cost for the monitored group would increase even more than with the
>> implementation in this series since the counters need to be read on context
>> switch in as well as context switch out.
>>
>> If I understand correctly then only one monitoring group can be measured
>> at a time. If such a measurement takes 5 seconds then theoretically 12 groups
>> can be measured in one minute. It may be possible to create many more
>> monitoring groups than this. Would it be possible to reach monitoring
>> goals in your environment?
> 
> We actually measure all of the groups at the same time, so thinking
> about this more, the proposed ABMC fix isn't actually a great fit: the
> user would have to assign all groups individually when a global
> setting would have been fine.
> 
> Ignoring any present-day resctrl interfaces, what we minimally need is...
> 
> 1. global "start measurement", which enables a
> read-counters-on-context switch flag, and broadcasts an IPI to all
> CPUs to read their current count
> 2. wait 5 seconds
> 3. global "end measurement", to IPI all CPUs again for final counts
> and clear the flag from step 1
> 
> Then the user could read at their leisure all the (frozen) event
> counts from memory until the next measurement begins.
> 
> In our case, if we're measuring as often as 5 seconds for every
> minute, that will already be a 12x aggregate reduction in overhead,
> which would be worthwhile enough.

The "con" here would be that during those 5 seconds (which I assume would be
controlled via user space so potentially shorter or longer) all tasks in the
system is expected to have significant (but yet to be measured) impact
on context switch delay.
I expect the overflow handler should only be run during the measurement
timeframe, to not defeat the "at their leisure" reading of counters.

>>> The second involves avoiding the situation where a hardware counter
>>> could be deallocated: Determine the number of simultaneous RMIDs
>>> supported, reduce the effective number of RMIDs available to that
>>> number. Use the default RMID (0) for all "unassigned" monitoring
>>
>> hmmm ... so on the one side there is "only the RMID within the PQR
>> register can be guaranteed to be tracked by hardware" and on the
>> other side there is "A given implementation may have insufficient
>> hardware to simultaneously track the bandwidth for all RMID values
>> that the hardware supports."
>>
>> From the above there seems to be something in the middle where
>> some subset of the RMID values supported by hardware can be used
>> to simultaneously track bandwidth? How can it be determined
>> what this number of RMID values is?
> 
> In the context of AMD, we could use the smallest number of CPUs in any
> L3 domain as a lower bound of the number of counters.

Could you please elaborate on this? (With the numbers of CPUs nowadays this
may be many RMIDs, perhaps even more than what ABMC supports.)

I am missing something here since it is not obvious to me how this lower
bound is determined. Let's assume that there are as many monitor groups
(and thus as many assigned RMIDs) as there are CPUs in a L3 domain.
Each monitor group may have many tasks. It can be expected that at any
moment in time only a subset of assigned RMIDs are assigned to CPUs
via the CPUs' PQR registers. Of those RMIDs that are not assigned to
CPUs, how can it be certain that they continue to be tracked by hardware?

> If the number is actually higher, it's not too difficult to probe at
> runtime. The technique used by the test script[1] reliably identifies
> the number of counters, but some experimentation would be needed to
> see how quickly the hardware will repurpose a counter, as the script
> today is using way too long of a workload for the kernel to be
> invoking.
> 
> Maybe a reasonable compromise would be to initialize the HW counter
> estimate at the CPUs-per-domain value and add a file node to let the
> user increase it if they have better information. The worst that can
> happen is the present-day behavior.
> 
>>
>>> groups and report "Unavailable" on all counter reads (and address the
>>> default monitoring group's counts being unreliable). When assigned,
>>> attempt to allocate one of the remaining, usable RMIDs to that group.
>>> It would only be possible to assign all event counters (local, total,
>>> occupancy) at the same time. Using this approach, we would no longer
>>> be able to measure all groups at the same time, but this is something
>>> we would already be accepting when using the AMD ABMC feature.
>>
>> It may be possible to turn this into a "fake"/"software" ABMC feature,
>> which I expect needs to be renamed to move it away from a hardware
>> specific feature to something that better reflects how user interacts
>> with system and how the system responds.
> 
> Given the similarities in monitoring with ABMC and MPAM, I would want
> to see the interface generalized anyways.
> 
> 
>>
>>>
>>> While the second feature is a lot more disruptive at the filesystem
>>> layer, it does eliminate the added context switch overhead. Also, it
>>
>> Which changes to filesystem layer are you anticipating?
> 
> Roughly speaking...
> 
> 1. The proposed "assign" interface would have to become more indirect
> to avoid understanding how assign could be implemented on various
> platforms.

It is almost starting to sound like we could learn from the tracing
interface where individual events can be enabled/disabled ... with several
events potentially enabled with an "enable" done higher in hierarchy, perhaps
even globally to support the first approach ...

> 2. RMID management would have to change, because this would introduce
> the option where creating monitoring groups no longer allocates an
> RMID. It may be cleaner for the
> filesystem to just track whether a group has allocated monitoring
> resources or not and let a lower layer understand what the resources
> actually are. (and in the default mode, groups can only be created
> with pre-allocated resources)

This matches my understanding of what MPAM would need.

> If I get the impression that this is the better approach, I'll build a
> prototype on top of the ABMC patches to see how it would go.
> 
> So far it seems only the second approach (software ABMC) really ties
> in with Babu's work.
> 
> Thanks!
> -Peter
> 
> [1] https://lore.kernel.org/all/20230421141723.2405942-2-peternewman@google.com/

Reinette
Peter Newman Dec. 6, 2023, 6:38 p.m. UTC | #14
Hi Reinette,

On Tue, Dec 5, 2023 at 5:47 PM Reinette Chatre
<reinette.chatre@intel.com> wrote:
>
> On 12/5/2023 4:33 PM, Peter Newman wrote:
> > On Tue, Dec 5, 2023 at 1:57 PM Reinette Chatre
> > <reinette.chatre@intel.com> wrote:
> >> On 12/1/2023 12:56 PM, Peter Newman wrote:

> > Ignoring any present-day resctrl interfaces, what we minimally need is...
> >
> > 1. global "start measurement", which enables a
> > read-counters-on-context switch flag, and broadcasts an IPI to all
> > CPUs to read their current count
> > 2. wait 5 seconds
> > 3. global "end measurement", to IPI all CPUs again for final counts
> > and clear the flag from step 1
> >
> > Then the user could read at their leisure all the (frozen) event
> > counts from memory until the next measurement begins.
> >
> > In our case, if we're measuring as often as 5 seconds for every
> > minute, that will already be a 12x aggregate reduction in overhead,
> > which would be worthwhile enough.
>
> The "con" here would be that during those 5 seconds (which I assume would be
> controlled via user space so potentially shorter or longer) all tasks in the
> system is expected to have significant (but yet to be measured) impact
> on context switch delay.

Yes, of course. In the worst case I've measured, Zen2, it's roughly a
1700-cycle context switch penalty (~20%) for tasks in different
monitoring groups. Bad, but the benefit we gain from the per-RMID MBM
data makes up for it several times over if we only pay the cost during a
measurement.

> I expect the overflow handler should only be run during the measurement
> timeframe, to not defeat the "at their leisure" reading of counters.

Yes, correct. We wouldn't be interested in overflows of the hardware
counter when not actively measuring bandwidth.
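
To make the measurement-window outline above concrete, here is a minimal
sketch of what steps 1 and 3 could look like (hypothetical; all names
other than resctrl_mbm_flush_cpu() are invented):

/* The read-counters-on-context-switch flag from step 1. */
static DEFINE_STATIC_KEY_FALSE(mbm_measurement_active);

static void mbm_measurement_ipi(void *unused)
{
	/* Snapshot this CPU's hardware RMID counters into its soft RMID. */
	resctrl_mbm_flush_cpu();
}

/* Step 1: begin a measurement window. */
void resctrl_mbm_start_measurement(void)
{
	static_branch_enable(&mbm_measurement_active);
	on_each_cpu(mbm_measurement_ipi, NULL, 1);
}

/* Step 3: end the window, freezing the soft RMID counts in memory. */
void resctrl_mbm_end_measurement(void)
{
	on_each_cpu(mbm_measurement_ipi, NULL, 1);
	static_branch_disable(&mbm_measurement_active);
}

__resctrl_sched_in() (and the overflow handler) would then only call
resctrl_mbm_flush_cpu() while mbm_measurement_active is set.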


>
> >>> The second involves avoiding the situation where a hardware counter
> >>> could be deallocated: Determine the number of simultaneous RMIDs
> >>> supported, reduce the effective number of RMIDs available to that
> >>> number. Use the default RMID (0) for all "unassigned" monitoring
> >>
> >> hmmm ... so on the one side there is "only the RMID within the PQR
> >> register can be guaranteed to be tracked by hardware" and on the
> >> other side there is "A given implementation may have insufficient
> >> hardware to simultaneously track the bandwidth for all RMID values
> >> that the hardware supports."
> >>
> >> From the above there seems to be something in the middle where
> >> some subset of the RMID values supported by hardware can be used
> >> to simultaneously track bandwidth? How can it be determined
> >> what this number of RMID values is?
> >
> > In the context of AMD, we could use the smallest number of CPUs in any
> > L3 domain as a lower bound of the number of counters.
>
> Could you please elaborate on this? (With the numbers of CPUs nowadays this
> may be many RMIDs, perhaps even more than what ABMC supports.)

I think the "In the context of AMD" part is key. This feature would only
be applicable to the AMD implementations we have today which do not
implement ABMC.  I believe the difficulties are unique to the topologies
of these systems: many small L3 domains per node with a relatively small
number of CPUs in each. If the L3 domains were large and few, simply
restricting the number of RMIDs and allocating on group creation as we
do today would probably be fine.

> I am missing something here since it is not obvious to me how this lower
> bound is determined. Let's assume that there are as many monitor groups
> (and thus as many assigned RMIDs) as there are CPUs in a L3 domain.
> Each monitor group may have many tasks. It can be expected that at any
> moment in time only a subset of assigned RMIDs are assigned to CPUs
> via the CPUs' PQR registers. Of those RMIDs that are not assigned to
> CPUs, how can it be certain that they continue to be tracked by hardware?

Are you asking whether the counters will ever be reclaimed proactively?
The behavior I've observed is that writing a new RMID into a PQR_ASSOC
register when all hardware counters in the domain are allocated will
trigger the reallocation.

However, I admit the wording in the PQoS spec[1] is only written to
support the permanent-assignment workaround in the current patch series:

"All RMIDs which are currently in use by one or more processors in the
QOS domain will be tracked. The hardware will always begin tracking a
new RMID value when it gets written to the PQR_ASSOC register of any of
the processors in the QOS domain and it is not already being tracked.
When the hardware begins tracking an RMID that it was not previously
tracking, it will clear the QM_CTR for all events in the new RMID."

I would need to confirm whether this is the case and request the
documentation be clarified if it is.


> >>>
> >>> While the second feature is a lot more disruptive at the filesystem
> >>> layer, it does eliminate the added context switch overhead. Also, it
> >>
> >> Which changes to filesystem layer are you anticipating?
> >
> > Roughly speaking...
> >
> > 1. The proposed "assign" interface would have to become more indirect
> > to avoid understanding how assign could be implemented on various
> > platforms.
>
> It is almost starting to sound like we could learn from the tracing
> interface where individual events can be enabled/disabled ... with several
> events potentially enabled with an "enable" done higher in hierarchy, perhaps
> even globally to support the first approach ...

Sorry, can you clarify the part about the tracing interface? Tracing to
support dynamic autoconfiguration of events?

Thanks!
-Peter


 [1] AMD64 Technology Platform Quality of Service Extensions, Revision: 1.03:
     https://bugzilla.kernel.org/attachment.cgi?id=301365
Reinette Chatre Dec. 6, 2023, 8:02 p.m. UTC | #15
Hi Peter,

On 12/6/2023 10:38 AM, Peter Newman wrote:
> Hi Reinette,
> 
> On Tue, Dec 5, 2023 at 5:47 PM Reinette Chatre
> <reinette.chatre@intel.com> wrote:
>>
>> On 12/5/2023 4:33 PM, Peter Newman wrote:
>>> On Tue, Dec 5, 2023 at 1:57 PM Reinette Chatre
>>> <reinette.chatre@intel.com> wrote:
>>>> On 12/1/2023 12:56 PM, Peter Newman wrote:
> 
>>> Ignoring any present-day resctrl interfaces, what we minimally need is...
>>>
>>> 1. global "start measurement", which enables a
>>> read-counters-on-context switch flag, and broadcasts an IPI to all
>>> CPUs to read their current count
>>> 2. wait 5 seconds
>>> 3. global "end measurement", to IPI all CPUs again for final counts
>>> and clear the flag from step 1
>>>
>>> Then the user could read at their leisure all the (frozen) event
>>> counts from memory until the next measurement begins.
>>>
>>> In our case, if we're measuring as often as 5 seconds for every
>>> minute, that will already be a 12x aggregate reduction in overhead,
>>> which would be worthwhile enough.
>>
>> The "con" here would be that during those 5 seconds (which I assume would be
>> controlled via user space so potentially shorter or longer) all tasks in the
>> system is expected to have significant (but yet to be measured) impact
>> on context switch delay.
> 
> Yes, of course. In the worst case I've measured, Zen2, it's roughly a
> 1700-cycle context switch penalty (~20%) for tasks in different
> monitoring groups. Bad, but the benefit we gain from the per-RMID MBM
> data makes up for it several times over if we only pay the cost during a
> measurement.

I see.

> 
>> I expect the overflow handler should only be run during the measurement
>> timeframe, to not defeat the "at their leisure" reading of counters.
> 
> Yes, correct. We wouldn't be interested in overflows of the hardware
> counter when not actively measuring bandwidth.
> 
> 
>>
>>>>> The second involves avoiding the situation where a hardware counter
>>>>> could be deallocated: Determine the number of simultaneous RMIDs
>>>>> supported, reduce the effective number of RMIDs available to that
>>>>> number. Use the default RMID (0) for all "unassigned" monitoring
>>>>
>>>> hmmm ... so on the one side there is "only the RMID within the PQR
>>>> register can be guaranteed to be tracked by hardware" and on the
>>>> other side there is "A given implementation may have insufficient
>>>> hardware to simultaneously track the bandwidth for all RMID values
>>>> that the hardware supports."
>>>>
>>>> From the above there seems to be something in the middle where
>>>> some subset of the RMID values supported by hardware can be used
>>>> to simultaneously track bandwidth? How can it be determined
>>>> what this number of RMID values is?
>>>
>>> In the context of AMD, we could use the smallest number of CPUs in any
>>> L3 domain as a lower bound of the number of counters.
>>
>> Could you please elaborate on this? (With the numbers of CPUs nowadays this
>> may be many RMIDs, perhaps even more than what ABMC supports.)
> 
> I think the "In the context of AMD" part is key. This feature would only
> be applicable to the AMD implementations we have today which do not
> implement ABMC.  I believe the difficulties are unique to the topologies
> of these systems: many small L3 domains per node with a relatively small
> number of CPUs in each. If the L3 domains were large and few, simply
> restricting the number of RMIDs and allocating on group creation as we
> do today would probably be fine.
> 
>> I am missing something here since it is not obvious to me how this lower
>> bound is determined. Let's assume that there are as many monitor groups
>> (and thus as many assigned RMIDs) as there are CPUs in a L3 domain.
>> Each monitor group may have many tasks. It can be expected that at any
>> moment in time only a subset of assigned RMIDs are assigned to CPUs
>> via the CPUs' PQR registers. Of those RMIDs that are not assigned to
>> CPUs, how can it be certain that they continue to be tracked by hardware?
> 
> Are you asking whether the counters will ever be reclaimed proactively?
> The behavior I've observed is that writing a new RMID into a PQR_ASSOC
> register when all hardware counters in the domain are allocated will
> trigger the reallocation.

"When all hardware counters in the domain are allocated" sounds like the
ideal scenario with the kernel knowing how many counters there are and
each counter is associated with a unique RMID. As long as kernel does not
attempt to monitor another RMID this would accurately monitor the
monitor groups with "assigned" RMID.

Adding support for hardware without specification and guaranteed
behavior can potentially run into unexpected scenarios.

For example, there is no guarantee on how the counters are assigned.
The OS and hardware may thus have different views of which hardware
counter is "free". The OS may write a new RMID to PQR_ASSOC believing
that a counter is available, while the hardware has its own allocation
mechanism and may reallocate a counter that is in use by an RMID that
the OS believes to be "assigned". I do not think anything prevents
hardware from doing this.


> However, I admit the wording in the PQoS spec[1] is only written to
> support the permanent-assignment workaround in the current patch series:
> 
> "All RMIDs which are currently in use by one or more processors in the
> QOS domain will be tracked. The hardware will always begin tracking a
> new RMID value when it gets written to the PQR_ASSOC register of any of
> the processors in the QOS domain and it is not already being tracked.
> When the hardware begins tracking an RMID that it was not previously
> tracking, it will clear the QM_CTR for all events in the new RMID."
> 
> I would need to confirm whether this is the case and request the
> documentation be clarified if it is.

Indeed. Once an RMID is "assigned", the expectation is that a counter
will be dedicated to it, but a PQR_ASSOC register may not see that
RMID for potentially long intervals. With only the above guarantees,
hardware would be within its rights to reallocate that RMID's counter
even if there are other counters that are "free" from the OS's
perspective.

>>>>> While the second feature is a lot more disruptive at the filesystem
>>>>> layer, it does eliminate the added context switch overhead. Also, it
>>>>
>>>> Which changes to filesystem layer are you anticipating?
>>>
>>> Roughly speaking...
>>>
>>> 1. The proposed "assign" interface would have to become more indirect
>>> to avoid understanding how assign could be implemented on various
>>> platforms.
>>
>> It is almost starting to sound like we could learn from the tracing
>> interface where individual events can be enabled/disabled ... with several
>> events potentially enabled with an "enable" done higher in hierarchy, perhaps
>> even globally to support the first approach ...
> 
> Sorry, can you clarify the part about the tracing interface? Tracing to
> support dynamic autoconfiguration of events?

I do not believe we are attempting to do anything revolutionary here, so
I would like to consider other interfaces that user space may already be
familiar and comfortable with. The first that came to mind was the
tracefs interface and how user space interacts with it to enable
trace events. tracefs provides an "enable" file at different levels of
the hierarchy that user space can use to enable tracing of all events
below that level. There is also the global "tracing_on" file that user
space can use to dynamically start/stop tracing without needing to
frequently enable/disable the events of interest.

I do see some parallels with the discussions we have been having. I am not
proposing that we adapt the tracefs interface, but instead that we can
perhaps learn from it.
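
For reference, the tracefs pieces I have in mind look like this:

  /sys/kernel/tracing/tracing_on                        (global on/off gate)
  /sys/kernel/tracing/events/enable                     (all events)
  /sys/kernel/tracing/events/sched/enable               (one subsystem)
  /sys/kernel/tracing/events/sched/sched_switch/enable  (a single event)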

Reinette
diff mbox series

Patch

diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h
index 255a78d9d906..e7acf118d770 100644
--- a/arch/x86/include/asm/resctrl.h
+++ b/arch/x86/include/asm/resctrl.h
@@ -13,6 +13,7 @@ 
  * @cur_closid:	The cached Class Of Service ID
  * @default_rmid:	The user assigned Resource Monitoring ID
  * @default_closid:	The user assigned cached Class Of Service ID
+ * @hw_rmid:	The permanently-assigned RMID when soft RMIDs are in use
  *
  * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
  * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
@@ -27,6 +28,7 @@  struct resctrl_pqr_state {
 	u32			cur_closid;
 	u32			default_rmid;
 	u32			default_closid;
+	u32			hw_rmid;
 };
 
 DECLARE_PER_CPU(struct resctrl_pqr_state, pqr_state);
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 02a062558c67..256eee05d447 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -298,12 +298,14 @@  struct rftype {
  * @prev_bw:	The most recent bandwidth in MBps
  * @delta_bw:	Difference between the current and previous bandwidth
  * @delta_comp:	Indicates whether to compute the delta_bw
+ * @soft_rmid_bytes: Recent bandwidth count in bytes when using soft RMIDs
  */
 struct mbm_state {
-	u64	prev_bw_bytes;
-	u32	prev_bw;
-	u32	delta_bw;
-	bool	delta_comp;
+	u64		prev_bw_bytes;
+	u32		prev_bw;
+	u32		delta_bw;
+	bool		delta_comp;
+	atomic64_t	soft_rmid_bytes;
 };
 
 /**
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 2de8397f91cd..3671100d3cc7 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -404,6 +404,84 @@  static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 rmid,
 	}
 }
 
+struct mbm_soft_counter {
+	u64	prev_bytes;
+	bool	initialized;
+};
+
+struct mbm_flush_state {
+	struct mbm_soft_counter local;
+	struct mbm_soft_counter total;
+};
+
+DEFINE_PER_CPU(struct mbm_flush_state, flush_state);
+
+/*
+ * flushes the value of the cpu_rmid to the current soft rmid
+ */
+static void __mbm_flush(int evtid, struct rdt_resource *r, struct rdt_domain *d)
+{
+	struct mbm_flush_state *state = this_cpu_ptr(&flush_state);
+	u32 soft_rmid = this_cpu_ptr(&pqr_state)->cur_rmid;
+	u32 hw_rmid = this_cpu_ptr(&pqr_state)->hw_rmid;
+	struct mbm_soft_counter *counter;
+	struct mbm_state *m;
+	u64 val;
+
+	/* cache occupancy events are disabled in this mode */
+	WARN_ON(!is_mbm_event(evtid));
+
+	if (evtid == QOS_L3_MBM_LOCAL_EVENT_ID) {
+		counter = &state->local;
+	} else {
+		WARN_ON(evtid != QOS_L3_MBM_TOTAL_EVENT_ID);
+		counter = &state->total;
+	}
+
+	/*
+	 * Propagate the value read from the hw_rmid assigned to the current CPU
+	 * into the "soft" rmid associated with the current task or CPU.
+	 */
+	m = get_mbm_state(d, soft_rmid, evtid);
+	if (!m)
+		return;
+
+	if (resctrl_arch_rmid_read(r, d, hw_rmid, evtid, &val))
+		return;
+
+	/* Count bandwidth after the first successful counter read. */
+	if (counter->initialized) {
+		/* Assume that mbm_update() will prevent double-overflows. */
+		if (val != counter->prev_bytes)
+			atomic64_add(val - counter->prev_bytes,
+				     &m->soft_rmid_bytes);
+	} else {
+		counter->initialized = true;
+	}
+
+	counter->prev_bytes = val;
+}
+
+/*
+ * Called from context switch code __resctrl_sched_in() when the current soft
+ * RMID is changing or before reporting event counts to user space.
+ */
+void resctrl_mbm_flush_cpu(void)
+{
+	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+	int cpu = smp_processor_id();
+	struct rdt_domain *d;
+
+	d = get_domain_from_cpu(cpu, r);
+	if (!d)
+		return;
+
+	if (is_mbm_local_enabled())
+		__mbm_flush(QOS_L3_MBM_LOCAL_EVENT_ID, r, d);
+	if (is_mbm_total_enabled())
+		__mbm_flush(QOS_L3_MBM_TOTAL_EVENT_ID, r, d);
+}
+
 static int __mon_event_count(u32 rmid, struct rmid_read *rr)
 {
 	struct mbm_state *m;