diff mbox series

[v2,24/24] selftests/resctrl: Rewrite Cache Allocation Technology (CAT) test

Message ID 20230418114506.46788-25-ilpo.jarvinen@linux.intel.com (mailing list archive)
State New
Headers show
Series selftests/resctrl: Fixes, cleanups, and rewritten CAT test | expand

Commit Message

Ilpo Järvinen April 18, 2023, 11:45 a.m. UTC
CAT test spawns two processes into two different control groups with
exclusive schemata. Both the processes alloc a buffer from memory
matching their allocated LLC block size and flush the entire buffer out
of caches. Since the processes are reading through the buffer only once
during the measurement and initially all the buffer was flushed, the
test isn't testing CAT.

Rewrite the CAT test to allocate a buffer sized to half of LLC. Then
perform a sequence of tests with different LLC alloc sizes starting
from half of the CBM bits down to 1-bit CBM. Flush the buffer before
each test and read the buffer twice. Observe the LLC misses on the
second read through the buffer. As the allocated LLC block gets smaller
and smaller, the LLC misses will become larger and larger giving a
strong signal on CAT working properly.

Suggested-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 tools/testing/selftests/resctrl/cache.c    |  20 +-
 tools/testing/selftests/resctrl/cat_test.c | 204 +++++++++------------
 2 files changed, 97 insertions(+), 127 deletions(-)

Comments

Reinette Chatre April 22, 2023, 12:32 a.m. UTC | #1
Hi Ilpo,

On 4/18/2023 4:45 AM, Ilpo Järvinen wrote:
> CAT test spawns two processes into two different control groups with
> exclusive schemata. Both the processes alloc a buffer from memory
> matching their allocated LLC block size and flush the entire buffer out
> of caches. Since the processes are reading through the buffer only once
> during the measurement and initially all the buffer was flushed, the
> test isn't testing CAT.
> 
> Rewrite the CAT test to allocated a buffer sized to half of LLC. Then

"allocated a buffer" -> "allocate a buffer" ?

> perform a sequence of tests with different LLC alloc sizes starting
> from half of the CBM bits down to 1-bit CBM. Flush the buffer before
> each test and read the buffer twice. Observe the LLC misses on the
> second read through the buffer. As the allocated LLC block gets smaller
> and smaller, the LLC misses will become larger and larger giving a
> strong signal on CAT working properly.

Since the changelog starts by describing the CAT test needing two
processes I think it would help to highlight that this test uses a
single process. I think it would also help to describe how the cache
is used by the rest while this test is running.

> 
> Suggested-by: Reinette Chatre <reinette.chatre@intel.com>
> Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
> ---
>  tools/testing/selftests/resctrl/cache.c    |  20 +-
>  tools/testing/selftests/resctrl/cat_test.c | 204 +++++++++------------
>  2 files changed, 97 insertions(+), 127 deletions(-)
> 
> diff --git a/tools/testing/selftests/resctrl/cache.c b/tools/testing/selftests/resctrl/cache.c
> index 7970239413da..64f08ba5edc2 100644
> --- a/tools/testing/selftests/resctrl/cache.c
> +++ b/tools/testing/selftests/resctrl/cache.c
> @@ -224,10 +224,10 @@ int measure_llc_resctrl(struct resctrl_val_param *param, int bm_pid)
>   */
>  int cat_val(struct resctrl_val_param *param)
>  {
> -	int memflush = 1, operation = 0, ret = 0;
>  	char *resctrl_val = param->resctrl_val;
>  	unsigned long llc_perf_miss = 0;
>  	pid_t bm_pid;
> +	int ret;
>  
>  	if (strcmp(param->filename, "") == 0)
>  		sprintf(param->filename, "stdio");
> @@ -245,6 +245,10 @@ int cat_val(struct resctrl_val_param *param)
>  	if (ret)
>  		return ret;
>  
> +	ret = alloc_buffer(param->span, 1);
> +	if (ret)
> +		return ret;
> +
>  	initialize_llc_perf();
>  
>  	/* Test runs until the callback setup() tells the test to stop. */
> @@ -256,17 +260,15 @@ int cat_val(struct resctrl_val_param *param)
>  		}
>  		if (ret < 0)
>  			break;
> +
> +		flush_buffer(param->span);
> +		use_buffer(param->span, 0, true);
> +
>  		ret = reset_enable_llc_perf(bm_pid, param->cpu_no);
>  		if (ret)
>  			break;
>  
> -		if (run_fill_buf(param->span, memflush, operation, true)) {
> -			fprintf(stderr, "Error-running fill buffer\n");
> -			ret = -1;
> -			break;
> -		}
> -
> -		sleep(1);
> +		use_buffer(param->span, 0, true);
>  
>  		/* Measure cache miss from perf */
>  		ret = get_llc_perf(&llc_perf_miss);
> @@ -279,6 +281,8 @@ int cat_val(struct resctrl_val_param *param)
>  			break;
>  	}
>  
> +	free_buffer();
> +
>  	return ret;
>  }
>  
> diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c
> index 4b505fdb35d7..85053829b9c5 100644
> --- a/tools/testing/selftests/resctrl/cat_test.c
> +++ b/tools/testing/selftests/resctrl/cat_test.c
> @@ -11,11 +11,12 @@
>  #include "resctrl.h"
>  #include <unistd.h>
>  
> -#define RESULT_FILE_NAME1	"result_cat1"
> -#define RESULT_FILE_NAME2	"result_cat2"
> -#define NUM_OF_RUNS		5
> -#define MAX_DIFF_PERCENT	4
> -#define MAX_DIFF		1000000
> +#define RESULT_FILE_NAME		"result_cat"
> +#define NUM_OF_RUNS			5
> +#define MIN_DIFF_PERCENT_PER_BIT	2

Could you please start a new trend that adds documentation
that explains what this constant means and how it was chosen?

> +
> +static unsigned long current_mask;
> +static long prev_avg_llc_val;
>  
>  /*
>   * Change schemata. Write schemata to specified
> @@ -28,13 +29,24 @@ static int cat_setup(struct resctrl_val_param *p)
>  	int ret = 0;
>  
>  	/* Run NUM_OF_RUNS times */
> -	if (p->num_of_runs >= NUM_OF_RUNS)
> -		return END_OF_TESTS;
> +	if (p->num_of_runs >= NUM_OF_RUNS) {
> +		/* Remove one bit from the consecutive block */
> +		current_mask &= current_mask >> 1;
> +		if (!current_mask)
> +			return END_OF_TESTS;
> +
> +		p->num_of_runs = 0;

This seems like a workaround to get the schemata to be written. It is
problematic since now p->num_of_runs no longer accurately reflects the
number of test runs. I was expecting this mask manipulation to be
in cat_val() so that it is clear how the test works instead of part
of the logic handled here.

> +	}
>  
>  	if (p->num_of_runs == 0) {
> -		sprintf(schemata, "%lx", p->mask);
> -		ret = write_schemata(p->ctrlgrp, schemata, p->cpu_no,
> -				     p->resctrl_val);
> +		snprintf(schemata, sizeof(schemata), "%lx", p->mask & ~current_mask);
> +		ret = write_schemata("", schemata, p->cpu_no, p->resctrl_val);
> +		if (ret)
> +			return ret;
> +		snprintf(schemata, sizeof(schemata), "%lx", current_mask);
> +		ret = write_schemata(p->ctrlgrp, schemata, p->cpu_no, p->resctrl_val);
> +		if (ret)
> +			return ret;
>  	}
>  	p->num_of_runs++;
>  

...

> @@ -126,7 +162,7 @@ int cat_perf_miss_val(int cpu_no, int n, char *cache_type)
>  	ret = get_mask_no_shareable(cache_type, &long_mask);
>  	if (ret)
>  		return ret;
> -	count_of_bits = count_consecutive_bits(long_mask, NULL);
> +	count_of_bits = count_consecutive_bits(long_mask, &start);
>  
>  	/* Get L3/L2 cache size */
>  	ret = get_cache_size(cpu_no, cache_type, &cache_size);
> @@ -143,99 +179,29 @@ int cat_perf_miss_val(int cpu_no, int n, char *cache_type)
>  			       count_of_bits - 1);
>  		return -1;
>  	}
> -
> -	/* Get core id from same socket for running another thread */
> -	sibling_cpu_no = get_core_sibling(cpu_no);

Do any users of get_core_sibling() remain after this?


Reinette
Ilpo Järvinen April 26, 2023, 1:58 p.m. UTC | #2
On Fri, 21 Apr 2023, Reinette Chatre wrote:

> Hi Ilpo,
> 
> On 4/18/2023 4:45 AM, Ilpo Järvinen wrote:
> > CAT test spawns two processes into two different control groups with
> > exclusive schemata. Both the processes alloc a buffer from memory
> > matching their allocated LLC block size and flush the entire buffer out
> > of caches. Since the processes are reading through the buffer only once
> > during the measurement and initially all the buffer was flushed, the
> > test isn't testing CAT.
> > 
> > Rewrite the CAT test to allocated a buffer sized to half of LLC. Then
> 
> "allocated a buffer" -> "allocate a buffer" ?
> 
> > perform a sequence of tests with different LLC alloc sizes starting
> > from half of the CBM bits down to 1-bit CBM. Flush the buffer before
> > each test and read the buffer twice. Observe the LLC misses on the
> > second read through the buffer. As the allocated LLC block gets smaller
> > and smaller, the LLC misses will become larger and larger giving a
> > strong signal on CAT working properly.
> 
> Since the changelog starts by describing the CAT test needing two
> processes I think it would help to highlight that this test uses a
> single process. I think it would also help to describing how the cache
> is used by the rest while this test is running.

Sure, good points, I'll add the info.

> > Suggested-by: Reinette Chatre <reinette.chatre@intel.com>
> > Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
> > ---
> >  tools/testing/selftests/resctrl/cache.c    |  20 +-
> >  tools/testing/selftests/resctrl/cat_test.c | 204 +++++++++------------
> >  2 files changed, 97 insertions(+), 127 deletions(-)
> > 
> > diff --git a/tools/testing/selftests/resctrl/cache.c b/tools/testing/selftests/resctrl/cache.c
> > index 7970239413da..64f08ba5edc2 100644
> > --- a/tools/testing/selftests/resctrl/cache.c
> > +++ b/tools/testing/selftests/resctrl/cache.c
> > @@ -224,10 +224,10 @@ int measure_llc_resctrl(struct resctrl_val_param *param, int bm_pid)
> >   */
> >  int cat_val(struct resctrl_val_param *param)
> >  {
> > -	int memflush = 1, operation = 0, ret = 0;
> >  	char *resctrl_val = param->resctrl_val;
> >  	unsigned long llc_perf_miss = 0;
> >  	pid_t bm_pid;
> > +	int ret;
> >  
> >  	if (strcmp(param->filename, "") == 0)
> >  		sprintf(param->filename, "stdio");
> > @@ -245,6 +245,10 @@ int cat_val(struct resctrl_val_param *param)
> >  	if (ret)
> >  		return ret;
> >  
> > +	ret = alloc_buffer(param->span, 1);
> > +	if (ret)
> > +		return ret;
> > +
> >  	initialize_llc_perf();
> >  
> >  	/* Test runs until the callback setup() tells the test to stop. */
> > @@ -256,17 +260,15 @@ int cat_val(struct resctrl_val_param *param)
> >  		}
> >  		if (ret < 0)
> >  			break;
> > +
> > +		flush_buffer(param->span);
> > +		use_buffer(param->span, 0, true);
> > +
> >  		ret = reset_enable_llc_perf(bm_pid, param->cpu_no);
> >  		if (ret)
> >  			break;
> >  
> > -		if (run_fill_buf(param->span, memflush, operation, true)) {
> > -			fprintf(stderr, "Error-running fill buffer\n");
> > -			ret = -1;
> > -			break;
> > -		}
> > -
> > -		sleep(1);
> > +		use_buffer(param->span, 0, true);
> >  
> >  		/* Measure cache miss from perf */
> >  		ret = get_llc_perf(&llc_perf_miss);
> > @@ -279,6 +281,8 @@ int cat_val(struct resctrl_val_param *param)
> >  			break;
> >  	}
> >  
> > +	free_buffer();
> > +
> >  	return ret;
> >  }
> >  
> > diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c
> > index 4b505fdb35d7..85053829b9c5 100644
> > --- a/tools/testing/selftests/resctrl/cat_test.c
> > +++ b/tools/testing/selftests/resctrl/cat_test.c
> > @@ -11,11 +11,12 @@
> >  #include "resctrl.h"
> >  #include <unistd.h>
> >  
> > -#define RESULT_FILE_NAME1	"result_cat1"
> > -#define RESULT_FILE_NAME2	"result_cat2"
> > -#define NUM_OF_RUNS		5
> > -#define MAX_DIFF_PERCENT	4
> > -#define MAX_DIFF		1000000
> > +#define RESULT_FILE_NAME		"result_cat"
> > +#define NUM_OF_RUNS			5
> > +#define MIN_DIFF_PERCENT_PER_BIT	2
> 
> Could you please start a new trend that adds documentation
> that explains what this constant means and how it was chosen?

I can try, although that particular 2 was a bit handwavy: it just seems to
work with the tests I performed.

> > +static unsigned long current_mask;
> > +static long prev_avg_llc_val;
> >  
> >  /*
> >   * Change schemata. Write schemata to specified
> > @@ -28,13 +29,24 @@ static int cat_setup(struct resctrl_val_param *p)
> >  	int ret = 0;
> >  
> >  	/* Run NUM_OF_RUNS times */
> > -	if (p->num_of_runs >= NUM_OF_RUNS)
> > -		return END_OF_TESTS;
> > +	if (p->num_of_runs >= NUM_OF_RUNS) {
> > +		/* Remove one bit from the consecutive block */
> > +		current_mask &= current_mask >> 1;
> > +		if (!current_mask)
> > +			return END_OF_TESTS;
> > +
> > +		p->num_of_runs = 0;
> 
> This seems like a workaround to get the schemata to be written. It is
> problematic since now p->num_of_runs no longer accurately reflects the
> number of test runs.

This is already the case. MBA test works around this very same problem by 
using a custom static variable (runs_per_allocation) which is reset to 0 
every NUM_OF_RUNS tests and not keeping ->num_of_runs at all. If MBA test 
would replace runs_per_allocation with use of ->num_of_runs, it would 
match what the new CAT test does.

Nothing currently relies on ->num_of_runs counting across the different 
"tests" that are run inside CAT and MBA tests. And I don't have anything 
immediately around the corner that would require ->num_of_runs to count 
total number of repetitions that were run.

I guess it would be possible to attempt to somehow consolidate the second
layer that the MBA and the rewritten CAT tests need into resctrl_val_param.
But IMHO that too is a low-prio refactor as nothing is broken as is.

> I was expecting this mask manipulation to be
> in cat_val() so that it is clear how test works instead of part
> of the logic handled here.

That seems to be moving in the opposite direction from how things are
currently handled. Doing it in cat_val() would be relying less on
->setup(). If that's the preferred direction, then the question becomes:
should the CAT test do anything in ->setup(), because the schemata
writing could also be done directly in cat_val()?

What I would prefer not to do is to have a rule which says: if there's a
test-specific function, don't use ->setup() but do any setup directly
in the test-specific function; otherwise, use ->setup(). Such an
inconsistency would make things hard to track.

> > +	}
> >  
> >  	if (p->num_of_runs == 0) {
> > -		sprintf(schemata, "%lx", p->mask);
> > -		ret = write_schemata(p->ctrlgrp, schemata, p->cpu_no,
> > -				     p->resctrl_val);
> > +		snprintf(schemata, sizeof(schemata), "%lx", p->mask & ~current_mask);
> > +		ret = write_schemata("", schemata, p->cpu_no, p->resctrl_val);
> > +		if (ret)
> > +			return ret;
> > +		snprintf(schemata, sizeof(schemata), "%lx", current_mask);
> > +		ret = write_schemata(p->ctrlgrp, schemata, p->cpu_no, p->resctrl_val);
> > +		if (ret)
> > +			return ret;
> >  	}
> >  	p->num_of_runs++;
> >  
> 
> ...
> 
> > @@ -126,7 +162,7 @@ int cat_perf_miss_val(int cpu_no, int n, char *cache_type)
> >  	ret = get_mask_no_shareable(cache_type, &long_mask);
> >  	if (ret)
> >  		return ret;
> > -	count_of_bits = count_consecutive_bits(long_mask, NULL);
> > +	count_of_bits = count_consecutive_bits(long_mask, &start);
> >  
> >  	/* Get L3/L2 cache size */
> >  	ret = get_cache_size(cpu_no, cache_type, &cache_size);
> > @@ -143,99 +179,29 @@ int cat_perf_miss_val(int cpu_no, int n, char *cache_type)
> >  			       count_of_bits - 1);
> >  		return -1;
> >  	}
> > -
> > -	/* Get core id from same socket for running another thread */
> > -	sibling_cpu_no = get_core_sibling(cpu_no);
> 
> Do any users of get_core_sibling() remain after this?

Correct observation, there seems to be no other users after this is 
removed.
Reinette Chatre April 26, 2023, 11:35 p.m. UTC | #3
Hi Ilpo,

On 4/26/2023 6:58 AM, Ilpo Järvinen wrote:
> On Fri, 21 Apr 2023, Reinette Chatre wrote:
>> On 4/18/2023 4:45 AM, Ilpo Järvinen wrote:

...

>>> diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c
>>> index 4b505fdb35d7..85053829b9c5 100644
>>> --- a/tools/testing/selftests/resctrl/cat_test.c
>>> +++ b/tools/testing/selftests/resctrl/cat_test.c
>>> @@ -11,11 +11,12 @@
>>>  #include "resctrl.h"
>>>  #include <unistd.h>
>>>  
>>> -#define RESULT_FILE_NAME1	"result_cat1"
>>> -#define RESULT_FILE_NAME2	"result_cat2"
>>> -#define NUM_OF_RUNS		5
>>> -#define MAX_DIFF_PERCENT	4
>>> -#define MAX_DIFF		1000000
>>> +#define RESULT_FILE_NAME		"result_cat"
>>> +#define NUM_OF_RUNS			5
>>> +#define MIN_DIFF_PERCENT_PER_BIT	2
>>
>> Could you please start a new trend that adds documentation
>> that explains what this constant means and how it was chosen?
> 
> I can try although that particular 2 was a bit handwavy that just seems to 
> work with the tests I performed.

The changelog claims that the existing CAT test does not work, with
this new test offered as a replacement. Considering that, I do think it
is important to have confidence that this test is able to test CAT.
The words "handwave" and "seems to work" are red flags to me.
When merged, these tests will be run on a variety of platforms with
various configurations. Using test criteria based on measurements
from one particular system may work but there needs to be confidence
that the criteria maps to all systems these tests will be run on.

> 
>>> +static unsigned long current_mask;
>>> +static long prev_avg_llc_val;
>>>  
>>>  /*
>>>   * Change schemata. Write schemata to specified
>>> @@ -28,13 +29,24 @@ static int cat_setup(struct resctrl_val_param *p)
>>>  	int ret = 0;
>>>  
>>>  	/* Run NUM_OF_RUNS times */
>>> -	if (p->num_of_runs >= NUM_OF_RUNS)
>>> -		return END_OF_TESTS;
>>> +	if (p->num_of_runs >= NUM_OF_RUNS) {
>>> +		/* Remove one bit from the consecutive block */
>>> +		current_mask &= current_mask >> 1;
>>> +		if (!current_mask)
>>> +			return END_OF_TESTS;
>>> +
>>> +		p->num_of_runs = 0;
>>
>> This seems like a workaround to get the schemata to be written. It is
>> problematic since now p->num_of_runs no longer accurately reflects the
>> number of test runs.
> 
> This is already the case. MBA test works around this very same problem by 
> using a custom static variable (runs_per_allocation) which is reset to 0 
> every NUM_OF_RUNS tests and not keeping ->num_of_runs at all. If MBA test 
> would replace runs_per_allocation with use of ->num_of_runs, it would 
> match what the new CAT test does.
> 
> Nothing currently relies on ->num_of_runs counting across the different 
> "tests" that are run inside CAT and MBA tests. And I don't have anything 
> immediately around the corner that would require ->num_of_runs to count 
> total number of repetitions that were ran.
> 
> I guess it would be possible to attempt to consolidate that second layer
> MBA and the rewritten CAT tests need somehow into resctrl_val_param. But 
> IMHO that too is low-prio refactor as nothing is broken as is.

I do not think that I would use any of the other tests as reference
since all the other tests rely on the same wrapper (resctrl_val())
by providing it their own customization (via aptly named ... struct
resctrl_val_param). 
The CAT test is already unique by _not_ using resctrl_val() but its
own test. I do not see why those resctrl_val() customizations need to
propagate to the CAT test if it is not using the wrapper to begin with.

> 
>> I was expecting this mask manipulation to be
>> in cat_val() so that it is clear how test works instead of part
>> of the logic handled here.
> 
> That seems to be moving into opposite direction from how things are 
> currently handled. Doing it in cat_val() would be relying less on 
> ->setup(). If that's the preferred direction, then the question becomes, 
> should CAT test do anything in ->setup() because also the schemata 
> writing could be done in directly cat_val().
> 
> What I would prefer not to do is to have a rule which says: if there's a 
> test-specific function, don't use ->setup() but do any setup directly 
> in the test-specific function but, otherwise use ->setup(). Such an
> inconsistency would make things hard to track.

The test specific function can still call a setup function but it
can be done directly instead of via "struct resctrl_val_param". The
test specific function already transitioned away from using resctrl_val(),
it is not clear to me why there should be rules about how
function pointers within "struct resctrl_val_param" should be used or
indeed why "struct resctrl_val_param" should be used at all.

Reinette
Ilpo Järvinen April 27, 2023, 8:04 a.m. UTC | #4
On Wed, 26 Apr 2023, Reinette Chatre wrote:
> On 4/26/2023 6:58 AM, Ilpo Järvinen wrote:
> > On Fri, 21 Apr 2023, Reinette Chatre wrote:
> >> On 4/18/2023 4:45 AM, Ilpo Järvinen wrote:
> 
> ...
> 
> >>> diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c
> >>> index 4b505fdb35d7..85053829b9c5 100644
> >>> --- a/tools/testing/selftests/resctrl/cat_test.c
> >>> +++ b/tools/testing/selftests/resctrl/cat_test.c
> >>> @@ -11,11 +11,12 @@
> >>>  #include "resctrl.h"
> >>>  #include <unistd.h>
> >>>  
> >>> -#define RESULT_FILE_NAME1	"result_cat1"
> >>> -#define RESULT_FILE_NAME2	"result_cat2"
> >>> -#define NUM_OF_RUNS		5
> >>> -#define MAX_DIFF_PERCENT	4
> >>> -#define MAX_DIFF		1000000
> >>> +#define RESULT_FILE_NAME		"result_cat"
> >>> +#define NUM_OF_RUNS			5
> >>> +#define MIN_DIFF_PERCENT_PER_BIT	2
> >>
> >> Could you please start a new trend that adds documentation
> >> that explains what this constant means and how it was chosen?
> > 
> > I can try although that particular 2 was a bit handwavy that just seems to 
> > work with the tests I performed.
> 
> The changelog claims that the existing CAT test does not work with
> this new test offered as replacement. Considering that I do think it
> is important to have confidence that this test is able to test CAT.
> The words "handwave" and "seems to work" are red flags to me.
> When merged, these tests will be run on a variety of platforms with
> various configurations. Using test criteria based on measurements
> from one particular system may work but there needs to be confidence
> that the criteria maps to all systems these tests will be run on.

My "tests" (in plural) were not limited to one particular system but 
included systems from different generations.

> >>> +static unsigned long current_mask;
> >>> +static long prev_avg_llc_val;
> >>>  
> >>>  /*
> >>>   * Change schemata. Write schemata to specified
> >>> @@ -28,13 +29,24 @@ static int cat_setup(struct resctrl_val_param *p)
> >>>  	int ret = 0;
> >>>  
> >>>  	/* Run NUM_OF_RUNS times */
> >>> -	if (p->num_of_runs >= NUM_OF_RUNS)
> >>> -		return END_OF_TESTS;
> >>> +	if (p->num_of_runs >= NUM_OF_RUNS) {
> >>> +		/* Remove one bit from the consecutive block */
> >>> +		current_mask &= current_mask >> 1;
> >>> +		if (!current_mask)
> >>> +			return END_OF_TESTS;
> >>> +
> >>> +		p->num_of_runs = 0;
> >>
> >> This seems like a workaround to get the schemata to be written. It is
> >> problematic since now p->num_of_runs no longer accurately reflects the
> >> number of test runs.
> > 
> > This is already the case. MBA test works around this very same problem by 
> > using a custom static variable (runs_per_allocation) which is reset to 0 
> > every NUM_OF_RUNS tests and not keeping ->num_of_runs at all. If MBA test 
> > would replace runs_per_allocation with use of ->num_of_runs, it would 
> > match what the new CAT test does.
> > 
> > Nothing currently relies on ->num_of_runs counting across the different 
> > "tests" that are run inside CAT and MBA tests. And I don't have anything 
> > immediately around the corner that would require ->num_of_runs to count 
> > total number of repetitions that were ran.
> > 
> > I guess it would be possible to attempt to consolidate that second layer
> > MBA and the rewritten CAT tests need somehow into resctrl_val_param. But 
> > IMHO that too is low-prio refactor as nothing is broken as is.
> 
> I do not think that I would use any of the other tests as reference
> since all the other tests rely on the same wrapper (resctrl_val())
> by providing it their own customization (via aptly named ... struct
> resctrl_val_param). 

Oh, I see. I never made the connection to the function name before this.
(To be honest, it's a pretty stupid name for that particular function,
given what the function does, but that's an entirely separate issue.)
Reinette Chatre April 27, 2023, 3:15 p.m. UTC | #5
Hi Ilpo,

On 4/27/2023 1:04 AM, Ilpo Järvinen wrote:
> On Wed, 26 Apr 2023, Reinette Chatre wrote:
>> On 4/26/2023 6:58 AM, Ilpo Järvinen wrote:
>>> On Fri, 21 Apr 2023, Reinette Chatre wrote:
>>>> On 4/18/2023 4:45 AM, Ilpo Järvinen wrote:
>>
>> ...
>>
>>>>> diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c
>>>>> index 4b505fdb35d7..85053829b9c5 100644
>>>>> --- a/tools/testing/selftests/resctrl/cat_test.c
>>>>> +++ b/tools/testing/selftests/resctrl/cat_test.c
>>>>> @@ -11,11 +11,12 @@
>>>>>  #include "resctrl.h"
>>>>>  #include <unistd.h>
>>>>>  
>>>>> -#define RESULT_FILE_NAME1	"result_cat1"
>>>>> -#define RESULT_FILE_NAME2	"result_cat2"
>>>>> -#define NUM_OF_RUNS		5
>>>>> -#define MAX_DIFF_PERCENT	4
>>>>> -#define MAX_DIFF		1000000
>>>>> +#define RESULT_FILE_NAME		"result_cat"
>>>>> +#define NUM_OF_RUNS			5
>>>>> +#define MIN_DIFF_PERCENT_PER_BIT	2
>>>>
>>>> Could you please start a new trend that adds documentation
>>>> that explains what this constant means and how it was chosen?
>>>
>>> I can try although that particular 2 was a bit handwavy that just seems to 
>>> work with the tests I performed.
>>
>> The changelog claims that the existing CAT test does not work with
>> this new test offered as replacement. Considering that I do think it
>> is important to have confidence that this test is able to test CAT.
>> The words "handwave" and "seems to work" are red flags to me.
>> When merged, these tests will be run on a variety of platforms with
>> various configurations. Using test criteria based on measurements
>> from one particular system may work but there needs to be confidence
>> that the criteria maps to all systems these tests will be run on.
> 
> My "tests" (in plural) were not limited to one particular system but 
> included systems from different generations.
> 

Thank you very much for your thorough testing. Having this information
accompany this change will surely help to increase confidence in the
value chosen. 

Thank you very much

Reinette
diff mbox series

Patch

diff --git a/tools/testing/selftests/resctrl/cache.c b/tools/testing/selftests/resctrl/cache.c
index 7970239413da..64f08ba5edc2 100644
--- a/tools/testing/selftests/resctrl/cache.c
+++ b/tools/testing/selftests/resctrl/cache.c
@@ -224,10 +224,10 @@  int measure_llc_resctrl(struct resctrl_val_param *param, int bm_pid)
  */
 int cat_val(struct resctrl_val_param *param)
 {
-	int memflush = 1, operation = 0, ret = 0;
 	char *resctrl_val = param->resctrl_val;
 	unsigned long llc_perf_miss = 0;
 	pid_t bm_pid;
+	int ret;
 
 	if (strcmp(param->filename, "") == 0)
 		sprintf(param->filename, "stdio");
@@ -245,6 +245,10 @@  int cat_val(struct resctrl_val_param *param)
 	if (ret)
 		return ret;
 
+	ret = alloc_buffer(param->span, 1);
+	if (ret)
+		return ret;
+
 	initialize_llc_perf();
 
 	/* Test runs until the callback setup() tells the test to stop. */
@@ -256,17 +260,15 @@  int cat_val(struct resctrl_val_param *param)
 		}
 		if (ret < 0)
 			break;
+
+		flush_buffer(param->span);
+		use_buffer(param->span, 0, true);
+
 		ret = reset_enable_llc_perf(bm_pid, param->cpu_no);
 		if (ret)
 			break;
 
-		if (run_fill_buf(param->span, memflush, operation, true)) {
-			fprintf(stderr, "Error-running fill buffer\n");
-			ret = -1;
-			break;
-		}
-
-		sleep(1);
+		use_buffer(param->span, 0, true);
 
 		/* Measure cache miss from perf */
 		ret = get_llc_perf(&llc_perf_miss);
@@ -279,6 +281,8 @@  int cat_val(struct resctrl_val_param *param)
 			break;
 	}
 
+	free_buffer();
+
 	return ret;
 }
 
diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c
index 4b505fdb35d7..85053829b9c5 100644
--- a/tools/testing/selftests/resctrl/cat_test.c
+++ b/tools/testing/selftests/resctrl/cat_test.c
@@ -11,11 +11,12 @@ 
 #include "resctrl.h"
 #include <unistd.h>
 
-#define RESULT_FILE_NAME1	"result_cat1"
-#define RESULT_FILE_NAME2	"result_cat2"
-#define NUM_OF_RUNS		5
-#define MAX_DIFF_PERCENT	4
-#define MAX_DIFF		1000000
+#define RESULT_FILE_NAME		"result_cat"
+#define NUM_OF_RUNS			5
+#define MIN_DIFF_PERCENT_PER_BIT	2
+
+static unsigned long current_mask;
+static long prev_avg_llc_val;
 
 /*
  * Change schemata. Write schemata to specified
@@ -28,13 +29,24 @@  static int cat_setup(struct resctrl_val_param *p)
 	int ret = 0;
 
 	/* Run NUM_OF_RUNS times */
-	if (p->num_of_runs >= NUM_OF_RUNS)
-		return END_OF_TESTS;
+	if (p->num_of_runs >= NUM_OF_RUNS) {
+		/* Remove one bit from the consecutive block */
+		current_mask &= current_mask >> 1;
+		if (!current_mask)
+			return END_OF_TESTS;
+
+		p->num_of_runs = 0;
+	}
 
 	if (p->num_of_runs == 0) {
-		sprintf(schemata, "%lx", p->mask);
-		ret = write_schemata(p->ctrlgrp, schemata, p->cpu_no,
-				     p->resctrl_val);
+		snprintf(schemata, sizeof(schemata), "%lx", p->mask & ~current_mask);
+		ret = write_schemata("", schemata, p->cpu_no, p->resctrl_val);
+		if (ret)
+			return ret;
+		snprintf(schemata, sizeof(schemata), "%lx", current_mask);
+		ret = write_schemata(p->ctrlgrp, schemata, p->cpu_no, p->resctrl_val);
+		if (ret)
+			return ret;
 	}
 	p->num_of_runs++;
 
@@ -42,34 +54,41 @@  static int cat_setup(struct resctrl_val_param *p)
 }
 
 static int show_results_info(unsigned long sum_llc_val, int no_of_bits,
-			     unsigned long cache_span, unsigned long max_diff,
-			     unsigned long max_diff_percent, unsigned long num_of_runs,
-			     bool platform)
+			     unsigned long cache_span, long min_diff_percent,
+			     unsigned long num_of_runs, bool platform)
 {
-	unsigned long avg_llc_val = 0;
-	float diff_percent;
-	int ret;
+	long avg_llc_val = 0;
+	int avg_diff_per;
+	float avg_diff;
+	int ret = 0;
 
 	avg_llc_val = sum_llc_val / num_of_runs;
-	diff_percent = ((float)cache_span - avg_llc_val) / cache_span * 100;
+	avg_diff = (float)(avg_llc_val - prev_avg_llc_val) / prev_avg_llc_val;
+	avg_diff_per = (int)(avg_diff * 100);
 
-	ret = platform && abs((int)diff_percent) > max_diff_percent;
+	if (prev_avg_llc_val) {
+		ret = platform && avg_diff_per < min_diff_percent;
 
-	ksft_print_msg("%s Check cache miss rate within %d%%\n",
-		       ret ? "Fail:" : "Pass:", max_diff_percent);
+		ksft_print_msg("%s Check cache miss rate changed more than %d%%\n",
+			       ret ? "Fail:" : "Pass:", min_diff_percent);
 
-	ksft_print_msg("Percent diff=%d\n", abs((int)diff_percent));
+		ksft_print_msg("Percent diff=%d\n", avg_diff_per);
+	}
+	prev_avg_llc_val = avg_llc_val;
 
 	show_cache_info(no_of_bits, avg_llc_val, cache_span, true);
 
 	return ret;
 }
 
-static int check_results(struct resctrl_val_param *param)
+static int check_results(struct resctrl_val_param *param, char *cache_type)
 {
 	char *token_array[8], temp[512];
 	unsigned long sum_llc_perf_miss = 0;
-	int runs = 0, no_of_bits = 0;
+	unsigned long alloc_size;
+	int runs = 0;
+	int fail = 0;
+	int ret;
 	FILE *fp;
 
 	ksft_print_msg("Checking for pass/fail\n");
@@ -83,42 +102,59 @@  static int check_results(struct resctrl_val_param *param)
 	while (fgets(temp, sizeof(temp), fp)) {
 		char *token = strtok(temp, ":\t");
 		int fields = 0;
+		int bits;
 
 		while (token) {
 			token_array[fields++] = token;
 			token = strtok(NULL, ":\t");
 		}
-		/*
-		 * Discard the first value which is inaccurate due to monitoring
-		 * setup transition phase.
-		 */
-		if (runs > 0)
-			sum_llc_perf_miss += strtoul(token_array[3], NULL, 0);
+
+		sum_llc_perf_miss += strtoul(token_array[3], NULL, 0);
 		runs++;
+
+		if (runs < NUM_OF_RUNS)
+			continue;
+
+		if (!current_mask) {
+			ksft_print_msg("Unexpected empty cache mask\n");
+			break;
+		}
+
+		ret = cache_alloc_size(param->cpu_no, cache_type, current_mask, &alloc_size);
+		if (ret)
+			return ret;
+
+		bits = count_bits(current_mask);
+
+		ret = show_results_info(sum_llc_perf_miss, bits,
+					alloc_size / 64,
+					MIN_DIFF_PERCENT_PER_BIT * bits, runs,
+					get_vendor() == ARCH_INTEL);
+		if (ret)
+			fail = 1;
+
+		runs = 0;
+		sum_llc_perf_miss = 0;
+		current_mask &= current_mask >> 1;
 	}
 
 	fclose(fp);
-	no_of_bits = count_consecutive_bits(param->mask, NULL);
 
-	return show_results_info(sum_llc_perf_miss, no_of_bits, param->span / 64,
-				 MAX_DIFF, MAX_DIFF_PERCENT, runs - 1,
-				 get_vendor() == ARCH_INTEL);
+	return fail;
 }
 
 void cat_test_cleanup(void)
 {
-	remove(RESULT_FILE_NAME1);
-	remove(RESULT_FILE_NAME2);
+	remove(RESULT_FILE_NAME);
 }
 
 int cat_perf_miss_val(int cpu_no, int n, char *cache_type)
 {
-	unsigned long l_mask, l_mask_1;
-	int ret, pipefd[2], sibling_cpu_no;
 	unsigned long cache_size;
 	unsigned long long_mask;
+	unsigned int start;
 	int count_of_bits;
-	char pipe_message;
+	int ret;
 
 	cache_size = 0;
 
@@ -126,7 +162,7 @@  int cat_perf_miss_val(int cpu_no, int n, char *cache_type)
 	ret = get_mask_no_shareable(cache_type, &long_mask);
 	if (ret)
 		return ret;
-	count_of_bits = count_consecutive_bits(long_mask, NULL);
+	count_of_bits = count_consecutive_bits(long_mask, &start);
 
 	/* Get L3/L2 cache size */
 	ret = get_cache_size(cpu_no, cache_type, &cache_size);
@@ -143,99 +179,29 @@  int cat_perf_miss_val(int cpu_no, int n, char *cache_type)
 			       count_of_bits - 1);
 		return -1;
 	}
-
-	/* Get core id from same socket for running another thread */
-	sibling_cpu_no = get_core_sibling(cpu_no);
-	if (sibling_cpu_no < 0)
-		return -1;
+	current_mask = create_bit_mask(start, n);
 
 	struct resctrl_val_param param = {
 		.resctrl_val	= CAT_STR,
 		.cpu_no		= cpu_no,
+		.ctrlgrp	= "c1",
 		.setup		= cat_setup,
+		.filename	= RESULT_FILE_NAME,
+		.num_of_runs	= 0,
 	};
-
-	l_mask = long_mask >> n;
-	l_mask_1 = ~l_mask & long_mask;
-
-	/* Set param values for parent thread which will be allocated bitmask
-	 * with (max_bits - n) bits
-	 */
-	ret = cache_alloc_size(cpu_no, cache_type, l_mask, &param.span);
+	param.mask = long_mask;
+	ret = cache_alloc_size(cpu_no, cache_type, current_mask, &param.span);
 	if (ret)
 		return ret;
-	strcpy(param.ctrlgrp, "c2");
-	strcpy(param.mongrp, "m2");
-	strcpy(param.filename, RESULT_FILE_NAME2);
-	param.mask = l_mask;
-	param.num_of_runs = 0;
-
-	if (pipe(pipefd)) {
-		perror("# Unable to create pipe");
-		return errno;
-	}
-
-	fflush(stdout);
-	bm_pid = fork();
-
-	/* Set param values for child thread which will be allocated bitmask
-	 * with n bits
-	 */
-	if (bm_pid == 0) {
-		param.mask = l_mask_1;
-		strcpy(param.ctrlgrp, "c1");
-		strcpy(param.mongrp, "m1");
-		ret = cache_alloc_size(cpu_no, cache_type, l_mask_1, &param.span);
-		if (ret)
-			exit(-1);
-		strcpy(param.filename, RESULT_FILE_NAME1);
-		param.num_of_runs = 0;
-		param.cpu_no = sibling_cpu_no;
-	} else {
-		ret = signal_handler_register();
-		if (ret) {
-			kill(bm_pid, SIGKILL);
-			goto out;
-		}
-	}
 
 	remove(param.filename);
 
 	ret = cat_val(&param);
-	if (ret == 0)
-		ret = check_results(&param);
-
-	if (bm_pid == 0) {
-		/* Tell parent that child is ready */
-		close(pipefd[0]);
-		pipe_message = 1;
-		if (write(pipefd[1], &pipe_message, sizeof(pipe_message)) <
-		    sizeof(pipe_message))
-			/*
-			 * Just print the error message.
-			 * Let while(1) run and wait for itself to be killed.
-			 */
-			perror("# failed signaling parent process");
-
-		close(pipefd[1]);
-		while (1)
-			;
-	} else {
-		/* Parent waits for child to be ready. */
-		close(pipefd[1]);
-		pipe_message = 0;
-		while (pipe_message != 1) {
-			if (read(pipefd[0], &pipe_message,
-				 sizeof(pipe_message)) < sizeof(pipe_message)) {
-				perror("# failed reading from child process");
-				break;
-			}
-		}
-		close(pipefd[0]);
-		kill(bm_pid, SIGKILL);
-		signal_handler_unregister();
-	}
+	if (ret)
+		goto out;
 
+	current_mask = create_bit_mask(start, n);
+	ret = check_results(&param, cache_type);
 out:
 	cat_test_cleanup();