diff mbox

[i-g-t,17/18] stats: Add support for quartiles (and thus median)

Message ID 1435417696-28115-18-git-send-email-damien.lespiau@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Lespiau, Damien June 27, 2015, 3:08 p.m. UTC
More stuff, quite useful characteristics of a dataset.

Signed-off-by: Damien Lespiau <damien.lespiau@intel.com>
---
 lib/igt_stats.c       | 131 ++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/igt_stats.h       |   5 ++
 lib/tests/igt_stats.c |  41 ++++++++++++++++
 3 files changed, 177 insertions(+)

Comments

Chris Wilson June 27, 2015, 3:15 p.m. UTC | #1
On Sat, Jun 27, 2015 at 04:08:15PM +0100, Damien Lespiau wrote:
> More stuff, quite useful characteristics of a dataset.
> 
> Signed-off-by: Damien Lespiau <damien.lespiau@intel.com>
> ---
>  lib/igt_stats.c       | 131 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  lib/igt_stats.h       |   5 ++
>  lib/tests/igt_stats.c |  41 ++++++++++++++++
>  3 files changed, 177 insertions(+)
> 
> diff --git a/lib/igt_stats.c b/lib/igt_stats.c
> index 5c1f23f..66fd563 100644
> --- a/lib/igt_stats.c
> +++ b/lib/igt_stats.c
> @@ -23,6 +23,7 @@
>   */
>  
>  #include <math.h>
> +#include <stdlib.h>
>  #include <string.h>
>  
>  #include "igt_core.h"
> @@ -94,6 +95,7 @@ void igt_stats_init(igt_stats_t *stats, unsigned int capacity)
>  void igt_stats_fini(igt_stats_t *stats)
>  {
>  	free(stats->values);
> +	free(stats->sorted);
>  }
>  
>  
> @@ -158,6 +160,7 @@ void igt_stats_push(igt_stats_t *stats, uint64_t value)
>  	stats->values[stats->n_values++] = value;
>  
>  	stats->mean_variance_valid = false;
> +	stats->sorted_array_valid = false;
>  
>  	if (value < stats->min)
>  		stats->min = value;
> @@ -220,6 +223,134 @@ uint64_t igt_stats_get_range(igt_stats_t *stats)
>  	return igt_stats_get_max(stats) - igt_stats_get_min(stats);
>  }
>  
> +static int cmp_u64(const void *pa, const void *pb)
> +{
> +	const uint64_t *a = pa, *b = pb;
> +
> +	if (*a < *b)
> +		return -1;
> +	if (*a > *b)
> +		return 1;
> +	return 0;
> +}
> +
> +static void igt_stats_ensure_sorted_values(igt_stats_t *stats)
> +{
> +	if (stats->sorted_array_valid)
> +		return;
> +
> +	if (!stats->sorted) {
> +		stats->sorted = calloc(stats->capacity, sizeof(*stats->values));
> +		igt_assert(stats->sorted);
> +	}

Since sorted_array_valid = false on igt_stats_push, but we only allocate
for the first call to sort, there's no safeguard against

  igt_stats_push();
  igt_stats_get_median();
  igt_stats_push();
  igt_stats_get_median();

exploding.
-Chris
> +
> +	memcpy(stats->sorted, stats->values,
> +	       sizeof(*stats->values) * stats->n_values);
> +
> +	qsort(stats->sorted, stats->n_values, sizeof(*stats->values), cmp_u64);
> +
> +	stats->sorted_array_valid = true;
> +}
> +
> +/*
> + * We use Tukey's hinge for our quartiles determination.
> + * ends (end, lower_end) are exclusive.
> + */
> +static double
> +igt_stats_get_median_internal(igt_stats_t *stats,
> +			      unsigned int start, unsigned int end,
> +			      unsigned int *lower_end /* out */,
> +			      unsigned int *upper_start /* out */)
> +{
> +	unsigned int mid, n_values = end - start;
> +	double median;
> +
> +	igt_stats_ensure_sorted_values(stats);
> +
> +	/* odd number of data points */
> +	if (n_values % 2 == 1) {
> +		/* median is the value in the middle (actual datum) */
> +		mid = start + n_values / 2;
> +		median = stats->sorted[mid];
> +
> +		/* the two halves contain the median value */
> +		if (lower_end)
> +			*lower_end = mid + 1;
> +		if (upper_start)
> +			*upper_start = mid;
> +
> +	/* even number of data points */
> +	} else {
> +		/*
> +		 * The middle is in between two indexes, 'mid' points at the
> +		 * lower one. The median is then the average between those two
> +		 * values.
> +		 */
> +		mid = start + n_values / 2 - 1;
> +		median = (stats->sorted[mid] + stats->sorted[mid + 1]) / 2.;
> +
> +		if (lower_end)
> +			*lower_end = mid + 1;
> +		if (upper_start)
> +			*upper_start = mid + 1;
> +	}
> +
> +	return median;
> +}
> +
> +/**
> + * igt_stats_get_quartiles:
> + * @stats: An #igt_stats_t instance
> + * @q1: (out): lower or 25th quartile
> + * @q2: (out): median or 50th quartile
> + * @q3: (out): upper or 75th quartile
> + *
> + * Retrieves the [quartiles](https://en.wikipedia.org/wiki/Quartile) of the
> + * @stats dataset.
> + */
> +void igt_stats_get_quartiles(igt_stats_t *stats,
> +			     double *q1, double *q2, double *q3)
> +{
> +	unsigned int lower_end, upper_start;
> +	double ret;
> +
> +	if (stats->n_values < 3) {
> +		if (q1)
> +			*q1 = 0.;
> +		if (q2)
> +			*q2 = 0.;
> +		if (q3)
> +			*q3 = 0.;
> +		return;
> +	}
> +
> +	ret = igt_stats_get_median_internal(stats, 0, stats->n_values,
> +					    &lower_end, &upper_start);
> +	if (q2)
> +		*q2 = ret;
> +
> +	ret = igt_stats_get_median_internal(stats, 0, lower_end, NULL, NULL);
> +	if (q1)
> +		*q1 = ret;
> +
> +	ret = igt_stats_get_median_internal(stats, upper_start, stats->n_values,
> +					    NULL, NULL);
> +	if (q3)
> +		*q3 = ret;

That's neat, I like that determination of the quartiles a lot.
-Chris
Lespiau, Damien June 27, 2015, 3:52 p.m. UTC | #2
On Sat, Jun 27, 2015 at 04:15:41PM +0100, Chris Wilson wrote:
> > +static void igt_stats_ensure_sorted_values(igt_stats_t *stats)
> > +{
> > +	if (stats->sorted_array_valid)
> > +		return;
> > +
> > +	if (!stats->sorted) {
> > +		stats->sorted = calloc(stats->capacity, sizeof(*stats->values));
> > +		igt_assert(stats->sorted);
> > +	}
> 
> Since sorted_array_valid = false on igt_stats_push, but we only allocate
> for the first call to sort, there's no safeguard against
> 
>   igt_stats_push();
>   igt_stats_get_median();
>   igt_stats_push();
>   igt_stats_get_median();
> 
> exploding.

I believe it's fine (and I just sent a test telling the same story, but
well, overflowing an small heap allocated array can often be "fine" with
an overallocation from malloc() rounding up the size to git in a bin),
as the size of the dataset is an invariant (I didn't implement your
suggestion to grow the array at push() time). We allocate the full
capacity here, not just the current n_values.

I did have a think about a growing ->values array, but I think we know
the number of data points we want upfront most of the time (if not
always) when going to do some measurements?
Chris Wilson June 27, 2015, 3:56 p.m. UTC | #3
On Sat, Jun 27, 2015 at 04:52:56PM +0100, Damien Lespiau wrote:
> On Sat, Jun 27, 2015 at 04:15:41PM +0100, Chris Wilson wrote:
> > > +static void igt_stats_ensure_sorted_values(igt_stats_t *stats)
> > > +{
> > > +	if (stats->sorted_array_valid)
> > > +		return;
> > > +
> > > +	if (!stats->sorted) {
> > > +		stats->sorted = calloc(stats->capacity, sizeof(*stats->values));
> > > +		igt_assert(stats->sorted);
> > > +	}
> > 
> > Since sorted_array_valid = false on igt_stats_push, but we only allocate
> > for the first call to sort, there's no safeguard against
> > 
> >   igt_stats_push();
> >   igt_stats_get_median();
> >   igt_stats_push();
> >   igt_stats_get_median();
> > 
> > exploding.
> 
> I believe it's fine (and I just sent a test telling the same story, but
> well, overflowing an small heap allocated array can often be "fine" with
> an overallocation from malloc() rounding up the size to git in a bin),
> as the size of the dataset is an invariant (I didn't implement your
> suggestion to grow the array at push() time). We allocate the full
> capacity here, not just the current n_values.

Ah, didn't realise that the capacity was fixed.
 
> I did have a think about a growing ->values array, but I think we know
> the number of data points we want upfront most of the time (if not
> always) when going to do some measurements?

No, often times I want to run a benchmark for x seconds and then look at
the spread of samples.
-Chris
Lespiau, Damien June 27, 2015, 4:04 p.m. UTC | #4
On Sat, Jun 27, 2015 at 04:56:54PM +0100, Chris Wilson wrote:
> > I did have a think about a growing ->values array, but I think we know
> > the number of data points we want upfront most of the time (if not
> > always) when going to do some measurements?
> 
> No, often times I want to run a benchmark for x seconds and then look at
> the spread of samples.

Ah, fair enough, will make push() grow the array and corresponding
adjustments then.
diff mbox

Patch

diff --git a/lib/igt_stats.c b/lib/igt_stats.c
index 5c1f23f..66fd563 100644
--- a/lib/igt_stats.c
+++ b/lib/igt_stats.c
@@ -23,6 +23,7 @@ 
  */
 
 #include <math.h>
+#include <stdlib.h>
 #include <string.h>
 
 #include "igt_core.h"
@@ -94,6 +95,7 @@  void igt_stats_init(igt_stats_t *stats, unsigned int capacity)
 void igt_stats_fini(igt_stats_t *stats)
 {
 	free(stats->values);
+	free(stats->sorted);
 }
 
 
@@ -158,6 +160,7 @@  void igt_stats_push(igt_stats_t *stats, uint64_t value)
 	stats->values[stats->n_values++] = value;
 
 	stats->mean_variance_valid = false;
+	stats->sorted_array_valid = false;
 
 	if (value < stats->min)
 		stats->min = value;
@@ -220,6 +223,134 @@  uint64_t igt_stats_get_range(igt_stats_t *stats)
 	return igt_stats_get_max(stats) - igt_stats_get_min(stats);
 }
 
+static int cmp_u64(const void *pa, const void *pb)
+{
+	const uint64_t *a = pa, *b = pb;
+
+	if (*a < *b)
+		return -1;
+	if (*a > *b)
+		return 1;
+	return 0;
+}
+
+static void igt_stats_ensure_sorted_values(igt_stats_t *stats)
+{
+	if (stats->sorted_array_valid)
+		return;
+
+	if (!stats->sorted) {
+		stats->sorted = calloc(stats->capacity, sizeof(*stats->values));
+		igt_assert(stats->sorted);
+	}
+
+	memcpy(stats->sorted, stats->values,
+	       sizeof(*stats->values) * stats->n_values);
+
+	qsort(stats->sorted, stats->n_values, sizeof(*stats->values), cmp_u64);
+
+	stats->sorted_array_valid = true;
+}
+
+/*
+ * We use Tukey's hinge for our quartiles determination.
+ * ends (end, lower_end) are exclusive.
+ */
+static double
+igt_stats_get_median_internal(igt_stats_t *stats,
+			      unsigned int start, unsigned int end,
+			      unsigned int *lower_end /* out */,
+			      unsigned int *upper_start /* out */)
+{
+	unsigned int mid, n_values = end - start;
+	double median;
+
+	igt_stats_ensure_sorted_values(stats);
+
+	/* odd number of data points */
+	if (n_values % 2 == 1) {
+		/* median is the value in the middle (actual datum) */
+		mid = start + n_values / 2;
+		median = stats->sorted[mid];
+
+		/* the two halves contain the median value */
+		if (lower_end)
+			*lower_end = mid + 1;
+		if (upper_start)
+			*upper_start = mid;
+
+	/* even number of data points */
+	} else {
+		/*
+		 * The middle is in between two indexes, 'mid' points at the
+		 * lower one. The median is then the average between those two
+		 * values.
+		 */
+		mid = start + n_values / 2 - 1;
+		median = (stats->sorted[mid] + stats->sorted[mid + 1]) / 2.;
+
+		if (lower_end)
+			*lower_end = mid + 1;
+		if (upper_start)
+			*upper_start = mid + 1;
+	}
+
+	return median;
+}
+
+/**
+ * igt_stats_get_quartiles:
+ * @stats: An #igt_stats_t instance
+ * @q1: (out): lower or 25th quartile
+ * @q2: (out): median or 50th quartile
+ * @q3: (out): upper or 75th quartile
+ *
+ * Retrieves the [quartiles](https://en.wikipedia.org/wiki/Quartile) of the
+ * @stats dataset.
+ */
+void igt_stats_get_quartiles(igt_stats_t *stats,
+			     double *q1, double *q2, double *q3)
+{
+	unsigned int lower_end, upper_start;
+	double ret;
+
+	if (stats->n_values < 3) {
+		if (q1)
+			*q1 = 0.;
+		if (q2)
+			*q2 = 0.;
+		if (q3)
+			*q3 = 0.;
+		return;
+	}
+
+	ret = igt_stats_get_median_internal(stats, 0, stats->n_values,
+					    &lower_end, &upper_start);
+	if (q2)
+		*q2 = ret;
+
+	ret = igt_stats_get_median_internal(stats, 0, lower_end, NULL, NULL);
+	if (q1)
+		*q1 = ret;
+
+	ret = igt_stats_get_median_internal(stats, upper_start, stats->n_values,
+					    NULL, NULL);
+	if (q3)
+		*q3 = ret;
+}
+
+/**
+ * igt_stats_get_median:
+ * @stats: An #igt_stats_t instance
+ *
+ * Retrieves the median of the @stats dataset.
+ */
+double igt_stats_get_median(igt_stats_t *stats)
+{
+	return igt_stats_get_median_internal(stats, 0, stats->n_values,
+					     NULL, NULL);
+}
+
 /*
  * Algorithm popularised by Knuth in:
  *
diff --git a/lib/igt_stats.h b/lib/igt_stats.h
index 46437c4..887fa79 100644
--- a/lib/igt_stats.h
+++ b/lib/igt_stats.h
@@ -41,8 +41,10 @@  typedef struct {
 	unsigned int capacity;
 	unsigned int is_population  : 1;
 	unsigned int mean_variance_valid : 1;
+	unsigned int sorted_array_valid : 1;
 	uint64_t min, max;
 	double mean, variance;
+	uint64_t *sorted;
 } igt_stats_t;
 
 void igt_stats_init(igt_stats_t *stats, unsigned int capacity);
@@ -55,7 +57,10 @@  void igt_stats_push_array(igt_stats_t *stats,
 uint64_t igt_stats_get_min(igt_stats_t *stats);
 uint64_t igt_stats_get_max(igt_stats_t *stats);
 uint64_t igt_stats_get_range(igt_stats_t *stats);
+void igt_stats_get_quartiles(igt_stats_t *stats,
+			     double *q1, double *q2, double *q3);
 double igt_stats_get_mean(igt_stats_t *stats);
+double igt_stats_get_median(igt_stats_t *stats);
 double igt_stats_get_variance(igt_stats_t *stats);
 double igt_stats_get_std_deviation(igt_stats_t *stats);
 
diff --git a/lib/tests/igt_stats.c b/lib/tests/igt_stats.c
index 468bba2..c4c6776 100644
--- a/lib/tests/igt_stats.c
+++ b/lib/tests/igt_stats.c
@@ -25,6 +25,8 @@ 
 #include "igt_core.h"
 #include "igt_stats.h"
 
+#define ARRAY_SIZE(arr) (sizeof(arr)/sizeof(arr[0]))
+
 static void push_fixture_1(igt_stats_t *stats)
 {
 	igt_stats_push(stats, 2);
@@ -78,6 +80,44 @@  static void test_range(void)
 	igt_assert(igt_stats_get_range(&stats) == 8);
 }
 
+/*
+ * Examples taken from: https://en.wikipedia.org/wiki/Quartile
+ * The values are shifted a bit to test we do indeed start by sorting data
+ * set.
+ */
+static void test_quartiles(void)
+{
+	static const uint64_t s1[] =
+		{ 47, 49, 6, 7, 15, 36, 39, 40, 41, 42, 43 };
+	static const uint64_t s2[] = { 40, 41, 7, 15, 36, 39 };
+	igt_stats_t stats;
+	double q1, q2, q3;
+
+	/* s1, odd number of data points */
+	igt_stats_init(&stats, ARRAY_SIZE(s1));
+	igt_stats_push_array(&stats, s1, ARRAY_SIZE(s1));
+
+	igt_stats_get_quartiles(&stats, &q1, &q2, &q3);
+	igt_assert_eq_double(q1, 25.5);
+	igt_assert_eq_double(q2, 40);
+	igt_assert_eq_double(q3, 42.5);
+	igt_assert_eq_double(igt_stats_get_median(&stats), 40);
+
+	igt_stats_fini(&stats);
+
+	/* s1, even number of data points */
+	igt_stats_init(&stats, ARRAY_SIZE(s2));
+	igt_stats_push_array(&stats, s2, ARRAY_SIZE(s2));
+
+	igt_stats_get_quartiles(&stats, &q1, &q2, &q3);
+	igt_assert_eq_double(q1, 15);
+	igt_assert_eq_double(q2, 37.5);
+	igt_assert_eq_double(q3, 40);
+	igt_assert_eq_double(igt_stats_get_median(&stats), 37.5);
+
+	igt_stats_fini(&stats);
+}
+
 static void test_mean(void)
 {
 	igt_stats_t stats;
@@ -150,6 +190,7 @@  igt_simple_main
 	test_init();
 	test_min_max();
 	test_range();
+	test_quartiles();
 	test_mean();
 	test_invalidate_mean();
 	test_std_deviation();