Message ID | 1435417696-28115-18-git-send-email-damien.lespiau@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Sat, Jun 27, 2015 at 04:08:15PM +0100, Damien Lespiau wrote: > More stuff, quite useful characteristics of a dataset. > > Signed-off-by: Damien Lespiau <damien.lespiau@intel.com> > --- > lib/igt_stats.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++++++ > lib/igt_stats.h | 5 ++ > lib/tests/igt_stats.c | 41 ++++++++++++++++ > 3 files changed, 177 insertions(+) > > diff --git a/lib/igt_stats.c b/lib/igt_stats.c > index 5c1f23f..66fd563 100644 > --- a/lib/igt_stats.c > +++ b/lib/igt_stats.c > @@ -23,6 +23,7 @@ > */ > > #include <math.h> > +#include <stdlib.h> > #include <string.h> > > #include "igt_core.h" > @@ -94,6 +95,7 @@ void igt_stats_init(igt_stats_t *stats, unsigned int capacity) > void igt_stats_fini(igt_stats_t *stats) > { > free(stats->values); > + free(stats->sorted); > } > > > @@ -158,6 +160,7 @@ void igt_stats_push(igt_stats_t *stats, uint64_t value) > stats->values[stats->n_values++] = value; > > stats->mean_variance_valid = false; > + stats->sorted_array_valid = false; > > if (value < stats->min) > stats->min = value; > @@ -220,6 +223,134 @@ uint64_t igt_stats_get_range(igt_stats_t *stats) > return igt_stats_get_max(stats) - igt_stats_get_min(stats); > } > > +static int cmp_u64(const void *pa, const void *pb) > +{ > + const uint64_t *a = pa, *b = pb; > + > + if (*a < *b) > + return -1; > + if (*a > *b) > + return 1; > + return 0; > +} > + > +static void igt_stats_ensure_sorted_values(igt_stats_t *stats) > +{ > + if (stats->sorted_array_valid) > + return; > + > + if (!stats->sorted) { > + stats->sorted = calloc(stats->capacity, sizeof(*stats->values)); > + igt_assert(stats->sorted); > + } Since sorted_array_valid = false on igt_stats_push, but we only allocate for the first call to sort, there's no safeguard against igt_stats_push(); igt_stats_get_median(); igt_stats_push(); igt_stats_get_median(); exploding. -Chris > + > + memcpy(stats->sorted, stats->values, > + sizeof(*stats->values) * stats->n_values); > + > + qsort(stats->sorted, stats->n_values, sizeof(*stats->values), cmp_u64); > + > + stats->sorted_array_valid = true; > +} > + > +/* > + * We use Tukey's hinge for our quartiles determination. > + * ends (end, lower_end) are exclusive. > + */ > +static double > +igt_stats_get_median_internal(igt_stats_t *stats, > + unsigned int start, unsigned int end, > + unsigned int *lower_end /* out */, > + unsigned int *upper_start /* out */) > +{ > + unsigned int mid, n_values = end - start; > + double median; > + > + igt_stats_ensure_sorted_values(stats); > + > + /* odd number of data points */ > + if (n_values % 2 == 1) { > + /* median is the value in the middle (actual datum) */ > + mid = start + n_values / 2; > + median = stats->sorted[mid]; > + > + /* the two halves contain the median value */ > + if (lower_end) > + *lower_end = mid + 1; > + if (upper_start) > + *upper_start = mid; > + > + /* even number of data points */ > + } else { > + /* > + * The middle is in between two indexes, 'mid' points at the > + * lower one. The median is then the average between those two > + * values. > + */ > + mid = start + n_values / 2 - 1; > + median = (stats->sorted[mid] + stats->sorted[mid + 1]) / 2.; > + > + if (lower_end) > + *lower_end = mid + 1; > + if (upper_start) > + *upper_start = mid + 1; > + } > + > + return median; > +} > + > +/** > + * igt_stats_get_quartiles: > + * @stats: An #igt_stats_t instance > + * @q1: (out): lower or 25th quartile > + * @q2: (out): median or 50th quartile > + * @q3: (out): upper or 75th quartile > + * > + * Retrieves the [quartiles](https://en.wikipedia.org/wiki/Quartile) of the > + * @stats dataset. > + */ > +void igt_stats_get_quartiles(igt_stats_t *stats, > + double *q1, double *q2, double *q3) > +{ > + unsigned int lower_end, upper_start; > + double ret; > + > + if (stats->n_values < 3) { > + if (q1) > + *q1 = 0.; > + if (q2) > + *q2 = 0.; > + if (q3) > + *q3 = 0.; > + return; > + } > + > + ret = igt_stats_get_median_internal(stats, 0, stats->n_values, > + &lower_end, &upper_start); > + if (q2) > + *q2 = ret; > + > + ret = igt_stats_get_median_internal(stats, 0, lower_end, NULL, NULL); > + if (q1) > + *q1 = ret; > + > + ret = igt_stats_get_median_internal(stats, upper_start, stats->n_values, > + NULL, NULL); > + if (q3) > + *q3 = ret; That's neat, I like that determination of the quartiles a lot. -Chris
On Sat, Jun 27, 2015 at 04:15:41PM +0100, Chris Wilson wrote: > > +static void igt_stats_ensure_sorted_values(igt_stats_t *stats) > > +{ > > + if (stats->sorted_array_valid) > > + return; > > + > > + if (!stats->sorted) { > > + stats->sorted = calloc(stats->capacity, sizeof(*stats->values)); > > + igt_assert(stats->sorted); > > + } > > Since sorted_array_valid = false on igt_stats_push, but we only allocate > for the first call to sort, there's no safeguard against > > igt_stats_push(); > igt_stats_get_median(); > igt_stats_push(); > igt_stats_get_median(); > > exploding. I believe it's fine (and I just sent a test telling the same story, but well, overflowing an small heap allocated array can often be "fine" with an overallocation from malloc() rounding up the size to git in a bin), as the size of the dataset is an invariant (I didn't implement your suggestion to grow the array at push() time). We allocate the full capacity here, not just the current n_values. I did have a think about a growing ->values array, but I think we know the number of data points we want upfront most of the time (if not always) when going to do some measurements?
On Sat, Jun 27, 2015 at 04:52:56PM +0100, Damien Lespiau wrote: > On Sat, Jun 27, 2015 at 04:15:41PM +0100, Chris Wilson wrote: > > > +static void igt_stats_ensure_sorted_values(igt_stats_t *stats) > > > +{ > > > + if (stats->sorted_array_valid) > > > + return; > > > + > > > + if (!stats->sorted) { > > > + stats->sorted = calloc(stats->capacity, sizeof(*stats->values)); > > > + igt_assert(stats->sorted); > > > + } > > > > Since sorted_array_valid = false on igt_stats_push, but we only allocate > > for the first call to sort, there's no safeguard against > > > > igt_stats_push(); > > igt_stats_get_median(); > > igt_stats_push(); > > igt_stats_get_median(); > > > > exploding. > > I believe it's fine (and I just sent a test telling the same story, but > well, overflowing an small heap allocated array can often be "fine" with > an overallocation from malloc() rounding up the size to git in a bin), > as the size of the dataset is an invariant (I didn't implement your > suggestion to grow the array at push() time). We allocate the full > capacity here, not just the current n_values. Ah, didn't realise that the capacity was fixed. > I did have a think about a growing ->values array, but I think we know > the number of data points we want upfront most of the time (if not > always) when going to do some measurements? No, often times I want to run a benchmark for x seconds and then look at the spread of samples. -Chris
On Sat, Jun 27, 2015 at 04:56:54PM +0100, Chris Wilson wrote: > > I did have a think about a growing ->values array, but I think we know > > the number of data points we want upfront most of the time (if not > > always) when going to do some measurements? > > No, often times I want to run a benchmark for x seconds and then look at > the spread of samples. Ah, fair enough, will make push() grow the array and corresponding adjustments then.
diff --git a/lib/igt_stats.c b/lib/igt_stats.c index 5c1f23f..66fd563 100644 --- a/lib/igt_stats.c +++ b/lib/igt_stats.c @@ -23,6 +23,7 @@ */ #include <math.h> +#include <stdlib.h> #include <string.h> #include "igt_core.h" @@ -94,6 +95,7 @@ void igt_stats_init(igt_stats_t *stats, unsigned int capacity) void igt_stats_fini(igt_stats_t *stats) { free(stats->values); + free(stats->sorted); } @@ -158,6 +160,7 @@ void igt_stats_push(igt_stats_t *stats, uint64_t value) stats->values[stats->n_values++] = value; stats->mean_variance_valid = false; + stats->sorted_array_valid = false; if (value < stats->min) stats->min = value; @@ -220,6 +223,134 @@ uint64_t igt_stats_get_range(igt_stats_t *stats) return igt_stats_get_max(stats) - igt_stats_get_min(stats); } +static int cmp_u64(const void *pa, const void *pb) +{ + const uint64_t *a = pa, *b = pb; + + if (*a < *b) + return -1; + if (*a > *b) + return 1; + return 0; +} + +static void igt_stats_ensure_sorted_values(igt_stats_t *stats) +{ + if (stats->sorted_array_valid) + return; + + if (!stats->sorted) { + stats->sorted = calloc(stats->capacity, sizeof(*stats->values)); + igt_assert(stats->sorted); + } + + memcpy(stats->sorted, stats->values, + sizeof(*stats->values) * stats->n_values); + + qsort(stats->sorted, stats->n_values, sizeof(*stats->values), cmp_u64); + + stats->sorted_array_valid = true; +} + +/* + * We use Tukey's hinge for our quartiles determination. + * ends (end, lower_end) are exclusive. + */ +static double +igt_stats_get_median_internal(igt_stats_t *stats, + unsigned int start, unsigned int end, + unsigned int *lower_end /* out */, + unsigned int *upper_start /* out */) +{ + unsigned int mid, n_values = end - start; + double median; + + igt_stats_ensure_sorted_values(stats); + + /* odd number of data points */ + if (n_values % 2 == 1) { + /* median is the value in the middle (actual datum) */ + mid = start + n_values / 2; + median = stats->sorted[mid]; + + /* the two halves contain the median value */ + if (lower_end) + *lower_end = mid + 1; + if (upper_start) + *upper_start = mid; + + /* even number of data points */ + } else { + /* + * The middle is in between two indexes, 'mid' points at the + * lower one. The median is then the average between those two + * values. + */ + mid = start + n_values / 2 - 1; + median = (stats->sorted[mid] + stats->sorted[mid + 1]) / 2.; + + if (lower_end) + *lower_end = mid + 1; + if (upper_start) + *upper_start = mid + 1; + } + + return median; +} + +/** + * igt_stats_get_quartiles: + * @stats: An #igt_stats_t instance + * @q1: (out): lower or 25th quartile + * @q2: (out): median or 50th quartile + * @q3: (out): upper or 75th quartile + * + * Retrieves the [quartiles](https://en.wikipedia.org/wiki/Quartile) of the + * @stats dataset. + */ +void igt_stats_get_quartiles(igt_stats_t *stats, + double *q1, double *q2, double *q3) +{ + unsigned int lower_end, upper_start; + double ret; + + if (stats->n_values < 3) { + if (q1) + *q1 = 0.; + if (q2) + *q2 = 0.; + if (q3) + *q3 = 0.; + return; + } + + ret = igt_stats_get_median_internal(stats, 0, stats->n_values, + &lower_end, &upper_start); + if (q2) + *q2 = ret; + + ret = igt_stats_get_median_internal(stats, 0, lower_end, NULL, NULL); + if (q1) + *q1 = ret; + + ret = igt_stats_get_median_internal(stats, upper_start, stats->n_values, + NULL, NULL); + if (q3) + *q3 = ret; +} + +/** + * igt_stats_get_median: + * @stats: An #igt_stats_t instance + * + * Retrieves the median of the @stats dataset. + */ +double igt_stats_get_median(igt_stats_t *stats) +{ + return igt_stats_get_median_internal(stats, 0, stats->n_values, + NULL, NULL); +} + /* * Algorithm popularised by Knuth in: * diff --git a/lib/igt_stats.h b/lib/igt_stats.h index 46437c4..887fa79 100644 --- a/lib/igt_stats.h +++ b/lib/igt_stats.h @@ -41,8 +41,10 @@ typedef struct { unsigned int capacity; unsigned int is_population : 1; unsigned int mean_variance_valid : 1; + unsigned int sorted_array_valid : 1; uint64_t min, max; double mean, variance; + uint64_t *sorted; } igt_stats_t; void igt_stats_init(igt_stats_t *stats, unsigned int capacity); @@ -55,7 +57,10 @@ void igt_stats_push_array(igt_stats_t *stats, uint64_t igt_stats_get_min(igt_stats_t *stats); uint64_t igt_stats_get_max(igt_stats_t *stats); uint64_t igt_stats_get_range(igt_stats_t *stats); +void igt_stats_get_quartiles(igt_stats_t *stats, + double *q1, double *q2, double *q3); double igt_stats_get_mean(igt_stats_t *stats); +double igt_stats_get_median(igt_stats_t *stats); double igt_stats_get_variance(igt_stats_t *stats); double igt_stats_get_std_deviation(igt_stats_t *stats); diff --git a/lib/tests/igt_stats.c b/lib/tests/igt_stats.c index 468bba2..c4c6776 100644 --- a/lib/tests/igt_stats.c +++ b/lib/tests/igt_stats.c @@ -25,6 +25,8 @@ #include "igt_core.h" #include "igt_stats.h" +#define ARRAY_SIZE(arr) (sizeof(arr)/sizeof(arr[0])) + static void push_fixture_1(igt_stats_t *stats) { igt_stats_push(stats, 2); @@ -78,6 +80,44 @@ static void test_range(void) igt_assert(igt_stats_get_range(&stats) == 8); } +/* + * Examples taken from: https://en.wikipedia.org/wiki/Quartile + * The values are shifted a bit to test we do indeed start by sorting data + * set. + */ +static void test_quartiles(void) +{ + static const uint64_t s1[] = + { 47, 49, 6, 7, 15, 36, 39, 40, 41, 42, 43 }; + static const uint64_t s2[] = { 40, 41, 7, 15, 36, 39 }; + igt_stats_t stats; + double q1, q2, q3; + + /* s1, odd number of data points */ + igt_stats_init(&stats, ARRAY_SIZE(s1)); + igt_stats_push_array(&stats, s1, ARRAY_SIZE(s1)); + + igt_stats_get_quartiles(&stats, &q1, &q2, &q3); + igt_assert_eq_double(q1, 25.5); + igt_assert_eq_double(q2, 40); + igt_assert_eq_double(q3, 42.5); + igt_assert_eq_double(igt_stats_get_median(&stats), 40); + + igt_stats_fini(&stats); + + /* s1, even number of data points */ + igt_stats_init(&stats, ARRAY_SIZE(s2)); + igt_stats_push_array(&stats, s2, ARRAY_SIZE(s2)); + + igt_stats_get_quartiles(&stats, &q1, &q2, &q3); + igt_assert_eq_double(q1, 15); + igt_assert_eq_double(q2, 37.5); + igt_assert_eq_double(q3, 40); + igt_assert_eq_double(igt_stats_get_median(&stats), 37.5); + + igt_stats_fini(&stats); +} + static void test_mean(void) { igt_stats_t stats; @@ -150,6 +190,7 @@ igt_simple_main test_init(); test_min_max(); test_range(); + test_quartiles(); test_mean(); test_invalidate_mean(); test_std_deviation();
More stuff, quite useful characteristics of a dataset. Signed-off-by: Damien Lespiau <damien.lespiau@intel.com> --- lib/igt_stats.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++++++ lib/igt_stats.h | 5 ++ lib/tests/igt_stats.c | 41 ++++++++++++++++ 3 files changed, 177 insertions(+)