diff mbox

[v6,08/15] qdist: add module to represent frequency distributions of data

Message ID 20160606234014.GA4418@flamenco (mailing list archive)
State New, archived
Headers show

Commit Message

Emilio Cota June 6, 2016, 11:40 p.m. UTC
On Fri, Jun 03, 2016 at 20:46:07 +0300, Sergey Fedorov wrote:
> On 03/06/16 20:29, Sergey Fedorov wrote:
> > On 03/06/16 20:22, Emilio G. Cota wrote:
> >> On Sat, May 28, 2016 at 21:15:06 +0300, Sergey Fedorov wrote:
> >>> On 25/05/16 04:13, Emilio G. Cota wrote:
> >>> (snip)
> >>>> +double qdist_avg(const struct qdist *dist)
> >>>> +{
> >>>> +    unsigned long count;
> >>>> +    size_t i;
> >>>> +    double ret = 0;
> >>>> +
> >>>> +    count = qdist_sample_count(dist);
> >>>> +    if (!count) {
> >>>> +        return NAN;
> >>>> +    }
> >>>> +    for (i = 0; i < dist->n; i++) {
> >>>> +        struct qdist_entry *e = &dist->entries[i];
> >>>> +
> >>>> +        ret += e->x * e->count / count;
> >>> Please use Welford’s method or something like that, see
> >>> http://stackoverflow.com/a/1346890.
> >> Yes, the way the mean is computed right now, we might suffer
> >> from underflow if count is huge. But I'd rather take that, than the
> >> perf penalty of an iterative method (such as the one used
> >> in Welford's). Note that we might have huge amounts of
> >> items, e.g. one item per head bucket in qht's occupancy qdist
> >> (and 0.5M head buckets is easy to achieve).
> >>
> >> If we were to use an iterative method, we'd need to do something
> >> like:
> >>
> >> double qdist_avg(const struct qdist *dist)
> >> {
> >>     size_t i, j;
> >>     double ret = 0;
> >>
> >>     if (!qdist_sample_count(dist)) {
> >>         return NAN;
> >>     }
> >>     /* compute moving average to prevent under/overflow */
> >>     for (i = 0; i < dist->n; i++) {
> >>         struct qdist_entry *e = &dist->entries[i];
> >>
> >>         for (j = 0; j < e->count; j++) {
> >>
> >>             ret += (e->x - ret) / (i + j + 1);
> >>         }
> >>     }
> >>     return ret;
> >> }
> >>
> >> Note that skipping the inner loop would be incorrect.
> > Ah, it's a shame. I'm wondering if there is some other algorithm that
> > could work for us?
> 
> Maybe something like
> https://en.wikipedia.org/wiki/Kahan_summation_algorithm could help?

That algorithm is overkill for what we're doing. Pairwise summation
should suffice:


		Emilio

Comments

Sergey Fedorov June 7, 2016, 2:06 p.m. UTC | #1
On 07/06/16 02:40, Emilio G. Cota wrote:
> On Fri, Jun 03, 2016 at 20:46:07 +0300, Sergey Fedorov wrote:
>> On 03/06/16 20:29, Sergey Fedorov wrote:
>>> On 03/06/16 20:22, Emilio G. Cota wrote:
>>>> On Sat, May 28, 2016 at 21:15:06 +0300, Sergey Fedorov wrote:
>>>>> On 25/05/16 04:13, Emilio G. Cota wrote:
>>>>> (snip)
>>>>>> +double qdist_avg(const struct qdist *dist)
>>>>>> +{
>>>>>> +    unsigned long count;
>>>>>> +    size_t i;
>>>>>> +    double ret = 0;
>>>>>> +
>>>>>> +    count = qdist_sample_count(dist);
>>>>>> +    if (!count) {
>>>>>> +        return NAN;
>>>>>> +    }
>>>>>> +    for (i = 0; i < dist->n; i++) {
>>>>>> +        struct qdist_entry *e = &dist->entries[i];
>>>>>> +
>>>>>> +        ret += e->x * e->count / count;
>>>>> Please use Welford’s method or something like that, see
>>>>> http://stackoverflow.com/a/1346890.
>>>> Yes, the way the mean is computed right now, we might suffer
>>>> from underflow if count is huge. But I'd rather take that, than the
>>>> perf penalty of an iterative method (such as the one used
>>>> in Welford's). Note that we might have huge amounts of
>>>> items, e.g. one item per head bucket in qht's occupancy qdist
>>>> (and 0.5M head buckets is easy to achieve).
>>>>
>>>> If we were to use an iterative method, we'd need to do something
>>>> like:
>>>>
>>>> double qdist_avg(const struct qdist *dist)
>>>> {
>>>>     size_t i, j;
>>>>     double ret = 0;
>>>>
>>>>     if (!qdist_sample_count(dist)) {
>>>>         return NAN;
>>>>     }
>>>>     /* compute moving average to prevent under/overflow */
>>>>     for (i = 0; i < dist->n; i++) {
>>>>         struct qdist_entry *e = &dist->entries[i];
>>>>
>>>>         for (j = 0; j < e->count; j++) {
>>>>
>>>>             ret += (e->x - ret) / (i + j + 1);
>>>>         }
>>>>     }
>>>>     return ret;
>>>> }
>>>>
>>>> Note that skipping the inner loop would be incorrect.
>>> Ah, it's a shame. I'm wondering if there is some other algorithm that
>>> could work for us?
>> Maybe something like
>> https://en.wikipedia.org/wiki/Kahan_summation_algorithm could help?
> That algorithm is overkill for what we're doing. Pairwise summation
> should suffice:
>
> diff --git a/util/qdist.c b/util/qdist.c
> index 3343640..909bd2b 100644
> --- a/util/qdist.c
> +++ b/util/qdist.c
> @@ -367,20 +367,34 @@ unsigned long qdist_sample_count(const struct qdist *dist)
>      return count;
>  }
>  
> +static double qdist_pairwise_avg(const struct qdist *dist, size_t index,
> +                                 size_t n, unsigned long count)
> +{
> +    if (n <= 2) {

We would like to amortize the overhead of the recursion by making the
cut-off sufficiently large.

> +        size_t i;
> +        double ret = 0;
> +
> +        for (i = 0; i < n; i++) {
> +            struct qdist_entry *e = &dist->entries[index + i];
> +
> +            ret += e->x * e->count / count;
> +        }
> +        return ret;
> +    } else {
> +        size_t n2 = n / 2;
> +
> +        return qdist_pairwise_avg(dist, index, n2, count) +
> +               qdist_pairwise_avg(dist, index + n2, n - n2, count);
> +    }
> +}
> +
>  double qdist_avg(const struct qdist *dist)
>  {
>      unsigned long count;
> -    size_t i;
> -    double ret = 0;
>  
>      count = qdist_sample_count(dist);
>      if (!count) {
>          return NAN;
>      }
> -    for (i = 0; i < dist->n; i++) {
> -        struct qdist_entry *e = &dist->entries[i];
> -
> -        ret += e->x * e->count / count;
> -    }
> -    return ret;
> +    return qdist_pairwise_avg(dist, 0, dist->n, count);
>  }

Otherwise looks good.

Kind regards,
Sergey
Emilio Cota June 7, 2016, 10:53 p.m. UTC | #2
On Tue, Jun 07, 2016 at 17:06:16 +0300, Sergey Fedorov wrote:
> On 07/06/16 02:40, Emilio G. Cota wrote:
> > On Fri, Jun 03, 2016 at 20:46:07 +0300, Sergey Fedorov wrote:
> >> Maybe something like
> >> https://en.wikipedia.org/wiki/Kahan_summation_algorithm could help?
> > That algorithm is overkill for what we're doing. Pairwise summation
> > should suffice:
> >
> > diff --git a/util/qdist.c b/util/qdist.c
> > index 3343640..909bd2b 100644
> > --- a/util/qdist.c
> > +++ b/util/qdist.c
> > @@ -367,20 +367,34 @@ unsigned long qdist_sample_count(const struct qdist *dist)
> >      return count;
> >  }
> >  
> > +static double qdist_pairwise_avg(const struct qdist *dist, size_t index,
> > +                                 size_t n, unsigned long count)
> > +{
> > +    if (n <= 2) {
> 
> We would like to amortize the overhead of the recursion by making the
> cut-off sufficiently large.

Yes, this was just for showing what it looked like.

We can use 128 here like JuliaLang does:
  https://github.com/JuliaLang/julia/blob/d98f2c0dcd/base/arraymath.jl#L366

(snip)
> Otherwise looks good.

Thanks!

		Emilio
Sergey Fedorov June 8, 2016, 1:09 p.m. UTC | #3
On 08/06/16 01:53, Emilio G. Cota wrote:
> On Tue, Jun 07, 2016 at 17:06:16 +0300, Sergey Fedorov wrote:
>> On 07/06/16 02:40, Emilio G. Cota wrote:
>>> On Fri, Jun 03, 2016 at 20:46:07 +0300, Sergey Fedorov wrote:
>>>> Maybe something like
>>>> https://en.wikipedia.org/wiki/Kahan_summation_algorithm could help?
>>> That algorithm is overkill for what we're doing. Pairwise summation
>>> should suffice:
>>>
>>> diff --git a/util/qdist.c b/util/qdist.c
>>> index 3343640..909bd2b 100644
>>> --- a/util/qdist.c
>>> +++ b/util/qdist.c
>>> @@ -367,20 +367,34 @@ unsigned long qdist_sample_count(const struct qdist *dist)
>>>      return count;
>>>  }
>>>  
>>> +static double qdist_pairwise_avg(const struct qdist *dist, size_t index,
>>> +                                 size_t n, unsigned long count)
>>> +{
>>> +    if (n <= 2) {
>> We would like to amortize the overhead of the recursion by making the
>> cut-off sufficiently large.
> Yes, this was just for showing what it looked like.
>
> We can use 128 here like JuliaLang does:
>   https://github.com/JuliaLang/julia/blob/d98f2c0dcd/base/arraymath.jl#L366
>
>

Probably cut-off of ~10 items would be enough to amortize the recursion
in C.

Kind regards,
Sergey
diff mbox

Patch

diff --git a/util/qdist.c b/util/qdist.c
index 3343640..909bd2b 100644
--- a/util/qdist.c
+++ b/util/qdist.c
@@ -367,20 +367,34 @@  unsigned long qdist_sample_count(const struct qdist *dist)
     return count;
 }
 
+static double qdist_pairwise_avg(const struct qdist *dist, size_t index,
+                                 size_t n, unsigned long count)
+{
+    if (n <= 2) {
+        size_t i;
+        double ret = 0;
+
+        for (i = 0; i < n; i++) {
+            struct qdist_entry *e = &dist->entries[index + i];
+
+            ret += e->x * e->count / count;
+        }
+        return ret;
+    } else {
+        size_t n2 = n / 2;
+
+        return qdist_pairwise_avg(dist, index, n2, count) +
+               qdist_pairwise_avg(dist, index + n2, n - n2, count);
+    }
+}
+
 double qdist_avg(const struct qdist *dist)
 {
     unsigned long count;
-    size_t i;
-    double ret = 0;
 
     count = qdist_sample_count(dist);
     if (!count) {
         return NAN;
     }
-    for (i = 0; i < dist->n; i++) {
-        struct qdist_entry *e = &dist->entries[i];
-
-        ret += e->x * e->count / count;
-    }
-    return ret;
+    return qdist_pairwise_avg(dist, 0, dist->n, count);
 }