[v2] Btrfs: heuristic replace heap sort with radix sort

Message ID 20171203213033.28258-1-nefelim4ag@gmail.com (mailing list archive)
State New, archived

Commit Message

Timofey Titovets Dec. 3, 2017, 9:30 p.m. UTC
The slowest part of the heuristic is currently the kernel heap sort():
it can take up to 55% of the runtime, sorting bucket items.

As sorting is called on most data sets to compute byte_core_set_size
correctly, the only way to speed up the heuristic is to speed up the
bucket sort.

Add a general radix_sort function.
Radix sort requires two buffers: one the full size of the input array
and one to store counters (jump addresses).

This increases memory usage per heuristic workspace by 1 KiB:
8 KiB + 1 KiB -> 8 KiB + 2 KiB

This is an LSD radix sort; I use a 4-bit base to keep the counters
array acceptably small (16 elements * 8 bytes).

The radix sort implementation has several adjustment points; I added
them so that the radix sort can be used generally in the kernel, like
heap sort, if needed.

Performance was tested on a userspace copy of the heuristic code;
throughput:
    - average <-> random data: ~3500 MiB/s - heap  sort
    - average <-> random data: ~6000 MiB/s - radix sort

Changes:
  v1 -> v2:
    - Tested on Big Endian
    - Drop most multiply operations
    - Separately allocate sort buffer

Signed-off-by: Timofey Titovets <nefelim4ag@gmail.com>
---
 fs/btrfs/compression.c | 147 ++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 140 insertions(+), 7 deletions(-)
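
For readers new to LSD radix sort, here is a minimal userspace sketch
of one counting pass over 4-bit digits. It is illustrative only, not
part of the patch, and the name counting_pass is made up:

#include <stdint.h>
#include <string.h>

#define RADIX_BASE 4
#define COUNTERS_SIZE (1 << RADIX_BASE)	/* 16 counters */

/* One stable counting-sort pass over the 4-bit digit at @shift */
static void counting_pass(const uint64_t *in, uint64_t *out, int num,
			  int shift)
{
	uint64_t counters[COUNTERS_SIZE];
	int i;

	memset(counters, 0, sizeof(counters));

	/* Histogram: how many keys have each 4-bit digit value */
	for (i = 0; i < num; i++)
		counters[(in[i] >> shift) & (COUNTERS_SIZE - 1)]++;

	/* Prefix sums turn counts into end offsets in the output */
	for (i = 1; i < COUNTERS_SIZE; i++)
		counters[i] += counters[i - 1];

	/* Walk backwards so equal digits keep their order (stability) */
	for (i = num - 1; i >= 0; i--)
		out[--counters[(in[i] >> shift) & (COUNTERS_SIZE - 1)]] = in[i];
}

Repeating this pass for shift = 0, 4, 8, ... while swapping the roles
of in and out yields a full ascending sort; the patch below fuses two
passes per loop iteration so the data always ends up back in the
original array.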

Comments

David Sterba Dec. 4, 2017, 8:36 p.m. UTC | #1
On Mon, Dec 04, 2017 at 12:30:33AM +0300, Timofey Titovets wrote:
> The slowest part of the heuristic is currently the kernel heap sort():
> it can take up to 55% of the runtime, sorting bucket items.
> 
> As sorting is called on most data sets to compute byte_core_set_size
> correctly, the only way to speed up the heuristic is to speed up the
> bucket sort.
> 
> Add a general radix_sort function.
> Radix sort requires two buffers: one the full size of the input array
> and one to store counters (jump addresses).
> 
> This increases memory usage per heuristic workspace by 1 KiB:
> 8 KiB + 1 KiB -> 8 KiB + 2 KiB

This is acceptable.

> This is an LSD radix sort; I use a 4-bit base to keep the counters
> array acceptably small (16 elements * 8 bytes).
> 
> The radix sort implementation has several adjustment points; I added
> them so that the radix sort can be used generally in the kernel, like
> heap sort, if needed.
> 
> Performance was tested on a userspace copy of the heuristic code;
> throughput:
>     - average <-> random data: ~3500 MiB/s - heap  sort
>     - average <-> random data: ~6000 MiB/s - radix sort

Sounds good!

> Changes:
>   v1 -> v2:
>     - Tested on Big Endian
>     - Drop most multiply operations
>     - Separately allocate sort buffer
> 
> Signed-off-by: Timofey Titovets <nefelim4ag@gmail.com>

The code looks good. There are some coding style issues and we don't
use the uintX_t types; I'll fix that.
David Sterba Dec. 4, 2017, 8:47 p.m. UTC | #2
On Mon, Dec 04, 2017 at 12:30:33AM +0300, Timofey Titovets wrote:
> [...]
> +static void radix_sort(void *array, void *array_buf,
> +		       int num,
> +		       const void *max_cell,
> +		       uint64_t (*get_num)(const void *, int i),
> +		       void (*copy_cell)(void *dest, int dest_i,
> +					 void* src, int src_i),
> +		       uint8_t (*get4bits)(uint64_t num, int shift))

Do you have plans to use this function further? If not, passing the
callbacks (copy_cell and get4bits) is not necessary and can be
opencoded.
Timofey Titovets Dec. 4, 2017, 9:24 p.m. UTC | #3
2017-12-04 23:47 GMT+03:00 David Sterba <dsterba@suse.cz>:
> On Mon, Dec 04, 2017 at 12:30:33AM +0300, Timofey Titovets wrote:
>> [...]
>> +static void radix_sort(void *array, void *array_buf,
>> +                    int num,
>> +                    const void *max_cell,
>> +                    uint64_t (*get_num)(const void *, int i),
>> +                    void (*copy_cell)(void *dest, int dest_i,
>> +                                      void* src, int src_i),
>> +                    uint8_t (*get4bits)(uint64_t num, int shift))
>
> Do you have plans to use this function further? If not, passing the
> callbacks (copy_cell and get4bits) is not necessary and can be
> opencoded.

(Sorry about the uintX_t types - I copy-pasted the code late at night
after testing and forgot to fix them...)

I designed the radix_sort interface on the assumption that the
function could be useful elsewhere in the kernel, so I tried to make
it (maybe not nicely) flexible enough, like the interface the kernel
has for heap sort, i.e. input-data agnostic.

For now, though, I'm not familiar enough with other kernel code to say
where this memory/speed trade-off would be usable and acceptable.
(For example, the Valgrind call graph shows 76,445 vs 22,191 for heap
vs radix sort on the same data - a CPU cycle estimate, if I understood
it correctly.)

So no, I don't have a "strong" plan; I just believe others may find it
useful, and if that happens it can easily be moved to some
lib/radix_sort.{c,h} with a boot test like sort() has, etc.
(For the same reason I kept the "big" 64-bit counters instead of
32-bit ones; the counter size currently limits the maximum input array
size to 2^64 - 1.)

As you have more experience in kernel hacking, I will follow your
advice here: we can either leave it as is, assuming someone can reuse
it later, or clean it up and optimize it for this specific case.

Thanks!
Timofey Titovets Dec. 5, 2017, 7:55 a.m. UTC | #4
2017-12-05 0:24 GMT+03:00 Timofey Titovets <nefelim4ag@gmail.com>:
> 2017-12-04 23:47 GMT+03:00 David Sterba <dsterba@suse.cz>:
>> On Mon, Dec 04, 2017 at 12:30:33AM +0300, Timofey Titovets wrote:
>>> [...]
>>
>> Do you have plans to use this function further? If not, passing the
>> callbacks (copy_cell and get4bits) is not necessary and can be
>> opencoded.
>
> [...]

I will send a cleaned-up v3 with style fixes and small cleanups.
I'm not really sure about opencoding get4bits, copy_cell and get_num,
because that makes the radix sort harder to read: it doesn't make the
code simpler, it just adds more copy-paste, and after compilation it
makes no difference.

Thanks.
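
For reference, an opencoded pass along the lines David suggests might
look roughly like this. This is a sketch only, assuming the patch's
struct bucket_item with a count member; the name radix_pass is
hypothetical, and v3 may differ:

/*
 * One opencoded counting pass working directly on struct bucket_item,
 * with the digit inversion from get4bits() folded in.
 */
static void radix_pass(const struct bucket_item *in,
		       struct bucket_item *out, int num, int shift)
{
	u64 counters[COUNTERS_SIZE];
	u8 addr;
	int i;

	memset(counters, 0, sizeof(counters));

	for (i = 0; i < num; i++) {
		/* Digit is inverted so the final order is descending */
		addr = (COUNTERS_SIZE - 1) -
		       ((in[i].count >> shift) % COUNTERS_SIZE);
		counters[addr]++;
	}

	for (i = 1; i < COUNTERS_SIZE; i++)
		counters[i] += counters[i - 1];

	for (i = num - 1; i >= 0; i--) {
		addr = (COUNTERS_SIZE - 1) -
		       ((in[i].count >> shift) % COUNTERS_SIZE);
		counters[addr]--;
		out[counters[addr]] = in[i];
	}
}

This supports Timofey's point that after inlining the generated code
should be essentially the same; the difference is mainly readability.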

Patch

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index ae016699d13e..19b52982deda 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -33,7 +33,6 @@
 #include <linux/bit_spinlock.h>
 #include <linux/slab.h>
 #include <linux/sched/mm.h>
-#include <linux/sort.h>
 #include <linux/log2.h>
 #include "ctree.h"
 #include "disk-io.h"
@@ -752,6 +751,8 @@ struct heuristic_ws {
 	u32 sample_size;
 	/* Buckets store counters for each byte value */
 	struct bucket_item *bucket;
+	/* Sorting buffer */
+	struct bucket_item *bucket_b;
 	struct list_head list;
 };
 
@@ -763,6 +764,7 @@ static void free_heuristic_ws(struct list_head *ws)
 
 	kvfree(workspace->sample);
 	kfree(workspace->bucket);
+	kfree(workspace->bucket_b);
 	kfree(workspace);
 }
 
@@ -782,6 +784,10 @@ static struct list_head *alloc_heuristic_ws(void)
 	if (!ws->bucket)
 		goto fail;
 
+	ws->bucket_b = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket_b), GFP_KERNEL);
+	if (!ws->bucket_b)
+		goto fail;
+
 	INIT_LIST_HEAD(&ws->list);
 	return &ws->list;
 fail:
@@ -1278,13 +1284,136 @@ static u32 shannon_entropy(struct heuristic_ws *ws)
 	return entropy_sum * 100 / entropy_max;
 }
 
-/* Compare buckets by size, ascending */
-static int bucket_comp_rev(const void *lv, const void *rv)
+#define RADIX_BASE 4
+#define COUNTERS_SIZE (1 << RADIX_BASE)
+
+static inline uint8_t get4bits(uint64_t num, int shift) {
+	uint8_t low4bits;
+	num = num >> shift;
+	/* Reverse order */
+	low4bits = (COUNTERS_SIZE - 1) - (num % COUNTERS_SIZE);
+	return low4bits;
+}
+
+static inline void copy_cell(void *dst, int dest_i, void *src, int src_i)
 {
-	const struct bucket_item *l = (const struct bucket_item *)lv;
-	const struct bucket_item *r = (const struct bucket_item *)rv;
+	struct bucket_item *dstv = (struct bucket_item *) dst;
+	struct bucket_item *srcv = (struct bucket_item *) src;
+	dstv[dest_i] = srcv[src_i];
+}
 
-	return r->count - l->count;
+static inline uint64_t get_num(const void *a, int i)
+{
+	struct bucket_item *av = (struct bucket_item *) a;
+	return av[i].count;
+}
+
+/*
+ * Use 4 bits as radix base
+ * Use 16 uint64_t counters for calculating new position in buf array
+ *
+ * @array     - array that will be sorted
+ * @array_buf - buffer array to store sorting results,
+ *              must be equal in size to @array
+ * @num       - array size
+ * @max_cell  - pointer to the element with the maximum possible value,
+ *              used to cap radix sort iterations if the maximum
+ *              value is known before calling sort
+ * @get_num   - function to extract number from array
+ * @copy_cell - function to copy data from array to array_buf
+ *              and vice versa
+ * @get4bits  - function to get 4 bits from number at specified offset
+ */
+
+static void radix_sort(void *array, void *array_buf,
+		       int num,
+		       const void *max_cell,
+		       uint64_t (*get_num)(const void *, int i),
+		       void (*copy_cell)(void *dest, int dest_i,
+					 void* src, int src_i),
+		       uint8_t (*get4bits)(uint64_t num, int shift))
+{
+	u64 max_num;
+	uint64_t buf_num;
+	uint64_t counters[COUNTERS_SIZE];
+	uint64_t new_addr;
+	int i;
+	int addr;
+	int bitlen;
+	int shift;
+
+	/*
+	 * Try to avoid useless loop iterations for small
+	 * numbers stored in big counters,
+	 * example: 48 33 4 ... in a 64bit array
+	 */
+	if (!max_cell) {
+		max_num = get_num(array, 0);
+		for (i = 1; i < num; i++) {
+			buf_num = get_num(array, i);
+			if (buf_num > max_num)
+				max_num = buf_num;
+		}
+	} else {
+		max_num = get_num(max_cell, 0);
+	}
+
+	buf_num = ilog2(max_num);
+	bitlen = ALIGN(buf_num, RADIX_BASE*2);
+
+	shift = 0;
+	while (shift < bitlen) {
+		memset(counters, 0, sizeof(counters));
+
+		for (i = 0; i < num; i++) {
+			buf_num = get_num(array, i);
+			addr = get4bits(buf_num, shift);
+			counters[addr]++;
+		}
+
+		for (i = 1; i < COUNTERS_SIZE; i++) {
+			counters[i] += counters[i-1];
+		}
+
+		for (i = (num - 1); i >= 0; i --) {
+			buf_num = get_num(array, i);
+			addr = get4bits(buf_num, shift);
+			counters[addr]--;
+			new_addr = counters[addr];
+			copy_cell(array_buf, new_addr, array, i);
+		}
+
+		shift += RADIX_BASE;
+
+		/*
+		 * A normal radix sort would now move the data
+		 * from the tmp array back to the main one.
+		 * But that requires some CPU time.
+		 * Avoid it by doing another sort iteration
+		 * into the origin array instead of a memcpy()
+		 */
+		memset(counters, 0, sizeof(counters));
+
+		for (i = 0; i < num; i ++) {
+			buf_num = get_num(array_buf, i);
+			addr = get4bits(buf_num, shift);
+			counters[addr]++;
+		}
+
+		for (i = 1; i < COUNTERS_SIZE; i++) {
+			counters[i] += counters[i-1];
+		}
+
+		for (i = (num - 1); i >= 0; i--) {
+			buf_num = get_num(array_buf, i);
+			addr = get4bits(buf_num, shift);
+			counters[addr]--;
+			new_addr = counters[addr];
+			copy_cell(array, new_addr, array_buf, i);
+		}
+
+		shift += RADIX_BASE;
+	}
 }
 
 /*
@@ -1312,9 +1441,13 @@ static int byte_core_set_size(struct heuristic_ws *ws)
 	u32 coreset_sum = 0;
 	const u32 core_set_threshold = ws->sample_size * 90 / 100;
 	struct bucket_item *bucket = ws->bucket;
+	struct bucket_item max_cell;
 
 	/* Sort in reverse order */
-	sort(bucket, BUCKET_SIZE, sizeof(*bucket), &bucket_comp_rev, NULL);
+	max_cell.count = MAX_SAMPLE_SIZE;
+	radix_sort(ws->bucket, ws->bucket_b,
+		   BUCKET_SIZE, &max_cell,
+		   get_num, copy_cell, get4bits);
 
 	for (i = 0; i < BYTE_CORE_SET_LOW; i++)
 		coreset_sum += bucket[i].count;
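
A note on the "Sort in reverse order" comment above: get4bits()
inverts every 4-bit digit, so an ascending radix sort over the
inverted digits produces a descending order over the original keys.
A small standalone userspace demo of the idea (illustrative only, not
part of the patch):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define RADIX_BASE 4
#define COUNTERS_SIZE (1 << RADIX_BASE)

static uint8_t get4bits_rev(uint64_t num, int shift)
{
	return (COUNTERS_SIZE - 1) - ((num >> shift) % COUNTERS_SIZE);
}

int main(void)
{
	uint64_t a[] = { 48, 33, 4, 255, 7 };
	uint64_t b[5];
	uint64_t counters[COUNTERS_SIZE];
	uint64_t *in = a, *out = b, *tmp;
	int num = 5;
	int shift, i;

	/* 255 fits in 8 bits, so two 4-bit passes are enough */
	for (shift = 0; shift < 8; shift += RADIX_BASE) {
		memset(counters, 0, sizeof(counters));
		for (i = 0; i < num; i++)
			counters[get4bits_rev(in[i], shift)]++;
		for (i = 1; i < COUNTERS_SIZE; i++)
			counters[i] += counters[i - 1];
		for (i = num - 1; i >= 0; i--)
			out[--counters[get4bits_rev(in[i], shift)]] = in[i];
		/* Swap roles: the next pass reads what this pass wrote */
		tmp = in;
		in = out;
		out = tmp;
	}

	for (i = 0; i < num; i++)
		printf("%llu ", (unsigned long long)in[i]);
	printf("\n");	/* prints: 255 48 33 7 4 */
	return 0;
}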