diff mbox

[1/3] random: replace non-blocking pool with a Chacha20-based CRNG

Message ID 1462170413-7164-2-git-send-email-tytso@mit.edu (mailing list archive)
State Not Applicable
Delegated to: Herbert Xu
Headers show

Commit Message

Theodore Ts'o May 2, 2016, 6:26 a.m. UTC
The CRNG is faster, and we don't pretend to track entropy usage in the
CRNG any more.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 crypto/chacha20_generic.c |  61 ----------
 drivers/char/random.c     | 282 ++++++++++++++++++++++++++++++++++------------
 include/crypto/chacha20.h |   1 +
 lib/Makefile              |   2 +-
 lib/chacha20.c            |  79 +++++++++++++
 5 files changed, 294 insertions(+), 131 deletions(-)
 create mode 100644 lib/chacha20.c

Comments

Stephan Mueller May 3, 2016, 8:50 a.m. UTC | #1
Am Montag, 2. Mai 2016, 02:26:51 schrieb Theodore Ts'o:

Hi Theodore,

> The CRNG is faster, and we don't pretend to track entropy usage in the
> CRNG any more.

In general, I have no concerns with this approach either. And thank you that 
some of my concerns are addressed.

There are few more concerns left open. I would suggest I would write them up 
with a proposal on how to address them.

Some comments inlne:
> 
> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
> ---
>  crypto/chacha20_generic.c |  61 ----------
>  drivers/char/random.c     | 282
> ++++++++++++++++++++++++++++++++++------------ include/crypto/chacha20.h | 
>  1 +
>  lib/Makefile              |   2 +-
>  lib/chacha20.c            |  79 +++++++++++++
>  5 files changed, 294 insertions(+), 131 deletions(-)
>  create mode 100644 lib/chacha20.c
> 
> diff --git a/crypto/chacha20_generic.c b/crypto/chacha20_generic.c
> index da9c899..1cab831 100644
> --- a/crypto/chacha20_generic.c
> +++ b/crypto/chacha20_generic.c
> @@ -15,72 +15,11 @@
>  #include <linux/module.h>
>  #include <crypto/chacha20.h>
> 
> -static inline u32 rotl32(u32 v, u8 n)
> -{
> -	return (v << n) | (v >> (sizeof(v) * 8 - n));
> -}
> -
>  static inline u32 le32_to_cpuvp(const void *p)
>  {
>  	return le32_to_cpup(p);
>  }
> 
> -static void chacha20_block(u32 *state, void *stream)
> -{
> -	u32 x[16], *out = stream;
> -	int i;
> -
> -	for (i = 0; i < ARRAY_SIZE(x); i++)
> -		x[i] = state[i];
> -
> -	for (i = 0; i < 20; i += 2) {
> -		x[0]  += x[4];    x[12] = rotl32(x[12] ^ x[0],  16);
> -		x[1]  += x[5];    x[13] = rotl32(x[13] ^ x[1],  16);
> -		x[2]  += x[6];    x[14] = rotl32(x[14] ^ x[2],  16);
> -		x[3]  += x[7];    x[15] = rotl32(x[15] ^ x[3],  16);
> -
> -		x[8]  += x[12];   x[4]  = rotl32(x[4]  ^ x[8],  12);
> -		x[9]  += x[13];   x[5]  = rotl32(x[5]  ^ x[9],  12);
> -		x[10] += x[14];   x[6]  = rotl32(x[6]  ^ x[10], 12);
> -		x[11] += x[15];   x[7]  = rotl32(x[7]  ^ x[11], 12);
> -
> -		x[0]  += x[4];    x[12] = rotl32(x[12] ^ x[0],   8);
> -		x[1]  += x[5];    x[13] = rotl32(x[13] ^ x[1],   8);
> -		x[2]  += x[6];    x[14] = rotl32(x[14] ^ x[2],   8);
> -		x[3]  += x[7];    x[15] = rotl32(x[15] ^ x[3],   8);
> -
> -		x[8]  += x[12];   x[4]  = rotl32(x[4]  ^ x[8],   7);
> -		x[9]  += x[13];   x[5]  = rotl32(x[5]  ^ x[9],   7);
> -		x[10] += x[14];   x[6]  = rotl32(x[6]  ^ x[10],  7);
> -		x[11] += x[15];   x[7]  = rotl32(x[7]  ^ x[11],  7);
> -
> -		x[0]  += x[5];    x[15] = rotl32(x[15] ^ x[0],  16);
> -		x[1]  += x[6];    x[12] = rotl32(x[12] ^ x[1],  16);
> -		x[2]  += x[7];    x[13] = rotl32(x[13] ^ x[2],  16);
> -		x[3]  += x[4];    x[14] = rotl32(x[14] ^ x[3],  16);
> -
> -		x[10] += x[15];   x[5]  = rotl32(x[5]  ^ x[10], 12);
> -		x[11] += x[12];   x[6]  = rotl32(x[6]  ^ x[11], 12);
> -		x[8]  += x[13];   x[7]  = rotl32(x[7]  ^ x[8],  12);
> -		x[9]  += x[14];   x[4]  = rotl32(x[4]  ^ x[9],  12);
> -
> -		x[0]  += x[5];    x[15] = rotl32(x[15] ^ x[0],   8);
> -		x[1]  += x[6];    x[12] = rotl32(x[12] ^ x[1],   8);
> -		x[2]  += x[7];    x[13] = rotl32(x[13] ^ x[2],   8);
> -		x[3]  += x[4];    x[14] = rotl32(x[14] ^ x[3],   8);
> -
> -		x[10] += x[15];   x[5]  = rotl32(x[5]  ^ x[10],  7);
> -		x[11] += x[12];   x[6]  = rotl32(x[6]  ^ x[11],  7);
> -		x[8]  += x[13];   x[7]  = rotl32(x[7]  ^ x[8],   7);
> -		x[9]  += x[14];   x[4]  = rotl32(x[4]  ^ x[9],   7);
> -	}
> -
> -	for (i = 0; i < ARRAY_SIZE(x); i++)
> -		out[i] = cpu_to_le32(x[i] + state[i]);
> -
> -	state[12]++;
> -}
> -
>  static void chacha20_docrypt(u32 *state, u8 *dst, const u8 *src,
>  			     unsigned int bytes)
>  {
> diff --git a/drivers/char/random.c b/drivers/char/random.c
> index b583e53..95f4451 100644
> --- a/drivers/char/random.c
> +++ b/drivers/char/random.c
> @@ -260,6 +260,7 @@
>  #include <linux/irq.h>
>  #include <linux/syscalls.h>
>  #include <linux/completion.h>
> +#include <crypto/chacha20.h>
> 
>  #include <asm/processor.h>
>  #include <asm/uaccess.h>
> @@ -412,6 +413,15 @@ static struct fasync_struct *fasync;
>  static DEFINE_SPINLOCK(random_ready_list_lock);
>  static LIST_HEAD(random_ready_list);
> 
> +/*
> + * crng_init =  0 --> Uninitialized
> + *		2 --> Initialized
> + *		3 --> Initialized from input_pool
> + */
> +static int crng_init = 0;

shouldn't that be an atomic_t ?

> +#define crng_ready() (likely(crng_init >= 2))
> +static void process_random_ready_list(void);
> +
>  /**********************************************************************
>   *
>   * OS independent entropy store.   Here are the functions which handle
> @@ -441,10 +451,13 @@ struct entropy_store {
>  	__u8 last_data[EXTRACT_SIZE];
>  };
> 
> +static ssize_t extract_entropy(struct entropy_store *r, void *buf,
> +			       size_t nbytes, int min, int rsvd);
> +
> +static int crng_reseed(struct entropy_store *r);
>  static void push_to_pool(struct work_struct *work);
>  static __u32 input_pool_data[INPUT_POOL_WORDS];
>  static __u32 blocking_pool_data[OUTPUT_POOL_WORDS];
> -static __u32 nonblocking_pool_data[OUTPUT_POOL_WORDS];
> 
>  static struct entropy_store input_pool = {
>  	.poolinfo = &poolinfo_table[0],
> @@ -465,16 +478,6 @@ static struct entropy_store blocking_pool = {
>  					push_to_pool),
>  };
> 
> -static struct entropy_store nonblocking_pool = {
> -	.poolinfo = &poolinfo_table[1],
> -	.name = "nonblocking",
> -	.pull = &input_pool,
> -	.lock = __SPIN_LOCK_UNLOCKED(nonblocking_pool.lock),
> -	.pool = nonblocking_pool_data,
> -	.push_work = __WORK_INITIALIZER(nonblocking_pool.push_work,
> -					push_to_pool),
> -};
> -
>  static __u32 const twist_table[8] = {
>  	0x00000000, 0x3b6e20c8, 0x76dc4190, 0x4db26158,
>  	0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278 };
> @@ -677,12 +680,6 @@ retry:
>  	if (!r->initialized && r->entropy_total > 128) {
>  		r->initialized = 1;
>  		r->entropy_total = 0;
> -		if (r == &nonblocking_pool) {
> -			prandom_reseed_late();
> -			process_random_ready_list();
> -			wake_up_all(&urandom_init_wait);
> -			pr_notice("random: %s pool is initialized\n", r-
>name);
> -		}
>  	}
> 
>  	trace_credit_entropy_bits(r->name, nbits,
> @@ -692,30 +689,27 @@ retry:
>  	if (r == &input_pool) {
>  		int entropy_bits = entropy_count >> ENTROPY_SHIFT;
> 
> +		if (crng_init < 3 && entropy_bits >= 128) {
> +			(void) crng_reseed(r);
> +			entropy_bits = r->entropy_count >> ENTROPY_SHIFT;
> +		}
> +
>  		/* should we wake readers? */
>  		if (entropy_bits >= random_read_wakeup_bits) {
>  			wake_up_interruptible(&random_read_wait);
>  			kill_fasync(&fasync, SIGIO, POLL_IN);
>  		}
>  		/* If the input pool is getting full, send some
> -		 * entropy to the two output pools, flipping back and
> -		 * forth between them, until the output pools are 75%
> -		 * full.
> +		 * entropy to the blocking pool until it is 75% full.
>  		 */
>  		if (entropy_bits > random_write_wakeup_bits &&
>  		    r->initialized &&
>  		    r->entropy_total >= 2*random_read_wakeup_bits) {
> -			static struct entropy_store *last = &blocking_pool;
>  			struct entropy_store *other = &blocking_pool;
> 
> -			if (last == &blocking_pool)
> -				other = &nonblocking_pool;
>  			if (other->entropy_count <=
> -			    3 * other->poolinfo->poolfracbits / 4)
> -				last = other;
> -			if (last->entropy_count <=
> -			    3 * last->poolinfo->poolfracbits / 4) {
> -				schedule_work(&last->push_work);
> +			    3 * other->poolinfo->poolfracbits / 4) {
> +				schedule_work(&other->push_work);
>  				r->entropy_total = 0;
>  			}
>  		}
> @@ -735,6 +729,158 @@ static void credit_entropy_bits_safe(struct
> entropy_store *r, int nbits)
> 
>  /*********************************************************************
>   *
> + * CRNG using CHACHA20
> + *
> + *********************************************************************/
> +
> +#define CRNG_RESEED_INTERVAL (300*HZ)
> +
> +struct crng_state {
> +	__u32		state[16];
> +	unsigned long	init_time;
> +	spinlock_t	lock;
> +};
> +
> +struct crng_state primary_crng = {
> +	.lock = __SPIN_LOCK_UNLOCKED(primary_crng.lock),
> +};
> +static DECLARE_WAIT_QUEUE_HEAD(crng_init_wait);
> +
> +static void _initialize_crng(struct crng_state *crng)
> +{
> +	int		i;
> +	unsigned long	rv;

Why do you use unsigned long here? I thought the state[i] is unsigned int.
> +
> +	memcpy(&crng->state[0], "expand 32-byte k", 16);
> +	for (i = 4; i < 16; i++) {
> +		if (!arch_get_random_seed_long(&rv) &&
> +		    !arch_get_random_long(&rv))
> +			rv = random_get_entropy();
> +		crng->state[i] ^= rv;
> +	}
> +	crng->init_time = jiffies - CRNG_RESEED_INTERVAL;

Would it make sense to add the ChaCha20 self test vectors from RFC7539 here to 
test that the ChaCha20 works?

> +}
> +
> +static void initialize_crng(struct crng_state *crng)
> +{
> +	_initialize_crng(crng);
> +	spin_lock_init(&crng->lock);
> +}
> +
> +static int crng_fast_load(__u32 pool[4])
> +{
> +	int	i;
> +	__u32	*p;
> +
> +	if (!spin_trylock(&primary_crng.lock))
> +		return 0;
> +	if (crng_ready()) {
> +		spin_unlock(&primary_crng.lock);
> +		return 0;
> +	}
> +	p = &primary_crng.state[4];
> +	if (crng_init == 1)
> +		p += 4;
> +	for (i=0; i < 4; i++)
> +		*p ^= pool[i];
> +	if (crng_init++ >= 2)
> +		wake_up_interruptible(&crng_init_wait);

Don't we have a race here with the crng_init < 3 check in crng_reseed 
considering multi-core systems?

> +	pr_notice("random: crng_init %d\n", crng_init);
> +	spin_unlock(&primary_crng.lock);
> +	return 1;
> +}
> +
> +/* Returns 1 on success */
> +static int crng_reseed(struct entropy_store *r)
> +{
> +	unsigned long	flags;
> +	int		ret = 0;
> +	int		i, num, num_words;
> +	__u32		tmp[16];
> +
> +	spin_lock_irqsave(&primary_crng.lock, flags);
> +	num = extract_entropy(r, tmp, 32, 16, 0);
> +	if (num == 0)
> +		goto out;
> +	if (num < 16 || num > 32) {
> +		WARN_ON(1);
> +		pr_err("crng_reseed: num is %d?!?\n", num);
> +	}
> +	num_words = (num + 3) / 4;
> +	for (i = 0; i < num_words; i++)
> +		primary_crng.state[i+4] ^= tmp[i];
> +	primary_crng.init_time = jiffies;
> +	if (crng_init < 3) {

Shouldn't that one be if (crng_init < 3 && num >= 16) ?

> +		crng_init = 3;
> +		process_random_ready_list();
> +		wake_up_interruptible(&crng_init_wait);
> +		pr_notice("random: crng_init 3\n");

Would it make sense to be more descriptive here to allow readers of dmesg to 
understand the output?

> +	}
> +	ret = 1;
> +out:
> +	spin_unlock_irqrestore(&primary_crng.lock, flags);

memzero_explicit of tmp?

> +	return ret;
> +}
> +
> +static inline void crng_wait_ready(void)
> +{
> +	wait_event_interruptible(crng_init_wait, crng_ready());
> +}
> +
> +static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE])
> +{
> +	unsigned long v, flags;
> +	struct crng_state *crng = &primary_crng;
> +
> +	if (crng_init > 2 &&
> +	    time_after(jiffies, crng->init_time + CRNG_RESEED_INTERVAL))
> +		crng_reseed(&input_pool);
> +	spin_lock_irqsave(&crng->lock, flags);
> +	if (arch_get_random_long(&v))
> +		crng->state[14] ^= v;

Again, unsigned int?

What is the purpose to only cover the 2nd 32 bit value of the nonce with 
arch_get_random?

> +	chacha20_block(&crng->state[0], out);
> +	if (crng->state[12] == 0)
> +		crng->state[13]++;

state[12]++? Or why do you increment the nonce?

> +	spin_unlock_irqrestore(&crng->lock, flags);
> +}
> +
> +static ssize_t extract_crng_user(void __user *buf, size_t nbytes)
> +{
> +	ssize_t ret = 0, i;
> +	__u8 tmp[CHACHA20_BLOCK_SIZE];
> +	int large_request = (nbytes > 256);
> +
> +	while (nbytes) {
> +		if (large_request && need_resched()) {
> +			if (signal_pending(current)) {
> +				if (ret == 0)
> +					ret = -ERESTARTSYS;
> +				break;
> +			}
> +			schedule();
> +		}
> +
> +		extract_crng(tmp);

Now I have to wear my (ugly) FIPS heat: we need that code from the current 
implementation here:

                if (fips_enabled) {
                        spin_lock_irqsave(&r->lock, flags);
                        if (!memcmp(tmp, r->last_data, EXTRACT_SIZE))
                                panic("Hardware RNG duplicated output!\n");
                        memcpy(r->last_data, tmp, EXTRACT_SIZE);
                        spin_unlock_irqrestore(&r->lock, flags);
                }


> +		i = min_t(int, nbytes, CHACHA20_BLOCK_SIZE);
> +		if (copy_to_user(buf, tmp, i)) {
> +			ret = -EFAULT;
> +			break;
> +		}
> +
> +		nbytes -= i;
> +		buf += i;
> +		ret += i;
> +	}
> +
> +	/* Wipe data just written to memory */
> +	memzero_explicit(tmp, sizeof(tmp));
> +
> +	return ret;
> +}
> +
> +
> +/*********************************************************************
> + *
>   * Entropy input management
>   *
>   *********************************************************************/
> @@ -749,12 +895,12 @@ struct timer_rand_state {
>  #define INIT_TIMER_RAND_STATE { INITIAL_JIFFIES, };
> 
>  /*
> - * Add device- or boot-specific data to the input and nonblocking
> - * pools to help initialize them to unique values.
> + * Add device- or boot-specific data to the input pool to help
> + * initialize it.
>   *
> - * None of this adds any entropy, it is meant to avoid the
> - * problem of the nonblocking pool having similar initial state
> - * across largely identical devices.
> + * None of this adds any entropy; it is meant to avoid the problem of
> + * the entropy pool having similar initial state across largely
> + * identical devices.
>   */
>  void add_device_randomness(const void *buf, unsigned int size)
>  {
> @@ -766,11 +912,6 @@ void add_device_randomness(const void *buf, unsigned
> int size) _mix_pool_bytes(&input_pool, buf, size);
>  	_mix_pool_bytes(&input_pool, &time, sizeof(time));
>  	spin_unlock_irqrestore(&input_pool.lock, flags);
> -
> -	spin_lock_irqsave(&nonblocking_pool.lock, flags);
> -	_mix_pool_bytes(&nonblocking_pool, buf, size);
> -	_mix_pool_bytes(&nonblocking_pool, &time, sizeof(time));
> -	spin_unlock_irqrestore(&nonblocking_pool.lock, flags);
>  }
>  EXPORT_SYMBOL(add_device_randomness);
> 
> @@ -801,7 +942,7 @@ static void add_timer_randomness(struct timer_rand_state
> *state, unsigned num) sample.jiffies = jiffies;
>  	sample.cycles = random_get_entropy();
>  	sample.num = num;
> -	r = nonblocking_pool.initialized ? &input_pool : &nonblocking_pool;
> +	r = &input_pool;
>  	mix_pool_bytes(r, &sample, sizeof(sample));
> 
>  	/*
> @@ -921,7 +1062,13 @@ void add_interrupt_randomness(int irq, int irq_flags)
>  	    !time_after(now, fast_pool->last + HZ))
>  		return;
> 
> -	r = nonblocking_pool.initialized ? &input_pool : &nonblocking_pool;
> +	if (!crng_ready() && crng_fast_load(fast_pool->pool)) {
> +		fast_pool->count = 0;
> +		fast_pool->last = now;
> +		return;
> +	}
> +
> +	r = &input_pool;
>  	if (!spin_trylock(&r->lock))
>  		return;
> 
> @@ -964,9 +1111,6 @@ EXPORT_SYMBOL_GPL(add_disk_randomness);
>   *
>   *********************************************************************/
> 
> -static ssize_t extract_entropy(struct entropy_store *r, void *buf,
> -			       size_t nbytes, int min, int rsvd);
> -
>  /*
>   * This utility inline function is responsible for transferring entropy
>   * from the primary pool to the secondary extraction pool. We make
> @@ -1252,15 +1396,26 @@ static ssize_t extract_entropy_user(struct
> entropy_store *r, void __user *buf, */
>  void get_random_bytes(void *buf, int nbytes)
>  {
> +	__u8 tmp[CHACHA20_BLOCK_SIZE];
> +
>  #if DEBUG_RANDOM_BOOT > 0
> -	if (unlikely(nonblocking_pool.initialized == 0))
> +	if (!crng_ready())
>  		printk(KERN_NOTICE "random: %pF get_random_bytes called "
> -		       "with %d bits of entropy available\n",
> -		       (void *) _RET_IP_,
> -		       nonblocking_pool.entropy_total);
> +		       "with crng_init = %d\n", (void *) _RET_IP_, crng_init);
>  #endif
>  	trace_get_random_bytes(nbytes, _RET_IP_);
> -	extract_entropy(&nonblocking_pool, buf, nbytes, 0, 0);
> +
> +	while (nbytes >= CHACHA20_BLOCK_SIZE) {
> +		extract_crng(buf);
> +		buf += CHACHA20_BLOCK_SIZE;
> +		nbytes -= CHACHA20_BLOCK_SIZE;
> +	}
> +
> +	if (nbytes > 0) {
> +		extract_crng(tmp);
> +		memcpy(buf, tmp, nbytes);
> +		memzero_explicit(tmp, nbytes);
> +	}
>  }
>  EXPORT_SYMBOL(get_random_bytes);
> 
> @@ -1278,7 +1433,7 @@ int add_random_ready_callback(struct
> random_ready_callback *rdy) unsigned long flags;
>  	int err = -EALREADY;
> 
> -	if (likely(nonblocking_pool.initialized))
> +	if (crng_ready())
>  		return err;
> 
>  	owner = rdy->owner;
> @@ -1286,7 +1441,7 @@ int add_random_ready_callback(struct
> random_ready_callback *rdy) return -ENOENT;
> 
>  	spin_lock_irqsave(&random_ready_list_lock, flags);
> -	if (nonblocking_pool.initialized)
> +	if (crng_ready())
>  		goto out;
> 
>  	owner = NULL;
> @@ -1350,7 +1505,7 @@ void get_random_bytes_arch(void *buf, int nbytes)
>  	}
> 
>  	if (nbytes)
> -		extract_entropy(&nonblocking_pool, p, nbytes, 0, 0);
> +		get_random_bytes(p, nbytes);
>  }
>  EXPORT_SYMBOL(get_random_bytes_arch);
> 
> @@ -1395,7 +1550,7 @@ static int rand_initialize(void)
>  {
>  	init_std_data(&input_pool);
>  	init_std_data(&blocking_pool);
> -	init_std_data(&nonblocking_pool);
> +	_initialize_crng(&primary_crng);
>  	return 0;
>  }
>  early_initcall(rand_initialize);
> @@ -1459,16 +1614,10 @@ urandom_read(struct file *file, char __user *buf,
> size_t nbytes, loff_t *ppos) {
>  	int ret;
> 
> -	if (unlikely(nonblocking_pool.initialized == 0))
> -		printk_once(KERN_NOTICE "random: %s urandom read "
> -			    "with %d bits of entropy available\n",
> -			    current->comm, nonblocking_pool.entropy_total);
> -
> +	crng_wait_ready();

Just for clarification: are you now blocking /dev/urandom until the CRNG is 
filled? That would be a big win.

>  	nbytes = min_t(size_t, nbytes, INT_MAX >> (ENTROPY_SHIFT + 3));
> -	ret = extract_entropy_user(&nonblocking_pool, buf, nbytes);
> -
> -	trace_urandom_read(8 * nbytes, ENTROPY_BITS(&nonblocking_pool),
> -			   ENTROPY_BITS(&input_pool));
> +	ret = extract_crng_user(buf, nbytes);
> +	trace_urandom_read(8 * nbytes, 0, ENTROPY_BITS(&input_pool));
>  	return ret;
>  }
> 
> @@ -1514,10 +1663,7 @@ static ssize_t random_write(struct file *file, const
> char __user *buffer, {
>  	size_t ret;
> 
> -	ret = write_pool(&blocking_pool, buffer, count);
> -	if (ret)
> -		return ret;
> -	ret = write_pool(&nonblocking_pool, buffer, count);
> +	ret = write_pool(&input_pool, buffer, count);
>  	if (ret)
>  		return ret;
> 
> @@ -1568,7 +1714,6 @@ static long random_ioctl(struct file *f, unsigned int
> cmd, unsigned long arg) if (!capable(CAP_SYS_ADMIN))
>  			return -EPERM;
>  		input_pool.entropy_count = 0;
> -		nonblocking_pool.entropy_count = 0;
>  		blocking_pool.entropy_count = 0;
>  		return 0;
>  	default:
> @@ -1610,11 +1755,10 @@ SYSCALL_DEFINE3(getrandom, char __user *, buf,
> size_t, count, if (flags & GRND_RANDOM)
>  		return _random_read(flags & GRND_NONBLOCK, buf, count);
> 
> -	if (unlikely(nonblocking_pool.initialized == 0)) {
> +	if (!crng_ready()) {
>  		if (flags & GRND_NONBLOCK)
>  			return -EAGAIN;
> -		wait_event_interruptible(urandom_init_wait,
> -					 nonblocking_pool.initialized);
> +		crng_wait_ready();
>  		if (signal_pending(current))
>  			return -ERESTARTSYS;
>  	}
> diff --git a/include/crypto/chacha20.h b/include/crypto/chacha20.h
> index 274bbae..20d20f68 100644
> --- a/include/crypto/chacha20.h
> +++ b/include/crypto/chacha20.h
> @@ -16,6 +16,7 @@ struct chacha20_ctx {
>  	u32 key[8];
>  };
> 
> +void chacha20_block(u32 *state, void *stream);
>  void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv);
>  int crypto_chacha20_setkey(struct crypto_tfm *tfm, const u8 *key,
>  			   unsigned int keysize);
> diff --git a/lib/Makefile b/lib/Makefile
> index 7bd6fd4..9ba27cd 100644
> --- a/lib/Makefile
> +++ b/lib/Makefile
> @@ -22,7 +22,7 @@ KCOV_INSTRUMENT_hweight.o := n
>  lib-y := ctype.o string.o vsprintf.o cmdline.o \
>  	 rbtree.o radix-tree.o dump_stack.o timerqueue.o\
>  	 idr.o int_sqrt.o extable.o \
> -	 sha1.o md5.o irq_regs.o argv_split.o \
> +	 sha1.o chacha20.o md5.o irq_regs.o argv_split.o \
>  	 proportions.o flex_proportions.o ratelimit.o show_mem.o \
>  	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
>  	 earlycpio.o seq_buf.o nmi_backtrace.o
> diff --git a/lib/chacha20.c b/lib/chacha20.c
> new file mode 100644
> index 0000000..250ceed
> --- /dev/null
> +++ b/lib/chacha20.c
> @@ -0,0 +1,79 @@
> +/*
> + * ChaCha20 256-bit cipher algorithm, RFC7539
> + *
> + * Copyright (C) 2015 Martin Willi
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/export.h>
> +#include <linux/bitops.h>
> +#include <linux/cryptohash.h>
> +#include <asm/unaligned.h>
> +#include <crypto/chacha20.h>
> +
> +static inline u32 rotl32(u32 v, u8 n)
> +{
> +	return (v << n) | (v >> (sizeof(v) * 8 - n));
> +}
> +
> +extern void chacha20_block(u32 *state, void *stream)
> +{
> +	u32 x[16], *out = stream;
> +	int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(x); i++)
> +		x[i] = state[i];
> +
> +	for (i = 0; i < 20; i += 2) {
> +		x[0]  += x[4];    x[12] = rotl32(x[12] ^ x[0],  16);
> +		x[1]  += x[5];    x[13] = rotl32(x[13] ^ x[1],  16);
> +		x[2]  += x[6];    x[14] = rotl32(x[14] ^ x[2],  16);
> +		x[3]  += x[7];    x[15] = rotl32(x[15] ^ x[3],  16);
> +
> +		x[8]  += x[12];   x[4]  = rotl32(x[4]  ^ x[8],  12);
> +		x[9]  += x[13];   x[5]  = rotl32(x[5]  ^ x[9],  12);
> +		x[10] += x[14];   x[6]  = rotl32(x[6]  ^ x[10], 12);
> +		x[11] += x[15];   x[7]  = rotl32(x[7]  ^ x[11], 12);
> +
> +		x[0]  += x[4];    x[12] = rotl32(x[12] ^ x[0],   8);
> +		x[1]  += x[5];    x[13] = rotl32(x[13] ^ x[1],   8);
> +		x[2]  += x[6];    x[14] = rotl32(x[14] ^ x[2],   8);
> +		x[3]  += x[7];    x[15] = rotl32(x[15] ^ x[3],   8);
> +
> +		x[8]  += x[12];   x[4]  = rotl32(x[4]  ^ x[8],   7);
> +		x[9]  += x[13];   x[5]  = rotl32(x[5]  ^ x[9],   7);
> +		x[10] += x[14];   x[6]  = rotl32(x[6]  ^ x[10],  7);
> +		x[11] += x[15];   x[7]  = rotl32(x[7]  ^ x[11],  7);
> +
> +		x[0]  += x[5];    x[15] = rotl32(x[15] ^ x[0],  16);
> +		x[1]  += x[6];    x[12] = rotl32(x[12] ^ x[1],  16);
> +		x[2]  += x[7];    x[13] = rotl32(x[13] ^ x[2],  16);
> +		x[3]  += x[4];    x[14] = rotl32(x[14] ^ x[3],  16);
> +
> +		x[10] += x[15];   x[5]  = rotl32(x[5]  ^ x[10], 12);
> +		x[11] += x[12];   x[6]  = rotl32(x[6]  ^ x[11], 12);
> +		x[8]  += x[13];   x[7]  = rotl32(x[7]  ^ x[8],  12);
> +		x[9]  += x[14];   x[4]  = rotl32(x[4]  ^ x[9],  12);
> +
> +		x[0]  += x[5];    x[15] = rotl32(x[15] ^ x[0],   8);
> +		x[1]  += x[6];    x[12] = rotl32(x[12] ^ x[1],   8);
> +		x[2]  += x[7];    x[13] = rotl32(x[13] ^ x[2],   8);
> +		x[3]  += x[4];    x[14] = rotl32(x[14] ^ x[3],   8);
> +
> +		x[10] += x[15];   x[5]  = rotl32(x[5]  ^ x[10],  7);
> +		x[11] += x[12];   x[6]  = rotl32(x[6]  ^ x[11],  7);
> +		x[8]  += x[13];   x[7]  = rotl32(x[7]  ^ x[8],   7);
> +		x[9]  += x[14];   x[4]  = rotl32(x[4]  ^ x[9],   7);
> +	}
> +
> +	for (i = 0; i < ARRAY_SIZE(x); i++)
> +		out[i] = cpu_to_le32(x[i] + state[i]);
> +
> +	state[12]++;
> +}
> +EXPORT_SYMBOL(chacha20_block);


Ciao
Stephan
--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Stephan Mueller May 3, 2016, 9:36 a.m. UTC | #2
Am Montag, 2. Mai 2016, 02:26:51 schrieb Theodore Ts'o:

Hi Theodore,

One more item.

> The CRNG is faster, and we don't pretend to track entropy usage in the
> CRNG any more.
> 
> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
> ---
>  crypto/chacha20_generic.c |  61 ----------
>  drivers/char/random.c     | 282
> ++++++++++++++++++++++++++++++++++------------ include/crypto/chacha20.h | 
>  1 +
>  lib/Makefile              |   2 +-
>  lib/chacha20.c            |  79 +++++++++++++
>  5 files changed, 294 insertions(+), 131 deletions(-)
>  create mode 100644 lib/chacha20.c
> 
> diff --git a/crypto/chacha20_generic.c b/crypto/chacha20_generic.c
> index da9c899..1cab831 100644
> --- a/crypto/chacha20_generic.c
> +++ b/crypto/chacha20_generic.c
> @@ -15,72 +15,11 @@
>  #include <linux/module.h>
>  #include <crypto/chacha20.h>
> 
> -static inline u32 rotl32(u32 v, u8 n)
> -{
> -	return (v << n) | (v >> (sizeof(v) * 8 - n));
> -}
> -
>  static inline u32 le32_to_cpuvp(const void *p)
>  {
>  	return le32_to_cpup(p);
>  }
> 
> -static void chacha20_block(u32 *state, void *stream)
> -{
> -	u32 x[16], *out = stream;
> -	int i;
> -
> -	for (i = 0; i < ARRAY_SIZE(x); i++)
> -		x[i] = state[i];
> -
> -	for (i = 0; i < 20; i += 2) {
> -		x[0]  += x[4];    x[12] = rotl32(x[12] ^ x[0],  16);
> -		x[1]  += x[5];    x[13] = rotl32(x[13] ^ x[1],  16);
> -		x[2]  += x[6];    x[14] = rotl32(x[14] ^ x[2],  16);
> -		x[3]  += x[7];    x[15] = rotl32(x[15] ^ x[3],  16);
> -
> -		x[8]  += x[12];   x[4]  = rotl32(x[4]  ^ x[8],  12);
> -		x[9]  += x[13];   x[5]  = rotl32(x[5]  ^ x[9],  12);
> -		x[10] += x[14];   x[6]  = rotl32(x[6]  ^ x[10], 12);
> -		x[11] += x[15];   x[7]  = rotl32(x[7]  ^ x[11], 12);
> -
> -		x[0]  += x[4];    x[12] = rotl32(x[12] ^ x[0],   8);
> -		x[1]  += x[5];    x[13] = rotl32(x[13] ^ x[1],   8);
> -		x[2]  += x[6];    x[14] = rotl32(x[14] ^ x[2],   8);
> -		x[3]  += x[7];    x[15] = rotl32(x[15] ^ x[3],   8);
> -
> -		x[8]  += x[12];   x[4]  = rotl32(x[4]  ^ x[8],   7);
> -		x[9]  += x[13];   x[5]  = rotl32(x[5]  ^ x[9],   7);
> -		x[10] += x[14];   x[6]  = rotl32(x[6]  ^ x[10],  7);
> -		x[11] += x[15];   x[7]  = rotl32(x[7]  ^ x[11],  7);
> -
> -		x[0]  += x[5];    x[15] = rotl32(x[15] ^ x[0],  16);
> -		x[1]  += x[6];    x[12] = rotl32(x[12] ^ x[1],  16);
> -		x[2]  += x[7];    x[13] = rotl32(x[13] ^ x[2],  16);
> -		x[3]  += x[4];    x[14] = rotl32(x[14] ^ x[3],  16);
> -
> -		x[10] += x[15];   x[5]  = rotl32(x[5]  ^ x[10], 12);
> -		x[11] += x[12];   x[6]  = rotl32(x[6]  ^ x[11], 12);
> -		x[8]  += x[13];   x[7]  = rotl32(x[7]  ^ x[8],  12);
> -		x[9]  += x[14];   x[4]  = rotl32(x[4]  ^ x[9],  12);
> -
> -		x[0]  += x[5];    x[15] = rotl32(x[15] ^ x[0],   8);
> -		x[1]  += x[6];    x[12] = rotl32(x[12] ^ x[1],   8);
> -		x[2]  += x[7];    x[13] = rotl32(x[13] ^ x[2],   8);
> -		x[3]  += x[4];    x[14] = rotl32(x[14] ^ x[3],   8);
> -
> -		x[10] += x[15];   x[5]  = rotl32(x[5]  ^ x[10],  7);
> -		x[11] += x[12];   x[6]  = rotl32(x[6]  ^ x[11],  7);
> -		x[8]  += x[13];   x[7]  = rotl32(x[7]  ^ x[8],   7);
> -		x[9]  += x[14];   x[4]  = rotl32(x[4]  ^ x[9],   7);
> -	}
> -
> -	for (i = 0; i < ARRAY_SIZE(x); i++)
> -		out[i] = cpu_to_le32(x[i] + state[i]);
> -
> -	state[12]++;
> -}
> -
>  static void chacha20_docrypt(u32 *state, u8 *dst, const u8 *src,
>  			     unsigned int bytes)
>  {
> diff --git a/drivers/char/random.c b/drivers/char/random.c
> index b583e53..95f4451 100644
> --- a/drivers/char/random.c
> +++ b/drivers/char/random.c
> @@ -260,6 +260,7 @@
>  #include <linux/irq.h>
>  #include <linux/syscalls.h>
>  #include <linux/completion.h>
> +#include <crypto/chacha20.h>
> 
>  #include <asm/processor.h>
>  #include <asm/uaccess.h>
> @@ -412,6 +413,15 @@ static struct fasync_struct *fasync;
>  static DEFINE_SPINLOCK(random_ready_list_lock);
>  static LIST_HEAD(random_ready_list);
> 
> +/*
> + * crng_init =  0 --> Uninitialized
> + *		2 --> Initialized
> + *		3 --> Initialized from input_pool
> + */
> +static int crng_init = 0;
> +#define crng_ready() (likely(crng_init >= 2))
> +static void process_random_ready_list(void);
> +
>  /**********************************************************************
>   *
>   * OS independent entropy store.   Here are the functions which handle
> @@ -441,10 +451,13 @@ struct entropy_store {
>  	__u8 last_data[EXTRACT_SIZE];
>  };
> 
> +static ssize_t extract_entropy(struct entropy_store *r, void *buf,
> +			       size_t nbytes, int min, int rsvd);
> +
> +static int crng_reseed(struct entropy_store *r);
>  static void push_to_pool(struct work_struct *work);
>  static __u32 input_pool_data[INPUT_POOL_WORDS];
>  static __u32 blocking_pool_data[OUTPUT_POOL_WORDS];
> -static __u32 nonblocking_pool_data[OUTPUT_POOL_WORDS];
> 
>  static struct entropy_store input_pool = {
>  	.poolinfo = &poolinfo_table[0],
> @@ -465,16 +478,6 @@ static struct entropy_store blocking_pool = {
>  					push_to_pool),
>  };
> 
> -static struct entropy_store nonblocking_pool = {
> -	.poolinfo = &poolinfo_table[1],
> -	.name = "nonblocking",
> -	.pull = &input_pool,
> -	.lock = __SPIN_LOCK_UNLOCKED(nonblocking_pool.lock),
> -	.pool = nonblocking_pool_data,
> -	.push_work = __WORK_INITIALIZER(nonblocking_pool.push_work,
> -					push_to_pool),
> -};
> -
>  static __u32 const twist_table[8] = {
>  	0x00000000, 0x3b6e20c8, 0x76dc4190, 0x4db26158,
>  	0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278 };
> @@ -677,12 +680,6 @@ retry:
>  	if (!r->initialized && r->entropy_total > 128) {
>  		r->initialized = 1;
>  		r->entropy_total = 0;
> -		if (r == &nonblocking_pool) {
> -			prandom_reseed_late();
> -			process_random_ready_list();
> -			wake_up_all(&urandom_init_wait);
> -			pr_notice("random: %s pool is initialized\n", r-
>name);
> -		}
>  	}
> 
>  	trace_credit_entropy_bits(r->name, nbits,
> @@ -692,30 +689,27 @@ retry:
>  	if (r == &input_pool) {
>  		int entropy_bits = entropy_count >> ENTROPY_SHIFT;
> 
> +		if (crng_init < 3 && entropy_bits >= 128) {
> +			(void) crng_reseed(r);
> +			entropy_bits = r->entropy_count >> ENTROPY_SHIFT;
> +		}
> +
>  		/* should we wake readers? */
>  		if (entropy_bits >= random_read_wakeup_bits) {
>  			wake_up_interruptible(&random_read_wait);
>  			kill_fasync(&fasync, SIGIO, POLL_IN);
>  		}
>  		/* If the input pool is getting full, send some
> -		 * entropy to the two output pools, flipping back and
> -		 * forth between them, until the output pools are 75%
> -		 * full.
> +		 * entropy to the blocking pool until it is 75% full.
>  		 */
>  		if (entropy_bits > random_write_wakeup_bits &&
>  		    r->initialized &&
>  		    r->entropy_total >= 2*random_read_wakeup_bits) {
> -			static struct entropy_store *last = &blocking_pool;
>  			struct entropy_store *other = &blocking_pool;
> 
> -			if (last == &blocking_pool)
> -				other = &nonblocking_pool;
>  			if (other->entropy_count <=
> -			    3 * other->poolinfo->poolfracbits / 4)
> -				last = other;
> -			if (last->entropy_count <=
> -			    3 * last->poolinfo->poolfracbits / 4) {
> -				schedule_work(&last->push_work);
> +			    3 * other->poolinfo->poolfracbits / 4) {
> +				schedule_work(&other->push_work);
>  				r->entropy_total = 0;
>  			}
>  		}
> @@ -735,6 +729,158 @@ static void credit_entropy_bits_safe(struct
> entropy_store *r, int nbits)
> 
>  /*********************************************************************
>   *
> + * CRNG using CHACHA20
> + *
> + *********************************************************************/
> +
> +#define CRNG_RESEED_INTERVAL (300*HZ)
> +
> +struct crng_state {
> +	__u32		state[16];
> +	unsigned long	init_time;
> +	spinlock_t	lock;
> +};
> +
> +struct crng_state primary_crng = {
> +	.lock = __SPIN_LOCK_UNLOCKED(primary_crng.lock),
> +};
> +static DECLARE_WAIT_QUEUE_HEAD(crng_init_wait);
> +
> +static void _initialize_crng(struct crng_state *crng)
> +{
> +	int		i;
> +	unsigned long	rv;
> +
> +	memcpy(&crng->state[0], "expand 32-byte k", 16);
> +	for (i = 4; i < 16; i++) {
> +		if (!arch_get_random_seed_long(&rv) &&
> +		    !arch_get_random_long(&rv))
> +			rv = random_get_entropy();
> +		crng->state[i] ^= rv;
> +	}
> +	crng->init_time = jiffies - CRNG_RESEED_INTERVAL;
> +}
> +
> +static void initialize_crng(struct crng_state *crng)
> +{
> +	_initialize_crng(crng);
> +	spin_lock_init(&crng->lock);
> +}
> +
> +static int crng_fast_load(__u32 pool[4])
> +{
> +	int	i;
> +	__u32	*p;
> +
> +	if (!spin_trylock(&primary_crng.lock))
> +		return 0;
> +	if (crng_ready()) {
> +		spin_unlock(&primary_crng.lock);
> +		return 0;
> +	}
> +	p = &primary_crng.state[4];
> +	if (crng_init == 1)
> +		p += 4;
> +	for (i=0; i < 4; i++)
> +		*p ^= pool[i];
> +	if (crng_init++ >= 2)
> +		wake_up_interruptible(&crng_init_wait);
> +	pr_notice("random: crng_init %d\n", crng_init);
> +	spin_unlock(&primary_crng.lock);
> +	return 1;
> +}
> +
> +/* Returns 1 on success */
> +static int crng_reseed(struct entropy_store *r)
> +{
> +	unsigned long	flags;
> +	int		ret = 0;
> +	int		i, num, num_words;
> +	__u32		tmp[16];
> +
> +	spin_lock_irqsave(&primary_crng.lock, flags);
> +	num = extract_entropy(r, tmp, 32, 16, 0);
> +	if (num == 0)
> +		goto out;
> +	if (num < 16 || num > 32) {
> +		WARN_ON(1);
> +		pr_err("crng_reseed: num is %d?!?\n", num);
> +	}
> +	num_words = (num + 3) / 4;
> +	for (i = 0; i < num_words; i++)
> +		primary_crng.state[i+4] ^= tmp[i];
> +	primary_crng.init_time = jiffies;
> +	if (crng_init < 3) {
> +		crng_init = 3;
> +		process_random_ready_list();
> +		wake_up_interruptible(&crng_init_wait);
> +		pr_notice("random: crng_init 3\n");
> +	}
> +	ret = 1;
> +out:
> +	spin_unlock_irqrestore(&primary_crng.lock, flags);
> +	return ret;
> +}
> +
> +static inline void crng_wait_ready(void)
> +{
> +	wait_event_interruptible(crng_init_wait, crng_ready());
> +}
> +
> +static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE])
> +{
> +	unsigned long v, flags;
> +	struct crng_state *crng = &primary_crng;
> +
> +	if (crng_init > 2 &&
> +	    time_after(jiffies, crng->init_time + CRNG_RESEED_INTERVAL))
> +		crng_reseed(&input_pool);
> +	spin_lock_irqsave(&crng->lock, flags);
> +	if (arch_get_random_long(&v))
> +		crng->state[14] ^= v;
> +	chacha20_block(&crng->state[0], out);
> +	if (crng->state[12] == 0)
> +		crng->state[13]++;
> +	spin_unlock_irqrestore(&crng->lock, flags);
> +}
> +
> +static ssize_t extract_crng_user(void __user *buf, size_t nbytes)
> +{
> +	ssize_t ret = 0, i;
> +	__u8 tmp[CHACHA20_BLOCK_SIZE];
> +	int large_request = (nbytes > 256);
> +
> +	while (nbytes) {
> +		if (large_request && need_resched()) {
> +			if (signal_pending(current)) {
> +				if (ret == 0)
> +					ret = -ERESTARTSYS;
> +				break;
> +			}
> +			schedule();
> +		}
> +
> +		extract_crng(tmp);
> +		i = min_t(int, nbytes, CHACHA20_BLOCK_SIZE);
> +		if (copy_to_user(buf, tmp, i)) {
> +			ret = -EFAULT;
> +			break;
> +		}
> +
> +		nbytes -= i;
> +		buf += i;
> +		ret += i;
> +	}
> +
> +	/* Wipe data just written to memory */
> +	memzero_explicit(tmp, sizeof(tmp));


Would it make sense to add another chacha20_block() call here at the end? 
Note, the one thing about the SP800-90A DRBG I really like is the enhanced 
backward secrecy support which is implemented by "updating" the internal state 
(the key / state) used for one or more random number generation rounds after 
one request for random numbers is satisfied.

This means that even if the state becomes known or the subsequent caller 
manages to deduce the state of the RNG to some degree of confidence, he cannot 
backtrack the already generated random numbers.

I see that the ChaCha20 RNG implicitly updates its state while it operates. 
But for the last round of the RNG, there is no more shuffling of the internal 
state. As one round is 64 bytes in size and many callers just want 16 or 32 
bytes (as seen during testing), a lot of callers trigger only one round of the 
RNG.


> +
> +	return ret;
> +}
> +
> +
> +/*********************************************************************
> + *
>   * Entropy input management
>   *
>   *********************************************************************/
> @@ -749,12 +895,12 @@ struct timer_rand_state {
>  #define INIT_TIMER_RAND_STATE { INITIAL_JIFFIES, };
> 
>  /*
> - * Add device- or boot-specific data to the input and nonblocking
> - * pools to help initialize them to unique values.
> + * Add device- or boot-specific data to the input pool to help
> + * initialize it.
>   *
> - * None of this adds any entropy, it is meant to avoid the
> - * problem of the nonblocking pool having similar initial state
> - * across largely identical devices.
> + * None of this adds any entropy; it is meant to avoid the problem of
> + * the entropy pool having similar initial state across largely
> + * identical devices.
>   */
>  void add_device_randomness(const void *buf, unsigned int size)
>  {
> @@ -766,11 +912,6 @@ void add_device_randomness(const void *buf, unsigned
> int size) _mix_pool_bytes(&input_pool, buf, size);
>  	_mix_pool_bytes(&input_pool, &time, sizeof(time));
>  	spin_unlock_irqrestore(&input_pool.lock, flags);
> -
> -	spin_lock_irqsave(&nonblocking_pool.lock, flags);
> -	_mix_pool_bytes(&nonblocking_pool, buf, size);
> -	_mix_pool_bytes(&nonblocking_pool, &time, sizeof(time));
> -	spin_unlock_irqrestore(&nonblocking_pool.lock, flags);
>  }
>  EXPORT_SYMBOL(add_device_randomness);
> 
> @@ -801,7 +942,7 @@ static void add_timer_randomness(struct timer_rand_state
> *state, unsigned num) sample.jiffies = jiffies;
>  	sample.cycles = random_get_entropy();
>  	sample.num = num;
> -	r = nonblocking_pool.initialized ? &input_pool : &nonblocking_pool;
> +	r = &input_pool;
>  	mix_pool_bytes(r, &sample, sizeof(sample));
> 
>  	/*
> @@ -921,7 +1062,13 @@ void add_interrupt_randomness(int irq, int irq_flags)
>  	    !time_after(now, fast_pool->last + HZ))
>  		return;
> 
> -	r = nonblocking_pool.initialized ? &input_pool : &nonblocking_pool;
> +	if (!crng_ready() && crng_fast_load(fast_pool->pool)) {
> +		fast_pool->count = 0;
> +		fast_pool->last = now;
> +		return;
> +	}
> +
> +	r = &input_pool;
>  	if (!spin_trylock(&r->lock))
>  		return;
> 
> @@ -964,9 +1111,6 @@ EXPORT_SYMBOL_GPL(add_disk_randomness);
>   *
>   *********************************************************************/
> 
> -static ssize_t extract_entropy(struct entropy_store *r, void *buf,
> -			       size_t nbytes, int min, int rsvd);
> -
>  /*
>   * This utility inline function is responsible for transferring entropy
>   * from the primary pool to the secondary extraction pool. We make
> @@ -1252,15 +1396,26 @@ static ssize_t extract_entropy_user(struct
> entropy_store *r, void __user *buf, */
>  void get_random_bytes(void *buf, int nbytes)
>  {
> +	__u8 tmp[CHACHA20_BLOCK_SIZE];
> +
>  #if DEBUG_RANDOM_BOOT > 0
> -	if (unlikely(nonblocking_pool.initialized == 0))
> +	if (!crng_ready())
>  		printk(KERN_NOTICE "random: %pF get_random_bytes called "
> -		       "with %d bits of entropy available\n",
> -		       (void *) _RET_IP_,
> -		       nonblocking_pool.entropy_total);
> +		       "with crng_init = %d\n", (void *) _RET_IP_, crng_init);
>  #endif
>  	trace_get_random_bytes(nbytes, _RET_IP_);
> -	extract_entropy(&nonblocking_pool, buf, nbytes, 0, 0);
> +
> +	while (nbytes >= CHACHA20_BLOCK_SIZE) {
> +		extract_crng(buf);
> +		buf += CHACHA20_BLOCK_SIZE;
> +		nbytes -= CHACHA20_BLOCK_SIZE;
> +	}
> +
> +	if (nbytes > 0) {
> +		extract_crng(tmp);
> +		memcpy(buf, tmp, nbytes);
> +		memzero_explicit(tmp, nbytes);
> +	}

dto here.

>  }
>  EXPORT_SYMBOL(get_random_bytes);
> 
> @@ -1278,7 +1433,7 @@ int add_random_ready_callback(struct
> random_ready_callback *rdy) unsigned long flags;
>  	int err = -EALREADY;
> 
> -	if (likely(nonblocking_pool.initialized))
> +	if (crng_ready())
>  		return err;
> 
>  	owner = rdy->owner;
> @@ -1286,7 +1441,7 @@ int add_random_ready_callback(struct
> random_ready_callback *rdy) return -ENOENT;
> 
>  	spin_lock_irqsave(&random_ready_list_lock, flags);
> -	if (nonblocking_pool.initialized)
> +	if (crng_ready())
>  		goto out;
> 
>  	owner = NULL;
> @@ -1350,7 +1505,7 @@ void get_random_bytes_arch(void *buf, int nbytes)
>  	}
> 
>  	if (nbytes)
> -		extract_entropy(&nonblocking_pool, p, nbytes, 0, 0);
> +		get_random_bytes(p, nbytes);
>  }
>  EXPORT_SYMBOL(get_random_bytes_arch);
> 
> @@ -1395,7 +1550,7 @@ static int rand_initialize(void)
>  {
>  	init_std_data(&input_pool);
>  	init_std_data(&blocking_pool);
> -	init_std_data(&nonblocking_pool);
> +	_initialize_crng(&primary_crng);
>  	return 0;
>  }
>  early_initcall(rand_initialize);
> @@ -1459,16 +1614,10 @@ urandom_read(struct file *file, char __user *buf,
> size_t nbytes, loff_t *ppos) {
>  	int ret;
> 
> -	if (unlikely(nonblocking_pool.initialized == 0))
> -		printk_once(KERN_NOTICE "random: %s urandom read "
> -			    "with %d bits of entropy available\n",
> -			    current->comm, nonblocking_pool.entropy_total);
> -
> +	crng_wait_ready();
>  	nbytes = min_t(size_t, nbytes, INT_MAX >> (ENTROPY_SHIFT + 3));
> -	ret = extract_entropy_user(&nonblocking_pool, buf, nbytes);
> -
> -	trace_urandom_read(8 * nbytes, ENTROPY_BITS(&nonblocking_pool),
> -			   ENTROPY_BITS(&input_pool));
> +	ret = extract_crng_user(buf, nbytes);
> +	trace_urandom_read(8 * nbytes, 0, ENTROPY_BITS(&input_pool));
>  	return ret;
>  }
> 
> @@ -1514,10 +1663,7 @@ static ssize_t random_write(struct file *file, const
> char __user *buffer, {
>  	size_t ret;
> 
> -	ret = write_pool(&blocking_pool, buffer, count);
> -	if (ret)
> -		return ret;
> -	ret = write_pool(&nonblocking_pool, buffer, count);
> +	ret = write_pool(&input_pool, buffer, count);
>  	if (ret)
>  		return ret;
> 
> @@ -1568,7 +1714,6 @@ static long random_ioctl(struct file *f, unsigned int
> cmd, unsigned long arg) if (!capable(CAP_SYS_ADMIN))
>  			return -EPERM;
>  		input_pool.entropy_count = 0;
> -		nonblocking_pool.entropy_count = 0;
>  		blocking_pool.entropy_count = 0;
>  		return 0;
>  	default:
> @@ -1610,11 +1755,10 @@ SYSCALL_DEFINE3(getrandom, char __user *, buf,
> size_t, count, if (flags & GRND_RANDOM)
>  		return _random_read(flags & GRND_NONBLOCK, buf, count);
> 
> -	if (unlikely(nonblocking_pool.initialized == 0)) {
> +	if (!crng_ready()) {
>  		if (flags & GRND_NONBLOCK)
>  			return -EAGAIN;
> -		wait_event_interruptible(urandom_init_wait,
> -					 nonblocking_pool.initialized);
> +		crng_wait_ready();
>  		if (signal_pending(current))
>  			return -ERESTARTSYS;
>  	}
> diff --git a/include/crypto/chacha20.h b/include/crypto/chacha20.h
> index 274bbae..20d20f68 100644
> --- a/include/crypto/chacha20.h
> +++ b/include/crypto/chacha20.h
> @@ -16,6 +16,7 @@ struct chacha20_ctx {
>  	u32 key[8];
>  };
> 
> +void chacha20_block(u32 *state, void *stream);
>  void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv);
>  int crypto_chacha20_setkey(struct crypto_tfm *tfm, const u8 *key,
>  			   unsigned int keysize);
> diff --git a/lib/Makefile b/lib/Makefile
> index 7bd6fd4..9ba27cd 100644
> --- a/lib/Makefile
> +++ b/lib/Makefile
> @@ -22,7 +22,7 @@ KCOV_INSTRUMENT_hweight.o := n
>  lib-y := ctype.o string.o vsprintf.o cmdline.o \
>  	 rbtree.o radix-tree.o dump_stack.o timerqueue.o\
>  	 idr.o int_sqrt.o extable.o \
> -	 sha1.o md5.o irq_regs.o argv_split.o \
> +	 sha1.o chacha20.o md5.o irq_regs.o argv_split.o \
>  	 proportions.o flex_proportions.o ratelimit.o show_mem.o \
>  	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
>  	 earlycpio.o seq_buf.o nmi_backtrace.o
> diff --git a/lib/chacha20.c b/lib/chacha20.c
> new file mode 100644
> index 0000000..250ceed
> --- /dev/null
> +++ b/lib/chacha20.c
> @@ -0,0 +1,79 @@
> +/*
> + * ChaCha20 256-bit cipher algorithm, RFC7539
> + *
> + * Copyright (C) 2015 Martin Willi
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/export.h>
> +#include <linux/bitops.h>
> +#include <linux/cryptohash.h>
> +#include <asm/unaligned.h>
> +#include <crypto/chacha20.h>
> +
> +static inline u32 rotl32(u32 v, u8 n)
> +{
> +	return (v << n) | (v >> (sizeof(v) * 8 - n));
> +}
> +
> +extern void chacha20_block(u32 *state, void *stream)
> +{
> +	u32 x[16], *out = stream;
> +	int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(x); i++)
> +		x[i] = state[i];
> +
> +	for (i = 0; i < 20; i += 2) {
> +		x[0]  += x[4];    x[12] = rotl32(x[12] ^ x[0],  16);
> +		x[1]  += x[5];    x[13] = rotl32(x[13] ^ x[1],  16);
> +		x[2]  += x[6];    x[14] = rotl32(x[14] ^ x[2],  16);
> +		x[3]  += x[7];    x[15] = rotl32(x[15] ^ x[3],  16);
> +
> +		x[8]  += x[12];   x[4]  = rotl32(x[4]  ^ x[8],  12);
> +		x[9]  += x[13];   x[5]  = rotl32(x[5]  ^ x[9],  12);
> +		x[10] += x[14];   x[6]  = rotl32(x[6]  ^ x[10], 12);
> +		x[11] += x[15];   x[7]  = rotl32(x[7]  ^ x[11], 12);
> +
> +		x[0]  += x[4];    x[12] = rotl32(x[12] ^ x[0],   8);
> +		x[1]  += x[5];    x[13] = rotl32(x[13] ^ x[1],   8);
> +		x[2]  += x[6];    x[14] = rotl32(x[14] ^ x[2],   8);
> +		x[3]  += x[7];    x[15] = rotl32(x[15] ^ x[3],   8);
> +
> +		x[8]  += x[12];   x[4]  = rotl32(x[4]  ^ x[8],   7);
> +		x[9]  += x[13];   x[5]  = rotl32(x[5]  ^ x[9],   7);
> +		x[10] += x[14];   x[6]  = rotl32(x[6]  ^ x[10],  7);
> +		x[11] += x[15];   x[7]  = rotl32(x[7]  ^ x[11],  7);
> +
> +		x[0]  += x[5];    x[15] = rotl32(x[15] ^ x[0],  16);
> +		x[1]  += x[6];    x[12] = rotl32(x[12] ^ x[1],  16);
> +		x[2]  += x[7];    x[13] = rotl32(x[13] ^ x[2],  16);
> +		x[3]  += x[4];    x[14] = rotl32(x[14] ^ x[3],  16);
> +
> +		x[10] += x[15];   x[5]  = rotl32(x[5]  ^ x[10], 12);
> +		x[11] += x[12];   x[6]  = rotl32(x[6]  ^ x[11], 12);
> +		x[8]  += x[13];   x[7]  = rotl32(x[7]  ^ x[8],  12);
> +		x[9]  += x[14];   x[4]  = rotl32(x[4]  ^ x[9],  12);
> +
> +		x[0]  += x[5];    x[15] = rotl32(x[15] ^ x[0],   8);
> +		x[1]  += x[6];    x[12] = rotl32(x[12] ^ x[1],   8);
> +		x[2]  += x[7];    x[13] = rotl32(x[13] ^ x[2],   8);
> +		x[3]  += x[4];    x[14] = rotl32(x[14] ^ x[3],   8);
> +
> +		x[10] += x[15];   x[5]  = rotl32(x[5]  ^ x[10],  7);
> +		x[11] += x[12];   x[6]  = rotl32(x[6]  ^ x[11],  7);
> +		x[8]  += x[13];   x[7]  = rotl32(x[7]  ^ x[8],   7);
> +		x[9]  += x[14];   x[4]  = rotl32(x[4]  ^ x[9],   7);
> +	}
> +
> +	for (i = 0; i < ARRAY_SIZE(x); i++)
> +		out[i] = cpu_to_le32(x[i] + state[i]);
> +
> +	state[12]++;
> +}
> +EXPORT_SYMBOL(chacha20_block);


Ciao
Stephan
--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Stephan Mueller May 4, 2016, 6:24 a.m. UTC | #3
Am Dienstag, 3. Mai 2016, 11:36:12 schrieb Stephan Mueller:

Hi Ted,

> > +
> > +static ssize_t extract_crng_user(void __user *buf, size_t nbytes)
> > +{
> > +	ssize_t ret = 0, i;
> > +	__u8 tmp[CHACHA20_BLOCK_SIZE];
> > +	int large_request = (nbytes > 256);
> > +
> > +	while (nbytes) {
> > +		if (large_request && need_resched()) {
> > +			if (signal_pending(current)) {
> > +				if (ret == 0)
> > +					ret = -ERESTARTSYS;
> > +				break;
> > +			}
> > +			schedule();
> > +		}
> > +
> > +		extract_crng(tmp);
> > +		i = min_t(int, nbytes, CHACHA20_BLOCK_SIZE);
> > +		if (copy_to_user(buf, tmp, i)) {
> > +			ret = -EFAULT;
> > +			break;
> > +		}
> > +
> > +		nbytes -= i;
> > +		buf += i;
> > +		ret += i;
> > +	}
> > +
> > +	/* Wipe data just written to memory */
> > +	memzero_explicit(tmp, sizeof(tmp));
> 
> Would it make sense to add another chacha20_block() call here at the end?
> Note, the one thing about the SP800-90A DRBG I really like is the enhanced
> backward secrecy support which is implemented by "updating" the internal
> state (the key / state) used for one or more random number generation
> rounds after one request for random numbers is satisfied.
> 
> This means that even if the state becomes known or the subsequent caller
> manages to deduce the state of the RNG to some degree of confidence, he
> cannot backtrack the already generated random numbers.
> 
> I see that the ChaCha20 RNG implicitly updates its state while it operates.
> But for the last round of the RNG, there is no more shuffling of the
> internal state. As one round is 64 bytes in size and many callers just want
> 16 or 32 bytes (as seen during testing), a lot of callers trigger only one
> round of the RNG.

After doing some performance tests, I see that we reach a performance of north 
of 200 MB/s on my system (compare that to 12 MB/s for the SHA-1 version).

Thus, I would assume adding another call to chacha20_block should not hurt.

Ciao
Stephan
--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeffrey Walton May 4, 2016, 2:40 p.m. UTC | #4
> +static inline u32 rotl32(u32 v, u8 n)
> +{
> +       return (v << n) | (v >> (sizeof(v) * 8 - n));
> +}

That's undefined behavior when n=0.

I think the portable way to do a rotate that avoids UB is the
following. GCC, Clang and ICC recognize the pattern, and emit a rotate
instruction.

    static const unsigned int MASK=31;
    return (v<<n)|(v>>(-n&MASK));

You should also avoid the following because its not constant time due
to the branch:

    return n == 0 ? v : (v << n) | (v >> (sizeof(v) * 8 - n));

Jeff
--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeffrey Walton May 4, 2016, 4:54 p.m. UTC | #5
>> +     chacha20_block(&crng->state[0], out);
>> +     if (crng->state[12] == 0)
>> +             crng->state[13]++;
>
> state[12]++? Or why do you increment the nonce?

In Bernstein's Salsa and ChaCha, the counter is 64-bit. It appears
ChaCha-TLS uses a 32-bit counter, and the other 32-bits is given to
the nonce.

Maybe the first question to ask is, what ChaCha is the kernel
providing? If its ChaCha-TLS, then the carry does not make a lot of
sense.

If the generator is limiting the amount of material under a given set
of security parameters (key and nonce), then the generator will likely
re-key itself long before the 256-GB induced wrap. In this case, it
does not matter which ChaCha the kernel is providing and the carry is
superfluous.

Jeff
--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Theodore Ts'o May 4, 2016, 5:30 p.m. UTC | #6
On Tue, May 03, 2016 at 10:50:25AM +0200, Stephan Mueller wrote:
> > +/*
> > + * crng_init =  0 --> Uninitialized
> > + *		2 --> Initialized
> > + *		3 --> Initialized from input_pool
> > + */
> > +static int crng_init = 0;
> 
> shouldn't that be an atomic_t ?

The crng_init variable only gets incremented (it progresses from
0->1->2->3) and it's protected by the primary_crng->lock spinlock.
There are a few places where we read it without first taking the lock,
but that's where we depend on it getting incremented and where if we
race with another CPU which has just bumped the crng_init status, it's
not critical.  (Or we can take the lock and then recheck the crng_init
if we really need to be sure.)

> > +static void _initialize_crng(struct crng_state *crng)
> > +{
> > +	int		i;
> > +	unsigned long	rv;
> 
> Why do you use unsigned long here? I thought the state[i] is unsigned int.

Because it gets filled in via arch_get_random_seed_long(&rv) and
arch_get_random_log(&rv).  If that means we get 64 bits and we only
use 32 bits, that's OK....

> Would it make sense to add the ChaCha20 self test vectors from RFC7539 here to 
> test that the ChaCha20 works?

We're not doing that for any of the other ciphers and hashes.  We
didn't do that for SHA1, and the chacha20 code where I took this from
didn't check for this as well.  What threat are you most worried
about.  We don't care about chacha20 being exactly chacha20, so long
as it's cryptographically strong.  In fact I think I removed a
potential host order byteswap in the set key operation specifically
because we don't care and interop.

If this is required for FIPS mode, we can add that later.  I was
primarily trying to keep the patch small to be easier for people to
look at it, so I've deliberately left off bells and whistles that
aren't strictly needed to show that the new approach is sound.

> > +	if (crng_init++ >= 2)
> > +		wake_up_interruptible(&crng_init_wait);
> 
> Don't we have a race here with the crng_init < 3 check in crng_reseed 
> considering multi-core systems?

No, because we are holding the primary_crng->lock spinlock.  I'll add
a comment explaining the locking protections which is intended for
crng_init where we declare it.


> > +	if (num < 16 || num > 32) {
> > +		WARN_ON(1);
> > +		pr_err("crng_reseed: num is %d?!?\n", num);
> > +	}
> > +	num_words = (num + 3) / 4;
> > +	for (i = 0; i < num_words; i++)
> > +		primary_crng.state[i+4] ^= tmp[i];
> > +	primary_crng.init_time = jiffies;
> > +	if (crng_init < 3) {
> 
> Shouldn't that one be if (crng_init < 3 && num >= 16) ?

I'll just change the above WRN_ON test to be:

     BUG_ON(num < 16 || num > 32);

This really can't happen, and I had it as a WARN_ON with a printk for
debugging purpose in case I was wrong about how the code works.

> > +		crng_init = 3;
> > +		process_random_ready_list();
> > +		wake_up_interruptible(&crng_init_wait);
> > +		pr_notice("random: crng_init 3\n");
> 
> Would it make sense to be more descriptive here to allow readers of dmesg to 
> understand the output?

Yes, what we're currently printing during the initialization:

random: crng_init 1
random: crng_init 2
random: crng_init 3

was again mostly for debugging purposes.  What I'm thinking about
doing is changing crng_init 2 and 3 messages to be:

random: crng fast init done
random: crng conservative init done

> > +	}
> > +	ret = 1;
> > +out:
> > +	spin_unlock_irqrestore(&primary_crng.lock, flags);
> 
> memzero_explicit of tmp?

Good point, I've added a call to memzero_explicit().

> > +static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE])
> > +{
> > +	unsigned long v, flags;
> > +	struct crng_state *crng = &primary_crng;
> > +
> > +	if (crng_init > 2 &&
> > +	    time_after(jiffies, crng->init_time + CRNG_RESEED_INTERVAL))
> > +		crng_reseed(&input_pool);
> > +	spin_lock_irqsave(&crng->lock, flags);
> > +	if (arch_get_random_long(&v))
> > +		crng->state[14] ^= v;
> > +	chacha20_block(&crng->state[0], out);
> > +	if (crng->state[12] == 0)
> > +		crng->state[13]++;

> What is the purpose to only cover the 2nd 32 bit value of the nonce with 
> arch_get_random?
> 
> state[12]++? Or why do you increment the nonce?

In Chacha20, its output is a funcrtion of the state array, where
state[0..3] is a constant specified by the Chacha20 definition,
state[4..11] is the Key, and state[12..15] is the IV.  The
chacha20_block() function increments state[12] each time it is called,
so state[12] is being used as the block counter.  The increment of
state[13] is used to make state[13] to be the upper 32-bits of the
block counter.  We *should* be reseeding more often than 2**32 calls
to chacha20_block(), and the increment is just to be safe in case
something goes wronng and we're not reseeding.

We're using crng[14] to be contain the output of RDRAND, so this is
where we mix in the contribution from a CPU-level hwrng.  Previously
we called RDRAND multiple times and XOR'ed the results into the
output.  This is a bit faster and is more comforting for paranoiacs
who are concerned that Intel might have down something shifty enough
to be able to read from memory and change the output of RDRAND to
force a particular output from the RNG.

Ware using state[15] to contain the NUMA node id.  This was because
originally the used used the same key bytes (state[4..11]) between
different NUMA state arrays, so state[15] was used to guarantee that
state arrays would be different between different NUMA node state
arrays.  Now that we are deriving the key from a primary_crng, we
could use state[15] for something else, including as a second
destination from RDRAND.

> Now I have to wear my (ugly) FIPS heat: we need that code from the current 
> implementation here:
> 
>                 if (fips_enabled) {
>                         spin_lock_irqsave(&r->lock, flags);
>                         if (!memcmp(tmp, r->last_data, EXTRACT_SIZE))
>                                 panic("Hardware RNG duplicated output!\n");
>                         memcpy(r->last_data, tmp, EXTRACT_SIZE);
>                         spin_unlock_irqrestore(&r->lock, flags);
>                 }
> 

I'll add FIPS support as a separate patch.  I personally consider FIPS
support to be a distraction, and it's only useful for people who need
to sell to governments (mostly US governments).

> > -	if (unlikely(nonblocking_pool.initialized == 0))
> > -		printk_once(KERN_NOTICE "random: %s urandom read "
> > -			    "with %d bits of entropy available\n",
> > -			    current->comm, nonblocking_pool.entropy_total);
> > -
> > +	crng_wait_ready();
> 
> Just for clarification: are you now blocking /dev/urandom until the CRNG is 
> filled? That would be a big win.

Until the the fast init state, yes.  In practice we are blocking until
128 interrupts have occurred, which during boot is hapens quickly
enough that even on a simple KVM system, this happens before userspace
starts up.  There *is* a risk here, though.  Imagine a embedded
platform with very few interrupt-driven devices so device probing
isn't enough to make the CRNG ready when the initial ramdisk starts
up, and the init program tries to read from /dev/urandom --- which
then blocks, potentially indefinitely.

If that happens, then we will have broken userspace, and we may need
to revert this part of the change.  But I'm willing to take the risk,
in hopes that such super-simplisitic devices don't exist in real life,
and if they do, the IOT devices will probably be blithely ignoring
cryptographic concerns so much that they aren't using /dev/urandom
anyway.  :-)

>Would it make sense to add another chacha20_block() call here at the end?
>Note, the one thing about the SP800-90A DRBG I really like is the enhanced
>backward secrecy support which is implemented by "updating" the internal state
>(the key / state) used for one or more random number generation rounds after
>one request for random numbers is satisfied.
>
>This means that even if the state becomes known or the subsequent caller
>manages to deduce the state of the RNG to some degree of confidence, he cannot
>backtrack the already generated random numbers.

That's a good idea; being able to prevent back-tracking attacks is
a good thing.  I'll add this in the next version.

Cheers,

						- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Theodore Ts'o May 4, 2016, 5:49 p.m. UTC | #7
On Wed, May 04, 2016 at 10:40:20AM -0400, Jeffrey Walton wrote:
> > +static inline u32 rotl32(u32 v, u8 n)
> > +{
> > +       return (v << n) | (v >> (sizeof(v) * 8 - n));
> > +}
> 
> That's undefined behavior when n=0.

Sure, but it's never called with n = 0; I've double checked and the
compiler seems to do the right thing with the above pattern as well.

Hmm, it looks like there is a "standard" version rotate left and right
defined in include/linux/bitops.h.  So I suspect it would make sense
to use rol32 as defined in bitops.h --- and this is probably something
that we should do for the rest of crypto/*.c, where people seem to be
defininig their own version of something like rotl32 (I copied the
contents of crypto/chacha20_generic.c to lib/chacha20, so this pattern
of defining one's own version of rol32 isn't new).

> I think the portable way to do a rotate that avoids UB is the
> following. GCC, Clang and ICC recognize the pattern, and emit a rotate
> instruction.
> 
>     static const unsigned int MASK=31;
>     return (v<<n)|(v>>(-n&MASK));
> 
> You should also avoid the following because its not constant time due
> to the branch:
> 
>     return n == 0 ? v : (v << n) | (v >> (sizeof(v) * 8 - n));
> 

Where is this coming from?  I don't see this construct in the patch.

      	      	     	      	    - Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
H. Peter Anvin May 4, 2016, 5:52 p.m. UTC | #8
On May 4, 2016 10:30:41 AM PDT, tytso@mit.edu wrote:
>On Tue, May 03, 2016 at 10:50:25AM +0200, Stephan Mueller wrote:
>> > +/*
>> > + * crng_init =  0 --> Uninitialized
>> > + *		2 --> Initialized
>> > + *		3 --> Initialized from input_pool
>> > + */
>> > +static int crng_init = 0;
>> 
>> shouldn't that be an atomic_t ?
>
>The crng_init variable only gets incremented (it progresses from
>0->1->2->3) and it's protected by the primary_crng->lock spinlock.
>There are a few places where we read it without first taking the lock,
>but that's where we depend on it getting incremented and where if we
>race with another CPU which has just bumped the crng_init status, it's
>not critical.  (Or we can take the lock and then recheck the crng_init
>if we really need to be sure.)
>
>> > +static void _initialize_crng(struct crng_state *crng)
>> > +{
>> > +	int		i;
>> > +	unsigned long	rv;
>> 
>> Why do you use unsigned long here? I thought the state[i] is unsigned
>int.
>
>Because it gets filled in via arch_get_random_seed_long(&rv) and
>arch_get_random_log(&rv).  If that means we get 64 bits and we only
>use 32 bits, that's OK....
>
>> Would it make sense to add the ChaCha20 self test vectors from
>RFC7539 here to 
>> test that the ChaCha20 works?
>
>We're not doing that for any of the other ciphers and hashes.  We
>didn't do that for SHA1, and the chacha20 code where I took this from
>didn't check for this as well.  What threat are you most worried
>about.  We don't care about chacha20 being exactly chacha20, so long
>as it's cryptographically strong.  In fact I think I removed a
>potential host order byteswap in the set key operation specifically
>because we don't care and interop.
>
>If this is required for FIPS mode, we can add that later.  I was
>primarily trying to keep the patch small to be easier for people to
>look at it, so I've deliberately left off bells and whistles that
>aren't strictly needed to show that the new approach is sound.
>
>> > +	if (crng_init++ >= 2)
>> > +		wake_up_interruptible(&crng_init_wait);
>> 
>> Don't we have a race here with the crng_init < 3 check in crng_reseed
>
>> considering multi-core systems?
>
>No, because we are holding the primary_crng->lock spinlock.  I'll add
>a comment explaining the locking protections which is intended for
>crng_init where we declare it.
>
>
>> > +	if (num < 16 || num > 32) {
>> > +		WARN_ON(1);
>> > +		pr_err("crng_reseed: num is %d?!?\n", num);
>> > +	}
>> > +	num_words = (num + 3) / 4;
>> > +	for (i = 0; i < num_words; i++)
>> > +		primary_crng.state[i+4] ^= tmp[i];
>> > +	primary_crng.init_time = jiffies;
>> > +	if (crng_init < 3) {
>> 
>> Shouldn't that one be if (crng_init < 3 && num >= 16) ?
>
>I'll just change the above WRN_ON test to be:
>
>     BUG_ON(num < 16 || num > 32);
>
>This really can't happen, and I had it as a WARN_ON with a printk for
>debugging purpose in case I was wrong about how the code works.
>
>> > +		crng_init = 3;
>> > +		process_random_ready_list();
>> > +		wake_up_interruptible(&crng_init_wait);
>> > +		pr_notice("random: crng_init 3\n");
>> 
>> Would it make sense to be more descriptive here to allow readers of
>dmesg to 
>> understand the output?
>
>Yes, what we're currently printing during the initialization:
>
>random: crng_init 1
>random: crng_init 2
>random: crng_init 3
>
>was again mostly for debugging purposes.  What I'm thinking about
>doing is changing crng_init 2 and 3 messages to be:
>
>random: crng fast init done
>random: crng conservative init done
>
>> > +	}
>> > +	ret = 1;
>> > +out:
>> > +	spin_unlock_irqrestore(&primary_crng.lock, flags);
>> 
>> memzero_explicit of tmp?
>
>Good point, I've added a call to memzero_explicit().
>
>> > +static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE])
>> > +{
>> > +	unsigned long v, flags;
>> > +	struct crng_state *crng = &primary_crng;
>> > +
>> > +	if (crng_init > 2 &&
>> > +	    time_after(jiffies, crng->init_time + CRNG_RESEED_INTERVAL))
>> > +		crng_reseed(&input_pool);
>> > +	spin_lock_irqsave(&crng->lock, flags);
>> > +	if (arch_get_random_long(&v))
>> > +		crng->state[14] ^= v;
>> > +	chacha20_block(&crng->state[0], out);
>> > +	if (crng->state[12] == 0)
>> > +		crng->state[13]++;
>
>> What is the purpose to only cover the 2nd 32 bit value of the nonce
>with 
>> arch_get_random?
>> 
>> state[12]++? Or why do you increment the nonce?
>
>In Chacha20, its output is a funcrtion of the state array, where
>state[0..3] is a constant specified by the Chacha20 definition,
>state[4..11] is the Key, and state[12..15] is the IV.  The
>chacha20_block() function increments state[12] each time it is called,
>so state[12] is being used as the block counter.  The increment of
>state[13] is used to make state[13] to be the upper 32-bits of the
>block counter.  We *should* be reseeding more often than 2**32 calls
>to chacha20_block(), and the increment is just to be safe in case
>something goes wronng and we're not reseeding.
>
>We're using crng[14] to be contain the output of RDRAND, so this is
>where we mix in the contribution from a CPU-level hwrng.  Previously
>we called RDRAND multiple times and XOR'ed the results into the
>output.  This is a bit faster and is more comforting for paranoiacs
>who are concerned that Intel might have down something shifty enough
>to be able to read from memory and change the output of RDRAND to
>force a particular output from the RNG.
>
>Ware using state[15] to contain the NUMA node id.  This was because
>originally the used used the same key bytes (state[4..11]) between
>different NUMA state arrays, so state[15] was used to guarantee that
>state arrays would be different between different NUMA node state
>arrays.  Now that we are deriving the key from a primary_crng, we
>could use state[15] for something else, including as a second
>destination from RDRAND.
>
>> Now I have to wear my (ugly) FIPS heat: we need that code from the
>current 
>> implementation here:
>> 
>>                 if (fips_enabled) {
>>                         spin_lock_irqsave(&r->lock, flags);
>>                         if (!memcmp(tmp, r->last_data, EXTRACT_SIZE))
>>                                 panic("Hardware RNG duplicated
>output!\n");
>>                         memcpy(r->last_data, tmp, EXTRACT_SIZE);
>>                         spin_unlock_irqrestore(&r->lock, flags);
>>                 }
>> 
>
>I'll add FIPS support as a separate patch.  I personally consider FIPS
>support to be a distraction, and it's only useful for people who need
>to sell to governments (mostly US governments).
>
>> > -	if (unlikely(nonblocking_pool.initialized == 0))
>> > -		printk_once(KERN_NOTICE "random: %s urandom read "
>> > -			    "with %d bits of entropy available\n",
>> > -			    current->comm, nonblocking_pool.entropy_total);
>> > -
>> > +	crng_wait_ready();
>> 
>> Just for clarification: are you now blocking /dev/urandom until the
>CRNG is 
>> filled? That would be a big win.
>
>Until the the fast init state, yes.  In practice we are blocking until
>128 interrupts have occurred, which during boot is hapens quickly
>enough that even on a simple KVM system, this happens before userspace
>starts up.  There *is* a risk here, though.  Imagine a embedded
>platform with very few interrupt-driven devices so device probing
>isn't enough to make the CRNG ready when the initial ramdisk starts
>up, and the init program tries to read from /dev/urandom --- which
>then blocks, potentially indefinitely.
>
>If that happens, then we will have broken userspace, and we may need
>to revert this part of the change.  But I'm willing to take the risk,
>in hopes that such super-simplisitic devices don't exist in real life,
>and if they do, the IOT devices will probably be blithely ignoring
>cryptographic concerns so much that they aren't using /dev/urandom
>anyway.  :-)
>
>>Would it make sense to add another chacha20_block() call here at the
>end?
>>Note, the one thing about the SP800-90A DRBG I really like is the
>enhanced
>>backward secrecy support which is implemented by "updating" the
>internal state
>>(the key / state) used for one or more random number generation rounds
>after
>>one request for random numbers is satisfied.
>>
>>This means that even if the state becomes known or the subsequent
>caller
>>manages to deduce the state of the RNG to some degree of confidence,
>he cannot
>>backtrack the already generated random numbers.
>
>That's a good idea; being able to prevent back-tracking attacks is
>a good thing.  I'll add this in the next version.
>
>Cheers,
>
>						- Ted

Why not use arch_get_random*_int()
Jeffrey Walton May 4, 2016, 6:22 p.m. UTC | #9
On Wed, May 4, 2016 at 1:49 PM,  <tytso@mit.edu> wrote:
> On Wed, May 04, 2016 at 10:40:20AM -0400, Jeffrey Walton wrote:
>> > +static inline u32 rotl32(u32 v, u8 n)
>> > +{
>> > +       return (v << n) | (v >> (sizeof(v) * 8 - n));
>> > +}
>>
>> That's undefined behavior when n=0.
>
> Sure, but it's never called with n = 0; I've double checked and the
> compiler seems to do the right thing with the above pattern as well.

> Hmm, it looks like there is a "standard" version rotate left and right
> defined in include/linux/bitops.h.  So I suspect it would make sense
> to use rol32 as defined in bitops.h --- and this is probably something

bitops.h could work in this case, but its not an ideal solution. GCC
does not optimize the code below as expected under all use cases
because GCC does not recognize it as a rotate (see
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57157):

    return (v << n) | (v >> (sizeof(v) * 8 - n));

And outside of special cases like Salsa, ChaCha and BLAKE2, the code
provided in bitops.h suffers UB on arbitrary data. So I think care
needs to be taken when selecting functions from bitops.h.

> that we should do for the rest of crypto/*.c, where people seem to be
> defininig their own version of something like rotl32 (I copied the
> contents of crypto/chacha20_generic.c to lib/chacha20, so this pattern
> of defining one's own version of rol32 isn't new).

Yeah, I kind of thought there was some duplication going on.

But I think bitops.h should be fixed. Many folks don't realize the
lurking UB, and many folks don't realize its not always optimized
well.

>> I think the portable way to do a rotate that avoids UB is the
>> following. GCC, Clang and ICC recognize the pattern, and emit a rotate
>> instruction.
>>
>>     static const unsigned int MASK=31;
>>     return (v<<n)|(v>>(-n&MASK));
>>
>> You should also avoid the following because its not constant time due
>> to the branch:
>>
>>     return n == 0 ? v : (v << n) | (v >> (sizeof(v) * 8 - n));
>>
>
> Where is this coming from?  I don't see this construct in the patch.

My bad... It was a general observation. I've seen folks try to correct
the UB by turning to something like that.

Jeff
--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
H. Peter Anvin May 4, 2016, 6:29 p.m. UTC | #10
On May 4, 2016 11:22:25 AM PDT, Jeffrey Walton <noloader@gmail.com> wrote:
>On Wed, May 4, 2016 at 1:49 PM,  <tytso@mit.edu> wrote:
>> On Wed, May 04, 2016 at 10:40:20AM -0400, Jeffrey Walton wrote:
>>> > +static inline u32 rotl32(u32 v, u8 n)
>>> > +{
>>> > +       return (v << n) | (v >> (sizeof(v) * 8 - n));
>>> > +}
>>>
>>> That's undefined behavior when n=0.
>>
>> Sure, but it's never called with n = 0; I've double checked and the
>> compiler seems to do the right thing with the above pattern as well.
>
>> Hmm, it looks like there is a "standard" version rotate left and
>right
>> defined in include/linux/bitops.h.  So I suspect it would make sense
>> to use rol32 as defined in bitops.h --- and this is probably
>something
>
>bitops.h could work in this case, but its not an ideal solution. GCC
>does not optimize the code below as expected under all use cases
>because GCC does not recognize it as a rotate (see
>http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57157):
>
>    return (v << n) | (v >> (sizeof(v) * 8 - n));
>
>And outside of special cases like Salsa, ChaCha and BLAKE2, the code
>provided in bitops.h suffers UB on arbitrary data. So I think care
>needs to be taken when selecting functions from bitops.h.
>
>> that we should do for the rest of crypto/*.c, where people seem to be
>> defininig their own version of something like rotl32 (I copied the
>> contents of crypto/chacha20_generic.c to lib/chacha20, so this
>pattern
>> of defining one's own version of rol32 isn't new).
>
>Yeah, I kind of thought there was some duplication going on.
>
>But I think bitops.h should be fixed. Many folks don't realize the
>lurking UB, and many folks don't realize its not always optimized
>well.
>
>>> I think the portable way to do a rotate that avoids UB is the
>>> following. GCC, Clang and ICC recognize the pattern, and emit a
>rotate
>>> instruction.
>>>
>>>     static const unsigned int MASK=31;
>>>     return (v<<n)|(v>>(-n&MASK));
>>>
>>> You should also avoid the following because its not constant time
>due
>>> to the branch:
>>>
>>>     return n == 0 ? v : (v << n) | (v >> (sizeof(v) * 8 - n));
>>>
>>
>> Where is this coming from?  I don't see this construct in the patch.
>
>My bad... It was a general observation. I've seen folks try to correct
>the UB by turning to something like that.
>
>Jeff

We don't care about UB, we care about gcc, and to a lesser extent LLVM and ICC.  If bitops.h doesn't do the right thing, we need to fix bitops.h.
tytso@thunk.org May 4, 2016, 7:07 p.m. UTC | #11
On Wed, May 04, 2016 at 11:29:57AM -0700, H. Peter Anvin wrote:
> 
> We don't care about UB, we care about gcc, and to a lesser extent
> LLVM and ICC.  If bitops.h doesn't do the right thing, we need to
> fix bitops.h.

I'm going to suggest that we treat the ro[rl]{32,64}() question as
separable from the /dev/random case.  I've sanity checked gcc 5.3.1
and it does the right thing given the small number of constant
arguments given to rotl32() in lib/chacha20.c, and it doesn't hit the
UB case which Jeffrey was concerned about.  This is also code that was
previously in crypto/chacha20_generic.c, and so if there is a bug with
some obscure compiler not properly compiling it down to a rotate
instruction, (a) no one is paying me to make sure the kernel code
compiles well on ICC, and (b) it's not scalable to have each kernel
developer try to deal with the vagrancies of compilers.

So from my perspective, the only interesting results for me is (a)
using what had been used before with crypto/chacha20_generic.c, or (b)
reusing what is in bitops.h and letting it be someone else's problem
if some obscure compiler isn't happy with what is in bitops.h

If we are all agreed that what is in bitops.h is considered valid,
then we can start converting people over to using the version defined
in bitops.h, and if there is some compiler issue we need to work
around, at least we only need to put the workaround in a single header
file.

Cheers,
					- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
H. Peter Anvin May 4, 2016, 8:53 p.m. UTC | #12
On 05/04/2016 12:07 PM, tytso@thunk.org wrote:
> 
> If we are all agreed that what is in bitops.h is considered valid,
> then we can start converting people over to using the version defined
> in bitops.h, and if there is some compiler issue we need to work
> around, at least we only need to put the workaround in a single header
> file.
> 

Yes, please.

	-hpa


--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/crypto/chacha20_generic.c b/crypto/chacha20_generic.c
index da9c899..1cab831 100644
--- a/crypto/chacha20_generic.c
+++ b/crypto/chacha20_generic.c
@@ -15,72 +15,11 @@ 
 #include <linux/module.h>
 #include <crypto/chacha20.h>
 
-static inline u32 rotl32(u32 v, u8 n)
-{
-	return (v << n) | (v >> (sizeof(v) * 8 - n));
-}
-
 static inline u32 le32_to_cpuvp(const void *p)
 {
 	return le32_to_cpup(p);
 }
 
-static void chacha20_block(u32 *state, void *stream)
-{
-	u32 x[16], *out = stream;
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(x); i++)
-		x[i] = state[i];
-
-	for (i = 0; i < 20; i += 2) {
-		x[0]  += x[4];    x[12] = rotl32(x[12] ^ x[0],  16);
-		x[1]  += x[5];    x[13] = rotl32(x[13] ^ x[1],  16);
-		x[2]  += x[6];    x[14] = rotl32(x[14] ^ x[2],  16);
-		x[3]  += x[7];    x[15] = rotl32(x[15] ^ x[3],  16);
-
-		x[8]  += x[12];   x[4]  = rotl32(x[4]  ^ x[8],  12);
-		x[9]  += x[13];   x[5]  = rotl32(x[5]  ^ x[9],  12);
-		x[10] += x[14];   x[6]  = rotl32(x[6]  ^ x[10], 12);
-		x[11] += x[15];   x[7]  = rotl32(x[7]  ^ x[11], 12);
-
-		x[0]  += x[4];    x[12] = rotl32(x[12] ^ x[0],   8);
-		x[1]  += x[5];    x[13] = rotl32(x[13] ^ x[1],   8);
-		x[2]  += x[6];    x[14] = rotl32(x[14] ^ x[2],   8);
-		x[3]  += x[7];    x[15] = rotl32(x[15] ^ x[3],   8);
-
-		x[8]  += x[12];   x[4]  = rotl32(x[4]  ^ x[8],   7);
-		x[9]  += x[13];   x[5]  = rotl32(x[5]  ^ x[9],   7);
-		x[10] += x[14];   x[6]  = rotl32(x[6]  ^ x[10],  7);
-		x[11] += x[15];   x[7]  = rotl32(x[7]  ^ x[11],  7);
-
-		x[0]  += x[5];    x[15] = rotl32(x[15] ^ x[0],  16);
-		x[1]  += x[6];    x[12] = rotl32(x[12] ^ x[1],  16);
-		x[2]  += x[7];    x[13] = rotl32(x[13] ^ x[2],  16);
-		x[3]  += x[4];    x[14] = rotl32(x[14] ^ x[3],  16);
-
-		x[10] += x[15];   x[5]  = rotl32(x[5]  ^ x[10], 12);
-		x[11] += x[12];   x[6]  = rotl32(x[6]  ^ x[11], 12);
-		x[8]  += x[13];   x[7]  = rotl32(x[7]  ^ x[8],  12);
-		x[9]  += x[14];   x[4]  = rotl32(x[4]  ^ x[9],  12);
-
-		x[0]  += x[5];    x[15] = rotl32(x[15] ^ x[0],   8);
-		x[1]  += x[6];    x[12] = rotl32(x[12] ^ x[1],   8);
-		x[2]  += x[7];    x[13] = rotl32(x[13] ^ x[2],   8);
-		x[3]  += x[4];    x[14] = rotl32(x[14] ^ x[3],   8);
-
-		x[10] += x[15];   x[5]  = rotl32(x[5]  ^ x[10],  7);
-		x[11] += x[12];   x[6]  = rotl32(x[6]  ^ x[11],  7);
-		x[8]  += x[13];   x[7]  = rotl32(x[7]  ^ x[8],   7);
-		x[9]  += x[14];   x[4]  = rotl32(x[4]  ^ x[9],   7);
-	}
-
-	for (i = 0; i < ARRAY_SIZE(x); i++)
-		out[i] = cpu_to_le32(x[i] + state[i]);
-
-	state[12]++;
-}
-
 static void chacha20_docrypt(u32 *state, u8 *dst, const u8 *src,
 			     unsigned int bytes)
 {
diff --git a/drivers/char/random.c b/drivers/char/random.c
index b583e53..95f4451 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -260,6 +260,7 @@ 
 #include <linux/irq.h>
 #include <linux/syscalls.h>
 #include <linux/completion.h>
+#include <crypto/chacha20.h>
 
 #include <asm/processor.h>
 #include <asm/uaccess.h>
@@ -412,6 +413,15 @@  static struct fasync_struct *fasync;
 static DEFINE_SPINLOCK(random_ready_list_lock);
 static LIST_HEAD(random_ready_list);
 
+/*
+ * crng_init =  0 --> Uninitialized
+ *		2 --> Initialized
+ *		3 --> Initialized from input_pool
+ */
+static int crng_init = 0;
+#define crng_ready() (likely(crng_init >= 2))
+static void process_random_ready_list(void);
+
 /**********************************************************************
  *
  * OS independent entropy store.   Here are the functions which handle
@@ -441,10 +451,13 @@  struct entropy_store {
 	__u8 last_data[EXTRACT_SIZE];
 };
 
+static ssize_t extract_entropy(struct entropy_store *r, void *buf,
+			       size_t nbytes, int min, int rsvd);
+
+static int crng_reseed(struct entropy_store *r);
 static void push_to_pool(struct work_struct *work);
 static __u32 input_pool_data[INPUT_POOL_WORDS];
 static __u32 blocking_pool_data[OUTPUT_POOL_WORDS];
-static __u32 nonblocking_pool_data[OUTPUT_POOL_WORDS];
 
 static struct entropy_store input_pool = {
 	.poolinfo = &poolinfo_table[0],
@@ -465,16 +478,6 @@  static struct entropy_store blocking_pool = {
 					push_to_pool),
 };
 
-static struct entropy_store nonblocking_pool = {
-	.poolinfo = &poolinfo_table[1],
-	.name = "nonblocking",
-	.pull = &input_pool,
-	.lock = __SPIN_LOCK_UNLOCKED(nonblocking_pool.lock),
-	.pool = nonblocking_pool_data,
-	.push_work = __WORK_INITIALIZER(nonblocking_pool.push_work,
-					push_to_pool),
-};
-
 static __u32 const twist_table[8] = {
 	0x00000000, 0x3b6e20c8, 0x76dc4190, 0x4db26158,
 	0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278 };
@@ -677,12 +680,6 @@  retry:
 	if (!r->initialized && r->entropy_total > 128) {
 		r->initialized = 1;
 		r->entropy_total = 0;
-		if (r == &nonblocking_pool) {
-			prandom_reseed_late();
-			process_random_ready_list();
-			wake_up_all(&urandom_init_wait);
-			pr_notice("random: %s pool is initialized\n", r->name);
-		}
 	}
 
 	trace_credit_entropy_bits(r->name, nbits,
@@ -692,30 +689,27 @@  retry:
 	if (r == &input_pool) {
 		int entropy_bits = entropy_count >> ENTROPY_SHIFT;
 
+		if (crng_init < 3 && entropy_bits >= 128) {
+			(void) crng_reseed(r);
+			entropy_bits = r->entropy_count >> ENTROPY_SHIFT;
+		}
+
 		/* should we wake readers? */
 		if (entropy_bits >= random_read_wakeup_bits) {
 			wake_up_interruptible(&random_read_wait);
 			kill_fasync(&fasync, SIGIO, POLL_IN);
 		}
 		/* If the input pool is getting full, send some
-		 * entropy to the two output pools, flipping back and
-		 * forth between them, until the output pools are 75%
-		 * full.
+		 * entropy to the blocking pool until it is 75% full.
 		 */
 		if (entropy_bits > random_write_wakeup_bits &&
 		    r->initialized &&
 		    r->entropy_total >= 2*random_read_wakeup_bits) {
-			static struct entropy_store *last = &blocking_pool;
 			struct entropy_store *other = &blocking_pool;
 
-			if (last == &blocking_pool)
-				other = &nonblocking_pool;
 			if (other->entropy_count <=
-			    3 * other->poolinfo->poolfracbits / 4)
-				last = other;
-			if (last->entropy_count <=
-			    3 * last->poolinfo->poolfracbits / 4) {
-				schedule_work(&last->push_work);
+			    3 * other->poolinfo->poolfracbits / 4) {
+				schedule_work(&other->push_work);
 				r->entropy_total = 0;
 			}
 		}
@@ -735,6 +729,158 @@  static void credit_entropy_bits_safe(struct entropy_store *r, int nbits)
 
 /*********************************************************************
  *
+ * CRNG using CHACHA20
+ *
+ *********************************************************************/
+
+#define CRNG_RESEED_INTERVAL (300*HZ)
+
+struct crng_state {
+	__u32		state[16];
+	unsigned long	init_time;
+	spinlock_t	lock;
+};
+
+struct crng_state primary_crng = {
+	.lock = __SPIN_LOCK_UNLOCKED(primary_crng.lock),
+};
+static DECLARE_WAIT_QUEUE_HEAD(crng_init_wait);
+
+static void _initialize_crng(struct crng_state *crng)
+{
+	int		i;
+	unsigned long	rv;
+
+	memcpy(&crng->state[0], "expand 32-byte k", 16);
+	for (i = 4; i < 16; i++) {
+		if (!arch_get_random_seed_long(&rv) &&
+		    !arch_get_random_long(&rv))
+			rv = random_get_entropy();
+		crng->state[i] ^= rv;
+	}
+	crng->init_time = jiffies - CRNG_RESEED_INTERVAL;
+}
+
+static void initialize_crng(struct crng_state *crng)
+{
+	_initialize_crng(crng);
+	spin_lock_init(&crng->lock);
+}
+
+static int crng_fast_load(__u32 pool[4])
+{
+	int	i;
+	__u32	*p;
+
+	if (!spin_trylock(&primary_crng.lock))
+		return 0;
+	if (crng_ready()) {
+		spin_unlock(&primary_crng.lock);
+		return 0;
+	}
+	p = &primary_crng.state[4];
+	if (crng_init == 1)
+		p += 4;
+	for (i=0; i < 4; i++)
+		*p ^= pool[i];
+	if (crng_init++ >= 2)
+		wake_up_interruptible(&crng_init_wait);
+	pr_notice("random: crng_init %d\n", crng_init);
+	spin_unlock(&primary_crng.lock);
+	return 1;
+}
+
+/* Returns 1 on success */
+static int crng_reseed(struct entropy_store *r)
+{
+	unsigned long	flags;
+	int		ret = 0;
+	int		i, num, num_words;
+	__u32		tmp[16];
+
+	spin_lock_irqsave(&primary_crng.lock, flags);
+	num = extract_entropy(r, tmp, 32, 16, 0);
+	if (num == 0)
+		goto out;
+	if (num < 16 || num > 32) {
+		WARN_ON(1);
+		pr_err("crng_reseed: num is %d?!?\n", num);
+	}
+	num_words = (num + 3) / 4;
+	for (i = 0; i < num_words; i++)
+		primary_crng.state[i+4] ^= tmp[i];
+	primary_crng.init_time = jiffies;
+	if (crng_init < 3) {
+		crng_init = 3;
+		process_random_ready_list();
+		wake_up_interruptible(&crng_init_wait);
+		pr_notice("random: crng_init 3\n");
+	}
+	ret = 1;
+out:
+	spin_unlock_irqrestore(&primary_crng.lock, flags);
+	return ret;
+}
+
+static inline void crng_wait_ready(void)
+{
+	wait_event_interruptible(crng_init_wait, crng_ready());
+}
+
+static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE])
+{
+	unsigned long v, flags;
+	struct crng_state *crng = &primary_crng;
+
+	if (crng_init > 2 &&
+	    time_after(jiffies, crng->init_time + CRNG_RESEED_INTERVAL))
+		crng_reseed(&input_pool);
+	spin_lock_irqsave(&crng->lock, flags);
+	if (arch_get_random_long(&v))
+		crng->state[14] ^= v;
+	chacha20_block(&crng->state[0], out);
+	if (crng->state[12] == 0)
+		crng->state[13]++;
+	spin_unlock_irqrestore(&crng->lock, flags);
+}
+
+static ssize_t extract_crng_user(void __user *buf, size_t nbytes)
+{
+	ssize_t ret = 0, i;
+	__u8 tmp[CHACHA20_BLOCK_SIZE];
+	int large_request = (nbytes > 256);
+
+	while (nbytes) {
+		if (large_request && need_resched()) {
+			if (signal_pending(current)) {
+				if (ret == 0)
+					ret = -ERESTARTSYS;
+				break;
+			}
+			schedule();
+		}
+
+		extract_crng(tmp);
+		i = min_t(int, nbytes, CHACHA20_BLOCK_SIZE);
+		if (copy_to_user(buf, tmp, i)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		nbytes -= i;
+		buf += i;
+		ret += i;
+	}
+
+	/* Wipe data just written to memory */
+	memzero_explicit(tmp, sizeof(tmp));
+
+	return ret;
+}
+
+
+/*********************************************************************
+ *
  * Entropy input management
  *
  *********************************************************************/
@@ -749,12 +895,12 @@  struct timer_rand_state {
 #define INIT_TIMER_RAND_STATE { INITIAL_JIFFIES, };
 
 /*
- * Add device- or boot-specific data to the input and nonblocking
- * pools to help initialize them to unique values.
+ * Add device- or boot-specific data to the input pool to help
+ * initialize it.
  *
- * None of this adds any entropy, it is meant to avoid the
- * problem of the nonblocking pool having similar initial state
- * across largely identical devices.
+ * None of this adds any entropy; it is meant to avoid the problem of
+ * the entropy pool having similar initial state across largely
+ * identical devices.
  */
 void add_device_randomness(const void *buf, unsigned int size)
 {
@@ -766,11 +912,6 @@  void add_device_randomness(const void *buf, unsigned int size)
 	_mix_pool_bytes(&input_pool, buf, size);
 	_mix_pool_bytes(&input_pool, &time, sizeof(time));
 	spin_unlock_irqrestore(&input_pool.lock, flags);
-
-	spin_lock_irqsave(&nonblocking_pool.lock, flags);
-	_mix_pool_bytes(&nonblocking_pool, buf, size);
-	_mix_pool_bytes(&nonblocking_pool, &time, sizeof(time));
-	spin_unlock_irqrestore(&nonblocking_pool.lock, flags);
 }
 EXPORT_SYMBOL(add_device_randomness);
 
@@ -801,7 +942,7 @@  static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
 	sample.jiffies = jiffies;
 	sample.cycles = random_get_entropy();
 	sample.num = num;
-	r = nonblocking_pool.initialized ? &input_pool : &nonblocking_pool;
+	r = &input_pool;
 	mix_pool_bytes(r, &sample, sizeof(sample));
 
 	/*
@@ -921,7 +1062,13 @@  void add_interrupt_randomness(int irq, int irq_flags)
 	    !time_after(now, fast_pool->last + HZ))
 		return;
 
-	r = nonblocking_pool.initialized ? &input_pool : &nonblocking_pool;
+	if (!crng_ready() && crng_fast_load(fast_pool->pool)) {
+		fast_pool->count = 0;
+		fast_pool->last = now;
+		return;
+	}
+
+	r = &input_pool;
 	if (!spin_trylock(&r->lock))
 		return;
 
@@ -964,9 +1111,6 @@  EXPORT_SYMBOL_GPL(add_disk_randomness);
  *
  *********************************************************************/
 
-static ssize_t extract_entropy(struct entropy_store *r, void *buf,
-			       size_t nbytes, int min, int rsvd);
-
 /*
  * This utility inline function is responsible for transferring entropy
  * from the primary pool to the secondary extraction pool. We make
@@ -1252,15 +1396,26 @@  static ssize_t extract_entropy_user(struct entropy_store *r, void __user *buf,
  */
 void get_random_bytes(void *buf, int nbytes)
 {
+	__u8 tmp[CHACHA20_BLOCK_SIZE];
+
 #if DEBUG_RANDOM_BOOT > 0
-	if (unlikely(nonblocking_pool.initialized == 0))
+	if (!crng_ready())
 		printk(KERN_NOTICE "random: %pF get_random_bytes called "
-		       "with %d bits of entropy available\n",
-		       (void *) _RET_IP_,
-		       nonblocking_pool.entropy_total);
+		       "with crng_init = %d\n", (void *) _RET_IP_, crng_init);
 #endif
 	trace_get_random_bytes(nbytes, _RET_IP_);
-	extract_entropy(&nonblocking_pool, buf, nbytes, 0, 0);
+
+	while (nbytes >= CHACHA20_BLOCK_SIZE) {
+		extract_crng(buf);
+		buf += CHACHA20_BLOCK_SIZE;
+		nbytes -= CHACHA20_BLOCK_SIZE;
+	}
+
+	if (nbytes > 0) {
+		extract_crng(tmp);
+		memcpy(buf, tmp, nbytes);
+		memzero_explicit(tmp, nbytes);
+	}
 }
 EXPORT_SYMBOL(get_random_bytes);
 
@@ -1278,7 +1433,7 @@  int add_random_ready_callback(struct random_ready_callback *rdy)
 	unsigned long flags;
 	int err = -EALREADY;
 
-	if (likely(nonblocking_pool.initialized))
+	if (crng_ready())
 		return err;
 
 	owner = rdy->owner;
@@ -1286,7 +1441,7 @@  int add_random_ready_callback(struct random_ready_callback *rdy)
 		return -ENOENT;
 
 	spin_lock_irqsave(&random_ready_list_lock, flags);
-	if (nonblocking_pool.initialized)
+	if (crng_ready())
 		goto out;
 
 	owner = NULL;
@@ -1350,7 +1505,7 @@  void get_random_bytes_arch(void *buf, int nbytes)
 	}
 
 	if (nbytes)
-		extract_entropy(&nonblocking_pool, p, nbytes, 0, 0);
+		get_random_bytes(p, nbytes);
 }
 EXPORT_SYMBOL(get_random_bytes_arch);
 
@@ -1395,7 +1550,7 @@  static int rand_initialize(void)
 {
 	init_std_data(&input_pool);
 	init_std_data(&blocking_pool);
-	init_std_data(&nonblocking_pool);
+	_initialize_crng(&primary_crng);
 	return 0;
 }
 early_initcall(rand_initialize);
@@ -1459,16 +1614,10 @@  urandom_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
 {
 	int ret;
 
-	if (unlikely(nonblocking_pool.initialized == 0))
-		printk_once(KERN_NOTICE "random: %s urandom read "
-			    "with %d bits of entropy available\n",
-			    current->comm, nonblocking_pool.entropy_total);
-
+	crng_wait_ready();
 	nbytes = min_t(size_t, nbytes, INT_MAX >> (ENTROPY_SHIFT + 3));
-	ret = extract_entropy_user(&nonblocking_pool, buf, nbytes);
-
-	trace_urandom_read(8 * nbytes, ENTROPY_BITS(&nonblocking_pool),
-			   ENTROPY_BITS(&input_pool));
+	ret = extract_crng_user(buf, nbytes);
+	trace_urandom_read(8 * nbytes, 0, ENTROPY_BITS(&input_pool));
 	return ret;
 }
 
@@ -1514,10 +1663,7 @@  static ssize_t random_write(struct file *file, const char __user *buffer,
 {
 	size_t ret;
 
-	ret = write_pool(&blocking_pool, buffer, count);
-	if (ret)
-		return ret;
-	ret = write_pool(&nonblocking_pool, buffer, count);
+	ret = write_pool(&input_pool, buffer, count);
 	if (ret)
 		return ret;
 
@@ -1568,7 +1714,6 @@  static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
 		input_pool.entropy_count = 0;
-		nonblocking_pool.entropy_count = 0;
 		blocking_pool.entropy_count = 0;
 		return 0;
 	default:
@@ -1610,11 +1755,10 @@  SYSCALL_DEFINE3(getrandom, char __user *, buf, size_t, count,
 	if (flags & GRND_RANDOM)
 		return _random_read(flags & GRND_NONBLOCK, buf, count);
 
-	if (unlikely(nonblocking_pool.initialized == 0)) {
+	if (!crng_ready()) {
 		if (flags & GRND_NONBLOCK)
 			return -EAGAIN;
-		wait_event_interruptible(urandom_init_wait,
-					 nonblocking_pool.initialized);
+		crng_wait_ready();
 		if (signal_pending(current))
 			return -ERESTARTSYS;
 	}
diff --git a/include/crypto/chacha20.h b/include/crypto/chacha20.h
index 274bbae..20d20f68 100644
--- a/include/crypto/chacha20.h
+++ b/include/crypto/chacha20.h
@@ -16,6 +16,7 @@  struct chacha20_ctx {
 	u32 key[8];
 };
 
+void chacha20_block(u32 *state, void *stream);
 void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv);
 int crypto_chacha20_setkey(struct crypto_tfm *tfm, const u8 *key,
 			   unsigned int keysize);
diff --git a/lib/Makefile b/lib/Makefile
index 7bd6fd4..9ba27cd 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -22,7 +22,7 @@  KCOV_INSTRUMENT_hweight.o := n
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 rbtree.o radix-tree.o dump_stack.o timerqueue.o\
 	 idr.o int_sqrt.o extable.o \
-	 sha1.o md5.o irq_regs.o argv_split.o \
+	 sha1.o chacha20.o md5.o irq_regs.o argv_split.o \
 	 proportions.o flex_proportions.o ratelimit.o show_mem.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
 	 earlycpio.o seq_buf.o nmi_backtrace.o
diff --git a/lib/chacha20.c b/lib/chacha20.c
new file mode 100644
index 0000000..250ceed
--- /dev/null
+++ b/lib/chacha20.c
@@ -0,0 +1,79 @@ 
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/bitops.h>
+#include <linux/cryptohash.h>
+#include <asm/unaligned.h>
+#include <crypto/chacha20.h>
+
+static inline u32 rotl32(u32 v, u8 n)
+{
+	return (v << n) | (v >> (sizeof(v) * 8 - n));
+}
+
+extern void chacha20_block(u32 *state, void *stream)
+{
+	u32 x[16], *out = stream;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(x); i++)
+		x[i] = state[i];
+
+	for (i = 0; i < 20; i += 2) {
+		x[0]  += x[4];    x[12] = rotl32(x[12] ^ x[0],  16);
+		x[1]  += x[5];    x[13] = rotl32(x[13] ^ x[1],  16);
+		x[2]  += x[6];    x[14] = rotl32(x[14] ^ x[2],  16);
+		x[3]  += x[7];    x[15] = rotl32(x[15] ^ x[3],  16);
+
+		x[8]  += x[12];   x[4]  = rotl32(x[4]  ^ x[8],  12);
+		x[9]  += x[13];   x[5]  = rotl32(x[5]  ^ x[9],  12);
+		x[10] += x[14];   x[6]  = rotl32(x[6]  ^ x[10], 12);
+		x[11] += x[15];   x[7]  = rotl32(x[7]  ^ x[11], 12);
+
+		x[0]  += x[4];    x[12] = rotl32(x[12] ^ x[0],   8);
+		x[1]  += x[5];    x[13] = rotl32(x[13] ^ x[1],   8);
+		x[2]  += x[6];    x[14] = rotl32(x[14] ^ x[2],   8);
+		x[3]  += x[7];    x[15] = rotl32(x[15] ^ x[3],   8);
+
+		x[8]  += x[12];   x[4]  = rotl32(x[4]  ^ x[8],   7);
+		x[9]  += x[13];   x[5]  = rotl32(x[5]  ^ x[9],   7);
+		x[10] += x[14];   x[6]  = rotl32(x[6]  ^ x[10],  7);
+		x[11] += x[15];   x[7]  = rotl32(x[7]  ^ x[11],  7);
+
+		x[0]  += x[5];    x[15] = rotl32(x[15] ^ x[0],  16);
+		x[1]  += x[6];    x[12] = rotl32(x[12] ^ x[1],  16);
+		x[2]  += x[7];    x[13] = rotl32(x[13] ^ x[2],  16);
+		x[3]  += x[4];    x[14] = rotl32(x[14] ^ x[3],  16);
+
+		x[10] += x[15];   x[5]  = rotl32(x[5]  ^ x[10], 12);
+		x[11] += x[12];   x[6]  = rotl32(x[6]  ^ x[11], 12);
+		x[8]  += x[13];   x[7]  = rotl32(x[7]  ^ x[8],  12);
+		x[9]  += x[14];   x[4]  = rotl32(x[4]  ^ x[9],  12);
+
+		x[0]  += x[5];    x[15] = rotl32(x[15] ^ x[0],   8);
+		x[1]  += x[6];    x[12] = rotl32(x[12] ^ x[1],   8);
+		x[2]  += x[7];    x[13] = rotl32(x[13] ^ x[2],   8);
+		x[3]  += x[4];    x[14] = rotl32(x[14] ^ x[3],   8);
+
+		x[10] += x[15];   x[5]  = rotl32(x[5]  ^ x[10],  7);
+		x[11] += x[12];   x[6]  = rotl32(x[6]  ^ x[11],  7);
+		x[8]  += x[13];   x[7]  = rotl32(x[7]  ^ x[8],   7);
+		x[9]  += x[14];   x[4]  = rotl32(x[4]  ^ x[9],   7);
+	}
+
+	for (i = 0; i < ARRAY_SIZE(x); i++)
+		out[i] = cpu_to_le32(x[i] + state[i]);
+
+	state[12]++;
+}
+EXPORT_SYMBOL(chacha20_block);