| Message ID | 1462170413-7164-2-git-send-email-tytso@mit.edu (mailing list archive) |
|---|---|
| State | Not Applicable |
| Delegated to: | Herbert Xu |
Am Montag, 2. Mai 2016, 02:26:51 schrieb Theodore Ts'o: Hi Theodore, > The CRNG is faster, and we don't pretend to track entropy usage in the > CRNG any more. In general, I have no concerns with this approach either. And thank you that some of my concerns are addressed. There are few more concerns left open. I would suggest I would write them up with a proposal on how to address them. Some comments inlne: > > Signed-off-by: Theodore Ts'o <tytso@mit.edu> > --- > crypto/chacha20_generic.c | 61 ---------- > drivers/char/random.c | 282 > ++++++++++++++++++++++++++++++++++------------ include/crypto/chacha20.h | > 1 + > lib/Makefile | 2 +- > lib/chacha20.c | 79 +++++++++++++ > 5 files changed, 294 insertions(+), 131 deletions(-) > create mode 100644 lib/chacha20.c > > diff --git a/crypto/chacha20_generic.c b/crypto/chacha20_generic.c > index da9c899..1cab831 100644 > --- a/crypto/chacha20_generic.c > +++ b/crypto/chacha20_generic.c > @@ -15,72 +15,11 @@ > #include <linux/module.h> > #include <crypto/chacha20.h> > > -static inline u32 rotl32(u32 v, u8 n) > -{ > - return (v << n) | (v >> (sizeof(v) * 8 - n)); > -} > - > static inline u32 le32_to_cpuvp(const void *p) > { > return le32_to_cpup(p); > } > > -static void chacha20_block(u32 *state, void *stream) > -{ > - u32 x[16], *out = stream; > - int i; > - > - for (i = 0; i < ARRAY_SIZE(x); i++) > - x[i] = state[i]; > - > - for (i = 0; i < 20; i += 2) { > - x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 16); > - x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 16); > - x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 16); > - x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 16); > - > - x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 12); > - x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 12); > - x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 12); > - x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 12); > - > - x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 8); > - x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 8); > - x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 8); > - x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 8); > - > - x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 7); > - x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 7); > - x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 7); > - x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 7); > - > - x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 16); > - x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 16); > - x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 16); > - x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 16); > - > - x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 12); > - x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 12); > - x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 12); > - x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 12); > - > - x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 8); > - x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 8); > - x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 8); > - x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 8); > - > - x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 7); > - x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 7); > - x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 7); > - x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 7); > - } > - > - for (i = 0; i < ARRAY_SIZE(x); i++) > - out[i] = cpu_to_le32(x[i] + state[i]); > - > - state[12]++; > -} > - > static void chacha20_docrypt(u32 *state, u8 *dst, const u8 *src, > unsigned int bytes) > { > diff --git a/drivers/char/random.c b/drivers/char/random.c > index b583e53..95f4451 100644 > --- a/drivers/char/random.c > +++ b/drivers/char/random.c > @@ -260,6 +260,7 @@ > #include <linux/irq.h> > #include <linux/syscalls.h> > #include 
<linux/completion.h> > +#include <crypto/chacha20.h> > > #include <asm/processor.h> > #include <asm/uaccess.h> > @@ -412,6 +413,15 @@ static struct fasync_struct *fasync; > static DEFINE_SPINLOCK(random_ready_list_lock); > static LIST_HEAD(random_ready_list); > > +/* > + * crng_init = 0 --> Uninitialized > + * 2 --> Initialized > + * 3 --> Initialized from input_pool > + */ > +static int crng_init = 0; shouldn't that be an atomic_t ? > +#define crng_ready() (likely(crng_init >= 2)) > +static void process_random_ready_list(void); > + > /********************************************************************** > * > * OS independent entropy store. Here are the functions which handle > @@ -441,10 +451,13 @@ struct entropy_store { > __u8 last_data[EXTRACT_SIZE]; > }; > > +static ssize_t extract_entropy(struct entropy_store *r, void *buf, > + size_t nbytes, int min, int rsvd); > + > +static int crng_reseed(struct entropy_store *r); > static void push_to_pool(struct work_struct *work); > static __u32 input_pool_data[INPUT_POOL_WORDS]; > static __u32 blocking_pool_data[OUTPUT_POOL_WORDS]; > -static __u32 nonblocking_pool_data[OUTPUT_POOL_WORDS]; > > static struct entropy_store input_pool = { > .poolinfo = &poolinfo_table[0], > @@ -465,16 +478,6 @@ static struct entropy_store blocking_pool = { > push_to_pool), > }; > > -static struct entropy_store nonblocking_pool = { > - .poolinfo = &poolinfo_table[1], > - .name = "nonblocking", > - .pull = &input_pool, > - .lock = __SPIN_LOCK_UNLOCKED(nonblocking_pool.lock), > - .pool = nonblocking_pool_data, > - .push_work = __WORK_INITIALIZER(nonblocking_pool.push_work, > - push_to_pool), > -}; > - > static __u32 const twist_table[8] = { > 0x00000000, 0x3b6e20c8, 0x76dc4190, 0x4db26158, > 0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278 }; > @@ -677,12 +680,6 @@ retry: > if (!r->initialized && r->entropy_total > 128) { > r->initialized = 1; > r->entropy_total = 0; > - if (r == &nonblocking_pool) { > - prandom_reseed_late(); > - process_random_ready_list(); > - wake_up_all(&urandom_init_wait); > - pr_notice("random: %s pool is initialized\n", r- >name); > - } > } > > trace_credit_entropy_bits(r->name, nbits, > @@ -692,30 +689,27 @@ retry: > if (r == &input_pool) { > int entropy_bits = entropy_count >> ENTROPY_SHIFT; > > + if (crng_init < 3 && entropy_bits >= 128) { > + (void) crng_reseed(r); > + entropy_bits = r->entropy_count >> ENTROPY_SHIFT; > + } > + > /* should we wake readers? */ > if (entropy_bits >= random_read_wakeup_bits) { > wake_up_interruptible(&random_read_wait); > kill_fasync(&fasync, SIGIO, POLL_IN); > } > /* If the input pool is getting full, send some > - * entropy to the two output pools, flipping back and > - * forth between them, until the output pools are 75% > - * full. > + * entropy to the blocking pool until it is 75% full. 
> */ > if (entropy_bits > random_write_wakeup_bits && > r->initialized && > r->entropy_total >= 2*random_read_wakeup_bits) { > - static struct entropy_store *last = &blocking_pool; > struct entropy_store *other = &blocking_pool; > > - if (last == &blocking_pool) > - other = &nonblocking_pool; > if (other->entropy_count <= > - 3 * other->poolinfo->poolfracbits / 4) > - last = other; > - if (last->entropy_count <= > - 3 * last->poolinfo->poolfracbits / 4) { > - schedule_work(&last->push_work); > + 3 * other->poolinfo->poolfracbits / 4) { > + schedule_work(&other->push_work); > r->entropy_total = 0; > } > } > @@ -735,6 +729,158 @@ static void credit_entropy_bits_safe(struct > entropy_store *r, int nbits) > > /********************************************************************* > * > + * CRNG using CHACHA20 > + * > + *********************************************************************/ > + > +#define CRNG_RESEED_INTERVAL (300*HZ) > + > +struct crng_state { > + __u32 state[16]; > + unsigned long init_time; > + spinlock_t lock; > +}; > + > +struct crng_state primary_crng = { > + .lock = __SPIN_LOCK_UNLOCKED(primary_crng.lock), > +}; > +static DECLARE_WAIT_QUEUE_HEAD(crng_init_wait); > + > +static void _initialize_crng(struct crng_state *crng) > +{ > + int i; > + unsigned long rv; Why do you use unsigned long here? I thought the state[i] is unsigned int. > + > + memcpy(&crng->state[0], "expand 32-byte k", 16); > + for (i = 4; i < 16; i++) { > + if (!arch_get_random_seed_long(&rv) && > + !arch_get_random_long(&rv)) > + rv = random_get_entropy(); > + crng->state[i] ^= rv; > + } > + crng->init_time = jiffies - CRNG_RESEED_INTERVAL; Would it make sense to add the ChaCha20 self test vectors from RFC7539 here to test that the ChaCha20 works? > +} > + > +static void initialize_crng(struct crng_state *crng) > +{ > + _initialize_crng(crng); > + spin_lock_init(&crng->lock); > +} > + > +static int crng_fast_load(__u32 pool[4]) > +{ > + int i; > + __u32 *p; > + > + if (!spin_trylock(&primary_crng.lock)) > + return 0; > + if (crng_ready()) { > + spin_unlock(&primary_crng.lock); > + return 0; > + } > + p = &primary_crng.state[4]; > + if (crng_init == 1) > + p += 4; > + for (i=0; i < 4; i++) > + *p ^= pool[i]; > + if (crng_init++ >= 2) > + wake_up_interruptible(&crng_init_wait); Don't we have a race here with the crng_init < 3 check in crng_reseed considering multi-core systems? > + pr_notice("random: crng_init %d\n", crng_init); > + spin_unlock(&primary_crng.lock); > + return 1; > +} > + > +/* Returns 1 on success */ > +static int crng_reseed(struct entropy_store *r) > +{ > + unsigned long flags; > + int ret = 0; > + int i, num, num_words; > + __u32 tmp[16]; > + > + spin_lock_irqsave(&primary_crng.lock, flags); > + num = extract_entropy(r, tmp, 32, 16, 0); > + if (num == 0) > + goto out; > + if (num < 16 || num > 32) { > + WARN_ON(1); > + pr_err("crng_reseed: num is %d?!?\n", num); > + } > + num_words = (num + 3) / 4; > + for (i = 0; i < num_words; i++) > + primary_crng.state[i+4] ^= tmp[i]; > + primary_crng.init_time = jiffies; > + if (crng_init < 3) { Shouldn't that one be if (crng_init < 3 && num >= 16) ? > + crng_init = 3; > + process_random_ready_list(); > + wake_up_interruptible(&crng_init_wait); > + pr_notice("random: crng_init 3\n"); Would it make sense to be more descriptive here to allow readers of dmesg to understand the output? > + } > + ret = 1; > +out: > + spin_unlock_irqrestore(&primary_crng.lock, flags); memzero_explicit of tmp? 
> + return ret; > +} > + > +static inline void crng_wait_ready(void) > +{ > + wait_event_interruptible(crng_init_wait, crng_ready()); > +} > + > +static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE]) > +{ > + unsigned long v, flags; > + struct crng_state *crng = &primary_crng; > + > + if (crng_init > 2 && > + time_after(jiffies, crng->init_time + CRNG_RESEED_INTERVAL)) > + crng_reseed(&input_pool); > + spin_lock_irqsave(&crng->lock, flags); > + if (arch_get_random_long(&v)) > + crng->state[14] ^= v; Again, unsigned int? What is the purpose to only cover the 2nd 32 bit value of the nonce with arch_get_random? > + chacha20_block(&crng->state[0], out); > + if (crng->state[12] == 0) > + crng->state[13]++; state[12]++? Or why do you increment the nonce? > + spin_unlock_irqrestore(&crng->lock, flags); > +} > + > +static ssize_t extract_crng_user(void __user *buf, size_t nbytes) > +{ > + ssize_t ret = 0, i; > + __u8 tmp[CHACHA20_BLOCK_SIZE]; > + int large_request = (nbytes > 256); > + > + while (nbytes) { > + if (large_request && need_resched()) { > + if (signal_pending(current)) { > + if (ret == 0) > + ret = -ERESTARTSYS; > + break; > + } > + schedule(); > + } > + > + extract_crng(tmp); Now I have to wear my (ugly) FIPS heat: we need that code from the current implementation here: if (fips_enabled) { spin_lock_irqsave(&r->lock, flags); if (!memcmp(tmp, r->last_data, EXTRACT_SIZE)) panic("Hardware RNG duplicated output!\n"); memcpy(r->last_data, tmp, EXTRACT_SIZE); spin_unlock_irqrestore(&r->lock, flags); } > + i = min_t(int, nbytes, CHACHA20_BLOCK_SIZE); > + if (copy_to_user(buf, tmp, i)) { > + ret = -EFAULT; > + break; > + } > + > + nbytes -= i; > + buf += i; > + ret += i; > + } > + > + /* Wipe data just written to memory */ > + memzero_explicit(tmp, sizeof(tmp)); > + > + return ret; > +} > + > + > +/********************************************************************* > + * > * Entropy input management > * > *********************************************************************/ > @@ -749,12 +895,12 @@ struct timer_rand_state { > #define INIT_TIMER_RAND_STATE { INITIAL_JIFFIES, }; > > /* > - * Add device- or boot-specific data to the input and nonblocking > - * pools to help initialize them to unique values. > + * Add device- or boot-specific data to the input pool to help > + * initialize it. > * > - * None of this adds any entropy, it is meant to avoid the > - * problem of the nonblocking pool having similar initial state > - * across largely identical devices. > + * None of this adds any entropy; it is meant to avoid the problem of > + * the entropy pool having similar initial state across largely > + * identical devices. > */ > void add_device_randomness(const void *buf, unsigned int size) > { > @@ -766,11 +912,6 @@ void add_device_randomness(const void *buf, unsigned > int size) _mix_pool_bytes(&input_pool, buf, size); > _mix_pool_bytes(&input_pool, &time, sizeof(time)); > spin_unlock_irqrestore(&input_pool.lock, flags); > - > - spin_lock_irqsave(&nonblocking_pool.lock, flags); > - _mix_pool_bytes(&nonblocking_pool, buf, size); > - _mix_pool_bytes(&nonblocking_pool, &time, sizeof(time)); > - spin_unlock_irqrestore(&nonblocking_pool.lock, flags); > } > EXPORT_SYMBOL(add_device_randomness); > > @@ -801,7 +942,7 @@ static void add_timer_randomness(struct timer_rand_state > *state, unsigned num) sample.jiffies = jiffies; > sample.cycles = random_get_entropy(); > sample.num = num; > - r = nonblocking_pool.initialized ? 
&input_pool : &nonblocking_pool; > + r = &input_pool; > mix_pool_bytes(r, &sample, sizeof(sample)); > > /* > @@ -921,7 +1062,13 @@ void add_interrupt_randomness(int irq, int irq_flags) > !time_after(now, fast_pool->last + HZ)) > return; > > - r = nonblocking_pool.initialized ? &input_pool : &nonblocking_pool; > + if (!crng_ready() && crng_fast_load(fast_pool->pool)) { > + fast_pool->count = 0; > + fast_pool->last = now; > + return; > + } > + > + r = &input_pool; > if (!spin_trylock(&r->lock)) > return; > > @@ -964,9 +1111,6 @@ EXPORT_SYMBOL_GPL(add_disk_randomness); > * > *********************************************************************/ > > -static ssize_t extract_entropy(struct entropy_store *r, void *buf, > - size_t nbytes, int min, int rsvd); > - > /* > * This utility inline function is responsible for transferring entropy > * from the primary pool to the secondary extraction pool. We make > @@ -1252,15 +1396,26 @@ static ssize_t extract_entropy_user(struct > entropy_store *r, void __user *buf, */ > void get_random_bytes(void *buf, int nbytes) > { > + __u8 tmp[CHACHA20_BLOCK_SIZE]; > + > #if DEBUG_RANDOM_BOOT > 0 > - if (unlikely(nonblocking_pool.initialized == 0)) > + if (!crng_ready()) > printk(KERN_NOTICE "random: %pF get_random_bytes called " > - "with %d bits of entropy available\n", > - (void *) _RET_IP_, > - nonblocking_pool.entropy_total); > + "with crng_init = %d\n", (void *) _RET_IP_, crng_init); > #endif > trace_get_random_bytes(nbytes, _RET_IP_); > - extract_entropy(&nonblocking_pool, buf, nbytes, 0, 0); > + > + while (nbytes >= CHACHA20_BLOCK_SIZE) { > + extract_crng(buf); > + buf += CHACHA20_BLOCK_SIZE; > + nbytes -= CHACHA20_BLOCK_SIZE; > + } > + > + if (nbytes > 0) { > + extract_crng(tmp); > + memcpy(buf, tmp, nbytes); > + memzero_explicit(tmp, nbytes); > + } > } > EXPORT_SYMBOL(get_random_bytes); > > @@ -1278,7 +1433,7 @@ int add_random_ready_callback(struct > random_ready_callback *rdy) unsigned long flags; > int err = -EALREADY; > > - if (likely(nonblocking_pool.initialized)) > + if (crng_ready()) > return err; > > owner = rdy->owner; > @@ -1286,7 +1441,7 @@ int add_random_ready_callback(struct > random_ready_callback *rdy) return -ENOENT; > > spin_lock_irqsave(&random_ready_list_lock, flags); > - if (nonblocking_pool.initialized) > + if (crng_ready()) > goto out; > > owner = NULL; > @@ -1350,7 +1505,7 @@ void get_random_bytes_arch(void *buf, int nbytes) > } > > if (nbytes) > - extract_entropy(&nonblocking_pool, p, nbytes, 0, 0); > + get_random_bytes(p, nbytes); > } > EXPORT_SYMBOL(get_random_bytes_arch); > > @@ -1395,7 +1550,7 @@ static int rand_initialize(void) > { > init_std_data(&input_pool); > init_std_data(&blocking_pool); > - init_std_data(&nonblocking_pool); > + _initialize_crng(&primary_crng); > return 0; > } > early_initcall(rand_initialize); > @@ -1459,16 +1614,10 @@ urandom_read(struct file *file, char __user *buf, > size_t nbytes, loff_t *ppos) { > int ret; > > - if (unlikely(nonblocking_pool.initialized == 0)) > - printk_once(KERN_NOTICE "random: %s urandom read " > - "with %d bits of entropy available\n", > - current->comm, nonblocking_pool.entropy_total); > - > + crng_wait_ready(); Just for clarification: are you now blocking /dev/urandom until the CRNG is filled? That would be a big win. 
> nbytes = min_t(size_t, nbytes, INT_MAX >> (ENTROPY_SHIFT + 3)); > - ret = extract_entropy_user(&nonblocking_pool, buf, nbytes); > - > - trace_urandom_read(8 * nbytes, ENTROPY_BITS(&nonblocking_pool), > - ENTROPY_BITS(&input_pool)); > + ret = extract_crng_user(buf, nbytes); > + trace_urandom_read(8 * nbytes, 0, ENTROPY_BITS(&input_pool)); > return ret; > } > > @@ -1514,10 +1663,7 @@ static ssize_t random_write(struct file *file, const > char __user *buffer, { > size_t ret; > > - ret = write_pool(&blocking_pool, buffer, count); > - if (ret) > - return ret; > - ret = write_pool(&nonblocking_pool, buffer, count); > + ret = write_pool(&input_pool, buffer, count); > if (ret) > return ret; > > @@ -1568,7 +1714,6 @@ static long random_ioctl(struct file *f, unsigned int > cmd, unsigned long arg) if (!capable(CAP_SYS_ADMIN)) > return -EPERM; > input_pool.entropy_count = 0; > - nonblocking_pool.entropy_count = 0; > blocking_pool.entropy_count = 0; > return 0; > default: > @@ -1610,11 +1755,10 @@ SYSCALL_DEFINE3(getrandom, char __user *, buf, > size_t, count, if (flags & GRND_RANDOM) > return _random_read(flags & GRND_NONBLOCK, buf, count); > > - if (unlikely(nonblocking_pool.initialized == 0)) { > + if (!crng_ready()) { > if (flags & GRND_NONBLOCK) > return -EAGAIN; > - wait_event_interruptible(urandom_init_wait, > - nonblocking_pool.initialized); > + crng_wait_ready(); > if (signal_pending(current)) > return -ERESTARTSYS; > } > diff --git a/include/crypto/chacha20.h b/include/crypto/chacha20.h > index 274bbae..20d20f68 100644 > --- a/include/crypto/chacha20.h > +++ b/include/crypto/chacha20.h > @@ -16,6 +16,7 @@ struct chacha20_ctx { > u32 key[8]; > }; > > +void chacha20_block(u32 *state, void *stream); > void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv); > int crypto_chacha20_setkey(struct crypto_tfm *tfm, const u8 *key, > unsigned int keysize); > diff --git a/lib/Makefile b/lib/Makefile > index 7bd6fd4..9ba27cd 100644 > --- a/lib/Makefile > +++ b/lib/Makefile > @@ -22,7 +22,7 @@ KCOV_INSTRUMENT_hweight.o := n > lib-y := ctype.o string.o vsprintf.o cmdline.o \ > rbtree.o radix-tree.o dump_stack.o timerqueue.o\ > idr.o int_sqrt.o extable.o \ > - sha1.o md5.o irq_regs.o argv_split.o \ > + sha1.o chacha20.o md5.o irq_regs.o argv_split.o \ > proportions.o flex_proportions.o ratelimit.o show_mem.o \ > is_single_threaded.o plist.o decompress.o kobject_uevent.o \ > earlycpio.o seq_buf.o nmi_backtrace.o > diff --git a/lib/chacha20.c b/lib/chacha20.c > new file mode 100644 > index 0000000..250ceed > --- /dev/null > +++ b/lib/chacha20.c > @@ -0,0 +1,79 @@ > +/* > + * ChaCha20 256-bit cipher algorithm, RFC7539 > + * > + * Copyright (C) 2015 Martin Willi > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; either version 2 of the License, or > + * (at your option) any later version. 
> + */ > + > +#include <linux/kernel.h> > +#include <linux/export.h> > +#include <linux/bitops.h> > +#include <linux/cryptohash.h> > +#include <asm/unaligned.h> > +#include <crypto/chacha20.h> > + > +static inline u32 rotl32(u32 v, u8 n) > +{ > + return (v << n) | (v >> (sizeof(v) * 8 - n)); > +} > + > +extern void chacha20_block(u32 *state, void *stream) > +{ > + u32 x[16], *out = stream; > + int i; > + > + for (i = 0; i < ARRAY_SIZE(x); i++) > + x[i] = state[i]; > + > + for (i = 0; i < 20; i += 2) { > + x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 16); > + x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 16); > + x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 16); > + x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 16); > + > + x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 12); > + x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 12); > + x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 12); > + x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 12); > + > + x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 8); > + x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 8); > + x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 8); > + x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 8); > + > + x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 7); > + x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 7); > + x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 7); > + x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 7); > + > + x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 16); > + x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 16); > + x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 16); > + x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 16); > + > + x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 12); > + x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 12); > + x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 12); > + x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 12); > + > + x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 8); > + x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 8); > + x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 8); > + x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 8); > + > + x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 7); > + x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 7); > + x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 7); > + x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 7); > + } > + > + for (i = 0; i < ARRAY_SIZE(x); i++) > + out[i] = cpu_to_le32(x[i] + state[i]); > + > + state[12]++; > +} > +EXPORT_SYMBOL(chacha20_block); Ciao Stephan -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Am Montag, 2. Mai 2016, 02:26:51 schrieb Theodore Ts'o: Hi Theodore, One more item. > The CRNG is faster, and we don't pretend to track entropy usage in the > CRNG any more. > > Signed-off-by: Theodore Ts'o <tytso@mit.edu> > --- > crypto/chacha20_generic.c | 61 ---------- > drivers/char/random.c | 282 > ++++++++++++++++++++++++++++++++++------------ include/crypto/chacha20.h | > 1 + > lib/Makefile | 2 +- > lib/chacha20.c | 79 +++++++++++++ > 5 files changed, 294 insertions(+), 131 deletions(-) > create mode 100644 lib/chacha20.c > > diff --git a/crypto/chacha20_generic.c b/crypto/chacha20_generic.c > index da9c899..1cab831 100644 > --- a/crypto/chacha20_generic.c > +++ b/crypto/chacha20_generic.c > @@ -15,72 +15,11 @@ > #include <linux/module.h> > #include <crypto/chacha20.h> > > -static inline u32 rotl32(u32 v, u8 n) > -{ > - return (v << n) | (v >> (sizeof(v) * 8 - n)); > -} > - > static inline u32 le32_to_cpuvp(const void *p) > { > return le32_to_cpup(p); > } > > -static void chacha20_block(u32 *state, void *stream) > -{ > - u32 x[16], *out = stream; > - int i; > - > - for (i = 0; i < ARRAY_SIZE(x); i++) > - x[i] = state[i]; > - > - for (i = 0; i < 20; i += 2) { > - x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 16); > - x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 16); > - x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 16); > - x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 16); > - > - x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 12); > - x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 12); > - x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 12); > - x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 12); > - > - x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 8); > - x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 8); > - x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 8); > - x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 8); > - > - x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 7); > - x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 7); > - x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 7); > - x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 7); > - > - x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 16); > - x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 16); > - x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 16); > - x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 16); > - > - x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 12); > - x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 12); > - x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 12); > - x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 12); > - > - x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 8); > - x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 8); > - x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 8); > - x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 8); > - > - x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 7); > - x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 7); > - x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 7); > - x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 7); > - } > - > - for (i = 0; i < ARRAY_SIZE(x); i++) > - out[i] = cpu_to_le32(x[i] + state[i]); > - > - state[12]++; > -} > - > static void chacha20_docrypt(u32 *state, u8 *dst, const u8 *src, > unsigned int bytes) > { > diff --git a/drivers/char/random.c b/drivers/char/random.c > index b583e53..95f4451 100644 > --- a/drivers/char/random.c > +++ b/drivers/char/random.c > @@ -260,6 +260,7 @@ > #include <linux/irq.h> > #include <linux/syscalls.h> > #include <linux/completion.h> > +#include <crypto/chacha20.h> > > #include <asm/processor.h> > #include <asm/uaccess.h> > @@ -412,6 +413,15 @@ static struct fasync_struct *fasync; > static DEFINE_SPINLOCK(random_ready_list_lock); > static 
LIST_HEAD(random_ready_list); > > +/* > + * crng_init = 0 --> Uninitialized > + * 2 --> Initialized > + * 3 --> Initialized from input_pool > + */ > +static int crng_init = 0; > +#define crng_ready() (likely(crng_init >= 2)) > +static void process_random_ready_list(void); > + > /********************************************************************** > * > * OS independent entropy store. Here are the functions which handle > @@ -441,10 +451,13 @@ struct entropy_store { > __u8 last_data[EXTRACT_SIZE]; > }; > > +static ssize_t extract_entropy(struct entropy_store *r, void *buf, > + size_t nbytes, int min, int rsvd); > + > +static int crng_reseed(struct entropy_store *r); > static void push_to_pool(struct work_struct *work); > static __u32 input_pool_data[INPUT_POOL_WORDS]; > static __u32 blocking_pool_data[OUTPUT_POOL_WORDS]; > -static __u32 nonblocking_pool_data[OUTPUT_POOL_WORDS]; > > static struct entropy_store input_pool = { > .poolinfo = &poolinfo_table[0], > @@ -465,16 +478,6 @@ static struct entropy_store blocking_pool = { > push_to_pool), > }; > > -static struct entropy_store nonblocking_pool = { > - .poolinfo = &poolinfo_table[1], > - .name = "nonblocking", > - .pull = &input_pool, > - .lock = __SPIN_LOCK_UNLOCKED(nonblocking_pool.lock), > - .pool = nonblocking_pool_data, > - .push_work = __WORK_INITIALIZER(nonblocking_pool.push_work, > - push_to_pool), > -}; > - > static __u32 const twist_table[8] = { > 0x00000000, 0x3b6e20c8, 0x76dc4190, 0x4db26158, > 0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278 }; > @@ -677,12 +680,6 @@ retry: > if (!r->initialized && r->entropy_total > 128) { > r->initialized = 1; > r->entropy_total = 0; > - if (r == &nonblocking_pool) { > - prandom_reseed_late(); > - process_random_ready_list(); > - wake_up_all(&urandom_init_wait); > - pr_notice("random: %s pool is initialized\n", r- >name); > - } > } > > trace_credit_entropy_bits(r->name, nbits, > @@ -692,30 +689,27 @@ retry: > if (r == &input_pool) { > int entropy_bits = entropy_count >> ENTROPY_SHIFT; > > + if (crng_init < 3 && entropy_bits >= 128) { > + (void) crng_reseed(r); > + entropy_bits = r->entropy_count >> ENTROPY_SHIFT; > + } > + > /* should we wake readers? */ > if (entropy_bits >= random_read_wakeup_bits) { > wake_up_interruptible(&random_read_wait); > kill_fasync(&fasync, SIGIO, POLL_IN); > } > /* If the input pool is getting full, send some > - * entropy to the two output pools, flipping back and > - * forth between them, until the output pools are 75% > - * full. > + * entropy to the blocking pool until it is 75% full. 
> */ > if (entropy_bits > random_write_wakeup_bits && > r->initialized && > r->entropy_total >= 2*random_read_wakeup_bits) { > - static struct entropy_store *last = &blocking_pool; > struct entropy_store *other = &blocking_pool; > > - if (last == &blocking_pool) > - other = &nonblocking_pool; > if (other->entropy_count <= > - 3 * other->poolinfo->poolfracbits / 4) > - last = other; > - if (last->entropy_count <= > - 3 * last->poolinfo->poolfracbits / 4) { > - schedule_work(&last->push_work); > + 3 * other->poolinfo->poolfracbits / 4) { > + schedule_work(&other->push_work); > r->entropy_total = 0; > } > } > @@ -735,6 +729,158 @@ static void credit_entropy_bits_safe(struct > entropy_store *r, int nbits) > > /********************************************************************* > * > + * CRNG using CHACHA20 > + * > + *********************************************************************/ > + > +#define CRNG_RESEED_INTERVAL (300*HZ) > + > +struct crng_state { > + __u32 state[16]; > + unsigned long init_time; > + spinlock_t lock; > +}; > + > +struct crng_state primary_crng = { > + .lock = __SPIN_LOCK_UNLOCKED(primary_crng.lock), > +}; > +static DECLARE_WAIT_QUEUE_HEAD(crng_init_wait); > + > +static void _initialize_crng(struct crng_state *crng) > +{ > + int i; > + unsigned long rv; > + > + memcpy(&crng->state[0], "expand 32-byte k", 16); > + for (i = 4; i < 16; i++) { > + if (!arch_get_random_seed_long(&rv) && > + !arch_get_random_long(&rv)) > + rv = random_get_entropy(); > + crng->state[i] ^= rv; > + } > + crng->init_time = jiffies - CRNG_RESEED_INTERVAL; > +} > + > +static void initialize_crng(struct crng_state *crng) > +{ > + _initialize_crng(crng); > + spin_lock_init(&crng->lock); > +} > + > +static int crng_fast_load(__u32 pool[4]) > +{ > + int i; > + __u32 *p; > + > + if (!spin_trylock(&primary_crng.lock)) > + return 0; > + if (crng_ready()) { > + spin_unlock(&primary_crng.lock); > + return 0; > + } > + p = &primary_crng.state[4]; > + if (crng_init == 1) > + p += 4; > + for (i=0; i < 4; i++) > + *p ^= pool[i]; > + if (crng_init++ >= 2) > + wake_up_interruptible(&crng_init_wait); > + pr_notice("random: crng_init %d\n", crng_init); > + spin_unlock(&primary_crng.lock); > + return 1; > +} > + > +/* Returns 1 on success */ > +static int crng_reseed(struct entropy_store *r) > +{ > + unsigned long flags; > + int ret = 0; > + int i, num, num_words; > + __u32 tmp[16]; > + > + spin_lock_irqsave(&primary_crng.lock, flags); > + num = extract_entropy(r, tmp, 32, 16, 0); > + if (num == 0) > + goto out; > + if (num < 16 || num > 32) { > + WARN_ON(1); > + pr_err("crng_reseed: num is %d?!?\n", num); > + } > + num_words = (num + 3) / 4; > + for (i = 0; i < num_words; i++) > + primary_crng.state[i+4] ^= tmp[i]; > + primary_crng.init_time = jiffies; > + if (crng_init < 3) { > + crng_init = 3; > + process_random_ready_list(); > + wake_up_interruptible(&crng_init_wait); > + pr_notice("random: crng_init 3\n"); > + } > + ret = 1; > +out: > + spin_unlock_irqrestore(&primary_crng.lock, flags); > + return ret; > +} > + > +static inline void crng_wait_ready(void) > +{ > + wait_event_interruptible(crng_init_wait, crng_ready()); > +} > + > +static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE]) > +{ > + unsigned long v, flags; > + struct crng_state *crng = &primary_crng; > + > + if (crng_init > 2 && > + time_after(jiffies, crng->init_time + CRNG_RESEED_INTERVAL)) > + crng_reseed(&input_pool); > + spin_lock_irqsave(&crng->lock, flags); > + if (arch_get_random_long(&v)) > + crng->state[14] ^= v; > + 
chacha20_block(&crng->state[0], out); > + if (crng->state[12] == 0) > + crng->state[13]++; > + spin_unlock_irqrestore(&crng->lock, flags); > +} > + > +static ssize_t extract_crng_user(void __user *buf, size_t nbytes) > +{ > + ssize_t ret = 0, i; > + __u8 tmp[CHACHA20_BLOCK_SIZE]; > + int large_request = (nbytes > 256); > + > + while (nbytes) { > + if (large_request && need_resched()) { > + if (signal_pending(current)) { > + if (ret == 0) > + ret = -ERESTARTSYS; > + break; > + } > + schedule(); > + } > + > + extract_crng(tmp); > + i = min_t(int, nbytes, CHACHA20_BLOCK_SIZE); > + if (copy_to_user(buf, tmp, i)) { > + ret = -EFAULT; > + break; > + } > + > + nbytes -= i; > + buf += i; > + ret += i; > + } > + > + /* Wipe data just written to memory */ > + memzero_explicit(tmp, sizeof(tmp)); Would it make sense to add another chacha20_block() call here at the end? Note, the one thing about the SP800-90A DRBG I really like is the enhanced backward secrecy support which is implemented by "updating" the internal state (the key / state) used for one or more random number generation rounds after one request for random numbers is satisfied. This means that even if the state becomes known or the subsequent caller manages to deduce the state of the RNG to some degree of confidence, he cannot backtrack the already generated random numbers. I see that the ChaCha20 RNG implicitly updates its state while it operates. But for the last round of the RNG, there is no more shuffling of the internal state. As one round is 64 bytes in size and many callers just want 16 or 32 bytes (as seen during testing), a lot of callers trigger only one round of the RNG. > + > + return ret; > +} > + > + > +/********************************************************************* > + * > * Entropy input management > * > *********************************************************************/ > @@ -749,12 +895,12 @@ struct timer_rand_state { > #define INIT_TIMER_RAND_STATE { INITIAL_JIFFIES, }; > > /* > - * Add device- or boot-specific data to the input and nonblocking > - * pools to help initialize them to unique values. > + * Add device- or boot-specific data to the input pool to help > + * initialize it. > * > - * None of this adds any entropy, it is meant to avoid the > - * problem of the nonblocking pool having similar initial state > - * across largely identical devices. > + * None of this adds any entropy; it is meant to avoid the problem of > + * the entropy pool having similar initial state across largely > + * identical devices. > */ > void add_device_randomness(const void *buf, unsigned int size) > { > @@ -766,11 +912,6 @@ void add_device_randomness(const void *buf, unsigned > int size) _mix_pool_bytes(&input_pool, buf, size); > _mix_pool_bytes(&input_pool, &time, sizeof(time)); > spin_unlock_irqrestore(&input_pool.lock, flags); > - > - spin_lock_irqsave(&nonblocking_pool.lock, flags); > - _mix_pool_bytes(&nonblocking_pool, buf, size); > - _mix_pool_bytes(&nonblocking_pool, &time, sizeof(time)); > - spin_unlock_irqrestore(&nonblocking_pool.lock, flags); > } > EXPORT_SYMBOL(add_device_randomness); > > @@ -801,7 +942,7 @@ static void add_timer_randomness(struct timer_rand_state > *state, unsigned num) sample.jiffies = jiffies; > sample.cycles = random_get_entropy(); > sample.num = num; > - r = nonblocking_pool.initialized ? 
&input_pool : &nonblocking_pool; > + r = &input_pool; > mix_pool_bytes(r, &sample, sizeof(sample)); > > /* > @@ -921,7 +1062,13 @@ void add_interrupt_randomness(int irq, int irq_flags) > !time_after(now, fast_pool->last + HZ)) > return; > > - r = nonblocking_pool.initialized ? &input_pool : &nonblocking_pool; > + if (!crng_ready() && crng_fast_load(fast_pool->pool)) { > + fast_pool->count = 0; > + fast_pool->last = now; > + return; > + } > + > + r = &input_pool; > if (!spin_trylock(&r->lock)) > return; > > @@ -964,9 +1111,6 @@ EXPORT_SYMBOL_GPL(add_disk_randomness); > * > *********************************************************************/ > > -static ssize_t extract_entropy(struct entropy_store *r, void *buf, > - size_t nbytes, int min, int rsvd); > - > /* > * This utility inline function is responsible for transferring entropy > * from the primary pool to the secondary extraction pool. We make > @@ -1252,15 +1396,26 @@ static ssize_t extract_entropy_user(struct > entropy_store *r, void __user *buf, */ > void get_random_bytes(void *buf, int nbytes) > { > + __u8 tmp[CHACHA20_BLOCK_SIZE]; > + > #if DEBUG_RANDOM_BOOT > 0 > - if (unlikely(nonblocking_pool.initialized == 0)) > + if (!crng_ready()) > printk(KERN_NOTICE "random: %pF get_random_bytes called " > - "with %d bits of entropy available\n", > - (void *) _RET_IP_, > - nonblocking_pool.entropy_total); > + "with crng_init = %d\n", (void *) _RET_IP_, crng_init); > #endif > trace_get_random_bytes(nbytes, _RET_IP_); > - extract_entropy(&nonblocking_pool, buf, nbytes, 0, 0); > + > + while (nbytes >= CHACHA20_BLOCK_SIZE) { > + extract_crng(buf); > + buf += CHACHA20_BLOCK_SIZE; > + nbytes -= CHACHA20_BLOCK_SIZE; > + } > + > + if (nbytes > 0) { > + extract_crng(tmp); > + memcpy(buf, tmp, nbytes); > + memzero_explicit(tmp, nbytes); > + } dto here. 
> } > EXPORT_SYMBOL(get_random_bytes); > > @@ -1278,7 +1433,7 @@ int add_random_ready_callback(struct > random_ready_callback *rdy) unsigned long flags; > int err = -EALREADY; > > - if (likely(nonblocking_pool.initialized)) > + if (crng_ready()) > return err; > > owner = rdy->owner; > @@ -1286,7 +1441,7 @@ int add_random_ready_callback(struct > random_ready_callback *rdy) return -ENOENT; > > spin_lock_irqsave(&random_ready_list_lock, flags); > - if (nonblocking_pool.initialized) > + if (crng_ready()) > goto out; > > owner = NULL; > @@ -1350,7 +1505,7 @@ void get_random_bytes_arch(void *buf, int nbytes) > } > > if (nbytes) > - extract_entropy(&nonblocking_pool, p, nbytes, 0, 0); > + get_random_bytes(p, nbytes); > } > EXPORT_SYMBOL(get_random_bytes_arch); > > @@ -1395,7 +1550,7 @@ static int rand_initialize(void) > { > init_std_data(&input_pool); > init_std_data(&blocking_pool); > - init_std_data(&nonblocking_pool); > + _initialize_crng(&primary_crng); > return 0; > } > early_initcall(rand_initialize); > @@ -1459,16 +1614,10 @@ urandom_read(struct file *file, char __user *buf, > size_t nbytes, loff_t *ppos) { > int ret; > > - if (unlikely(nonblocking_pool.initialized == 0)) > - printk_once(KERN_NOTICE "random: %s urandom read " > - "with %d bits of entropy available\n", > - current->comm, nonblocking_pool.entropy_total); > - > + crng_wait_ready(); > nbytes = min_t(size_t, nbytes, INT_MAX >> (ENTROPY_SHIFT + 3)); > - ret = extract_entropy_user(&nonblocking_pool, buf, nbytes); > - > - trace_urandom_read(8 * nbytes, ENTROPY_BITS(&nonblocking_pool), > - ENTROPY_BITS(&input_pool)); > + ret = extract_crng_user(buf, nbytes); > + trace_urandom_read(8 * nbytes, 0, ENTROPY_BITS(&input_pool)); > return ret; > } > > @@ -1514,10 +1663,7 @@ static ssize_t random_write(struct file *file, const > char __user *buffer, { > size_t ret; > > - ret = write_pool(&blocking_pool, buffer, count); > - if (ret) > - return ret; > - ret = write_pool(&nonblocking_pool, buffer, count); > + ret = write_pool(&input_pool, buffer, count); > if (ret) > return ret; > > @@ -1568,7 +1714,6 @@ static long random_ioctl(struct file *f, unsigned int > cmd, unsigned long arg) if (!capable(CAP_SYS_ADMIN)) > return -EPERM; > input_pool.entropy_count = 0; > - nonblocking_pool.entropy_count = 0; > blocking_pool.entropy_count = 0; > return 0; > default: > @@ -1610,11 +1755,10 @@ SYSCALL_DEFINE3(getrandom, char __user *, buf, > size_t, count, if (flags & GRND_RANDOM) > return _random_read(flags & GRND_NONBLOCK, buf, count); > > - if (unlikely(nonblocking_pool.initialized == 0)) { > + if (!crng_ready()) { > if (flags & GRND_NONBLOCK) > return -EAGAIN; > - wait_event_interruptible(urandom_init_wait, > - nonblocking_pool.initialized); > + crng_wait_ready(); > if (signal_pending(current)) > return -ERESTARTSYS; > } > diff --git a/include/crypto/chacha20.h b/include/crypto/chacha20.h > index 274bbae..20d20f68 100644 > --- a/include/crypto/chacha20.h > +++ b/include/crypto/chacha20.h > @@ -16,6 +16,7 @@ struct chacha20_ctx { > u32 key[8]; > }; > > +void chacha20_block(u32 *state, void *stream); > void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv); > int crypto_chacha20_setkey(struct crypto_tfm *tfm, const u8 *key, > unsigned int keysize); > diff --git a/lib/Makefile b/lib/Makefile > index 7bd6fd4..9ba27cd 100644 > --- a/lib/Makefile > +++ b/lib/Makefile > @@ -22,7 +22,7 @@ KCOV_INSTRUMENT_hweight.o := n > lib-y := ctype.o string.o vsprintf.o cmdline.o \ > rbtree.o radix-tree.o dump_stack.o timerqueue.o\ > idr.o int_sqrt.o 
extable.o \ > - sha1.o md5.o irq_regs.o argv_split.o \ > + sha1.o chacha20.o md5.o irq_regs.o argv_split.o \ > proportions.o flex_proportions.o ratelimit.o show_mem.o \ > is_single_threaded.o plist.o decompress.o kobject_uevent.o \ > earlycpio.o seq_buf.o nmi_backtrace.o > diff --git a/lib/chacha20.c b/lib/chacha20.c > new file mode 100644 > index 0000000..250ceed > --- /dev/null > +++ b/lib/chacha20.c > @@ -0,0 +1,79 @@ > +/* > + * ChaCha20 256-bit cipher algorithm, RFC7539 > + * > + * Copyright (C) 2015 Martin Willi > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; either version 2 of the License, or > + * (at your option) any later version. > + */ > + > +#include <linux/kernel.h> > +#include <linux/export.h> > +#include <linux/bitops.h> > +#include <linux/cryptohash.h> > +#include <asm/unaligned.h> > +#include <crypto/chacha20.h> > + > +static inline u32 rotl32(u32 v, u8 n) > +{ > + return (v << n) | (v >> (sizeof(v) * 8 - n)); > +} > + > +extern void chacha20_block(u32 *state, void *stream) > +{ > + u32 x[16], *out = stream; > + int i; > + > + for (i = 0; i < ARRAY_SIZE(x); i++) > + x[i] = state[i]; > + > + for (i = 0; i < 20; i += 2) { > + x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 16); > + x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 16); > + x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 16); > + x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 16); > + > + x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 12); > + x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 12); > + x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 12); > + x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 12); > + > + x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 8); > + x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 8); > + x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 8); > + x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 8); > + > + x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 7); > + x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 7); > + x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 7); > + x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 7); > + > + x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 16); > + x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 16); > + x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 16); > + x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 16); > + > + x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 12); > + x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 12); > + x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 12); > + x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 12); > + > + x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 8); > + x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 8); > + x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 8); > + x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 8); > + > + x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 7); > + x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 7); > + x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 7); > + x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 7); > + } > + > + for (i = 0; i < ARRAY_SIZE(x); i++) > + out[i] = cpu_to_le32(x[i] + state[i]); > + > + state[12]++; > +} > +EXPORT_SYMBOL(chacha20_block); Ciao Stephan -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
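To illustrate the kind of post-request state update Stephan describes above, a minimal sketch might look like the following. It reuses struct crng_state, chacha20_block(), the lock and memzero_explicit() from the patch; the helper name crng_update_state() is made up for this example and is not taken from the patch.

```c
/* Sketch only, not from the patch: fold one extra ChaCha20 block back
 * into the key words after a request has been satisfied, so that the
 * outputs already handed out cannot be recomputed even if the current
 * state leaks later.
 */
static void crng_update_state(struct crng_state *crng)
{
	__u32 tmp[CHACHA20_BLOCK_SIZE / sizeof(__u32)];
	unsigned long flags;
	int i;

	spin_lock_irqsave(&crng->lock, flags);
	chacha20_block(&crng->state[0], tmp);
	for (i = 0; i < 8; i++)
		crng->state[i + 4] ^= tmp[i];	/* refresh the key words */
	spin_unlock_irqrestore(&crng->lock, flags);
	memzero_explicit(tmp, sizeof(tmp));
}
```

Calling such a helper at the end of extract_crng_user() and get_random_bytes() would correspond to the two places flagged in the review above.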
On Tuesday, 3 May 2016, 11:36:12, Stephan Mueller wrote:

Hi Ted,

> > +
> > +static ssize_t extract_crng_user(void __user *buf, size_t nbytes)
> > +{
> > +	ssize_t ret = 0, i;
> > +	__u8 tmp[CHACHA20_BLOCK_SIZE];
> > +	int large_request = (nbytes > 256);
> > +
> > +	while (nbytes) {
> > +		if (large_request && need_resched()) {
> > +			if (signal_pending(current)) {
> > +				if (ret == 0)
> > +					ret = -ERESTARTSYS;
> > +				break;
> > +			}
> > +			schedule();
> > +		}
> > +
> > +		extract_crng(tmp);
> > +		i = min_t(int, nbytes, CHACHA20_BLOCK_SIZE);
> > +		if (copy_to_user(buf, tmp, i)) {
> > +			ret = -EFAULT;
> > +			break;
> > +		}
> > +
> > +		nbytes -= i;
> > +		buf += i;
> > +		ret += i;
> > +	}
> > +
> > +	/* Wipe data just written to memory */
> > +	memzero_explicit(tmp, sizeof(tmp));
>
> Would it make sense to add another chacha20_block() call here at the end?
> Note, the one thing about the SP800-90A DRBG I really like is the enhanced
> backward secrecy support which is implemented by "updating" the internal
> state (the key / state) used for one or more random number generation
> rounds after one request for random numbers is satisfied.
>
> This means that even if the state becomes known or the subsequent caller
> manages to deduce the state of the RNG to some degree of confidence, he
> cannot backtrack the already generated random numbers.
>
> I see that the ChaCha20 RNG implicitly updates its state while it operates.
> But for the last round of the RNG, there is no more shuffling of the
> internal state. As one round is 64 bytes in size and many callers just want
> 16 or 32 bytes (as seen during testing), a lot of callers trigger only one
> round of the RNG.

After doing some performance tests, I see that we reach a performance of north of 200 MB/s on my system (compare that to 12 MB/s for the SHA-1 version). Thus, I would assume adding another call to chacha20_block() should not hurt.

Ciao
Stephan
--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
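For reproducing rough throughput numbers like the ones above outside the kernel, a userspace harness along these lines can time chacha20_block() directly. It assumes the function from lib/chacha20.c has been built into the program (with small shims for the kernel-only types and macros), and it measures the raw block function rather than /dev/urandom itself, so the figures will not match in-kernel measurements exactly.

```c
/* Userspace micro-benchmark for the raw chacha20_block() throughput.
 * Assumes chacha20_block() from lib/chacha20.c is compiled into this
 * program, with shims for the kernel-only types and macros.
 */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

void chacha20_block(uint32_t *state, void *stream);	/* from lib/chacha20.c */

int main(void)
{
	uint32_t state[16] = { 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 };
	uint32_t out[16];
	const long blocks = 1L << 22;	/* 4M blocks = 256 MiB of keystream */
	struct timespec t0, t1;
	double secs;
	long i;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (i = 0; i < blocks; i++)
		chacha20_block(state, out);
	clock_gettime(CLOCK_MONOTONIC, &t1);

	secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
	printf("%.1f MB/s\n", blocks * 64.0 / (1024 * 1024) / secs);
	return 0;
}
```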
> +static inline u32 rotl32(u32 v, u8 n)
> +{
> +	return (v << n) | (v >> (sizeof(v) * 8 - n));
> +}

That's undefined behavior when n=0.

I think the portable way to do a rotate that avoids UB is the following. GCC, Clang and ICC recognize the pattern, and emit a rotate instruction.

    static const unsigned int MASK=31;
    return (v<<n)|(v>>(-n&MASK));

You should also avoid the following because it's not constant time due to the branch:

    return n == 0 ? v : (v << n) | (v >> (sizeof(v) * 8 - n));

Jeff
--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
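A minimal sketch of the two rotate forms Jeff compares, for reference (illustration only, not from the patch):

```c
#include <stdint.h>

/* Naive rotate: undefined behaviour for n == 0, because the right
 * shift becomes v >> 32.
 */
static inline uint32_t rotl32_naive(uint32_t v, uint8_t n)
{
	return (v << n) | (v >> (32 - n));
}

/* Masked form: well defined for n == 0, and GCC, Clang and ICC still
 * recognize the pattern and emit a single rotate instruction.
 */
static inline uint32_t rotl32_masked(uint32_t v, uint8_t n)
{
	return (v << n) | (v >> (-n & 31));
}
```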
>> +	chacha20_block(&crng->state[0], out);
>> +	if (crng->state[12] == 0)
>> +		crng->state[13]++;
>
> state[12]++? Or why do you increment the nonce?

In Bernstein's Salsa and ChaCha, the counter is 64-bit. It appears ChaCha-TLS uses a 32-bit counter, and the other 32 bits are given to the nonce.

Maybe the first question to ask is, what ChaCha is the kernel providing? If it's ChaCha-TLS, then the carry does not make a lot of sense.

If the generator is limiting the amount of material under a given set of security parameters (key and nonce), then the generator will likely re-key itself long before the 256-GB induced wrap. In this case, it does not matter which ChaCha the kernel is providing and the carry is superfluous.

Jeff
--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
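For reference, the two layouts Jeff is contrasting differ only in how state words 12-15 are split between block counter and nonce; the zeros below are placeholders for the key and nonce values:

```c
#include <stdint.h>

/* Original (Bernstein) ChaCha: 64-bit block counter, 64-bit nonce. */
static const uint32_t state_djb[16] = {
	0x61707865, 0x3320646e, 0x79622d32, 0x6b206574,	/* constants   */
	0, 0, 0, 0, 0, 0, 0, 0,				/* 256-bit key */
	0, 0,						/* 64-bit block counter */
	0, 0,						/* 64-bit nonce */
};

/* RFC 7539 ("ChaCha-TLS") variant: 32-bit block counter, 96-bit nonce,
 * so the keystream wraps after 2^32 blocks * 64 bytes = 256 GB per
 * (key, nonce) pair unless the generator re-keys first.
 */
static const uint32_t state_ietf[16] = {
	0x61707865, 0x3320646e, 0x79622d32, 0x6b206574,	/* constants   */
	0, 0, 0, 0, 0, 0, 0, 0,				/* 256-bit key */
	0,						/* 32-bit block counter */
	0, 0, 0,					/* 96-bit nonce */
};
```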
On Tue, May 03, 2016 at 10:50:25AM +0200, Stephan Mueller wrote: > > +/* > > + * crng_init = 0 --> Uninitialized > > + * 2 --> Initialized > > + * 3 --> Initialized from input_pool > > + */ > > +static int crng_init = 0; > > shouldn't that be an atomic_t ? The crng_init variable only gets incremented (it progresses from 0->1->2->3) and it's protected by the primary_crng->lock spinlock. There are a few places where we read it without first taking the lock, but that's where we depend on it getting incremented and where if we race with another CPU which has just bumped the crng_init status, it's not critical. (Or we can take the lock and then recheck the crng_init if we really need to be sure.) > > +static void _initialize_crng(struct crng_state *crng) > > +{ > > + int i; > > + unsigned long rv; > > Why do you use unsigned long here? I thought the state[i] is unsigned int. Because it gets filled in via arch_get_random_seed_long(&rv) and arch_get_random_log(&rv). If that means we get 64 bits and we only use 32 bits, that's OK.... > Would it make sense to add the ChaCha20 self test vectors from RFC7539 here to > test that the ChaCha20 works? We're not doing that for any of the other ciphers and hashes. We didn't do that for SHA1, and the chacha20 code where I took this from didn't check for this as well. What threat are you most worried about. We don't care about chacha20 being exactly chacha20, so long as it's cryptographically strong. In fact I think I removed a potential host order byteswap in the set key operation specifically because we don't care and interop. If this is required for FIPS mode, we can add that later. I was primarily trying to keep the patch small to be easier for people to look at it, so I've deliberately left off bells and whistles that aren't strictly needed to show that the new approach is sound. > > + if (crng_init++ >= 2) > > + wake_up_interruptible(&crng_init_wait); > > Don't we have a race here with the crng_init < 3 check in crng_reseed > considering multi-core systems? No, because we are holding the primary_crng->lock spinlock. I'll add a comment explaining the locking protections which is intended for crng_init where we declare it. > > + if (num < 16 || num > 32) { > > + WARN_ON(1); > > + pr_err("crng_reseed: num is %d?!?\n", num); > > + } > > + num_words = (num + 3) / 4; > > + for (i = 0; i < num_words; i++) > > + primary_crng.state[i+4] ^= tmp[i]; > > + primary_crng.init_time = jiffies; > > + if (crng_init < 3) { > > Shouldn't that one be if (crng_init < 3 && num >= 16) ? I'll just change the above WRN_ON test to be: BUG_ON(num < 16 || num > 32); This really can't happen, and I had it as a WARN_ON with a printk for debugging purpose in case I was wrong about how the code works. > > + crng_init = 3; > > + process_random_ready_list(); > > + wake_up_interruptible(&crng_init_wait); > > + pr_notice("random: crng_init 3\n"); > > Would it make sense to be more descriptive here to allow readers of dmesg to > understand the output? Yes, what we're currently printing during the initialization: random: crng_init 1 random: crng_init 2 random: crng_init 3 was again mostly for debugging purposes. What I'm thinking about doing is changing crng_init 2 and 3 messages to be: random: crng fast init done random: crng conservative init done > > + } > > + ret = 1; > > +out: > > + spin_unlock_irqrestore(&primary_crng.lock, flags); > > memzero_explicit of tmp? Good point, I've added a call to memzero_explicit(). 
> > +static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE]) > > +{ > > + unsigned long v, flags; > > + struct crng_state *crng = &primary_crng; > > + > > + if (crng_init > 2 && > > + time_after(jiffies, crng->init_time + CRNG_RESEED_INTERVAL)) > > + crng_reseed(&input_pool); > > + spin_lock_irqsave(&crng->lock, flags); > > + if (arch_get_random_long(&v)) > > + crng->state[14] ^= v; > > + chacha20_block(&crng->state[0], out); > > + if (crng->state[12] == 0) > > + crng->state[13]++; > What is the purpose to only cover the 2nd 32 bit value of the nonce with > arch_get_random? > > state[12]++? Or why do you increment the nonce? In Chacha20, its output is a funcrtion of the state array, where state[0..3] is a constant specified by the Chacha20 definition, state[4..11] is the Key, and state[12..15] is the IV. The chacha20_block() function increments state[12] each time it is called, so state[12] is being used as the block counter. The increment of state[13] is used to make state[13] to be the upper 32-bits of the block counter. We *should* be reseeding more often than 2**32 calls to chacha20_block(), and the increment is just to be safe in case something goes wronng and we're not reseeding. We're using crng[14] to be contain the output of RDRAND, so this is where we mix in the contribution from a CPU-level hwrng. Previously we called RDRAND multiple times and XOR'ed the results into the output. This is a bit faster and is more comforting for paranoiacs who are concerned that Intel might have down something shifty enough to be able to read from memory and change the output of RDRAND to force a particular output from the RNG. Ware using state[15] to contain the NUMA node id. This was because originally the used used the same key bytes (state[4..11]) between different NUMA state arrays, so state[15] was used to guarantee that state arrays would be different between different NUMA node state arrays. Now that we are deriving the key from a primary_crng, we could use state[15] for something else, including as a second destination from RDRAND. > Now I have to wear my (ugly) FIPS heat: we need that code from the current > implementation here: > > if (fips_enabled) { > spin_lock_irqsave(&r->lock, flags); > if (!memcmp(tmp, r->last_data, EXTRACT_SIZE)) > panic("Hardware RNG duplicated output!\n"); > memcpy(r->last_data, tmp, EXTRACT_SIZE); > spin_unlock_irqrestore(&r->lock, flags); > } > I'll add FIPS support as a separate patch. I personally consider FIPS support to be a distraction, and it's only useful for people who need to sell to governments (mostly US governments). > > - if (unlikely(nonblocking_pool.initialized == 0)) > > - printk_once(KERN_NOTICE "random: %s urandom read " > > - "with %d bits of entropy available\n", > > - current->comm, nonblocking_pool.entropy_total); > > - > > + crng_wait_ready(); > > Just for clarification: are you now blocking /dev/urandom until the CRNG is > filled? That would be a big win. Until the the fast init state, yes. In practice we are blocking until 128 interrupts have occurred, which during boot is hapens quickly enough that even on a simple KVM system, this happens before userspace starts up. There *is* a risk here, though. Imagine a embedded platform with very few interrupt-driven devices so device probing isn't enough to make the CRNG ready when the initial ramdisk starts up, and the init program tries to read from /dev/urandom --- which then blocks, potentially indefinitely. 
If that happens, then we will have broken userspace, and we may need to revert this part of the change. But I'm willing to take the risk, in hopes that such super-simplisitic devices don't exist in real life, and if they do, the IOT devices will probably be blithely ignoring cryptographic concerns so much that they aren't using /dev/urandom anyway. :-) >Would it make sense to add another chacha20_block() call here at the end? >Note, the one thing about the SP800-90A DRBG I really like is the enhanced >backward secrecy support which is implemented by "updating" the internal state >(the key / state) used for one or more random number generation rounds after >one request for random numbers is satisfied. > >This means that even if the state becomes known or the subsequent caller >manages to deduce the state of the RNG to some degree of confidence, he cannot >backtrack the already generated random numbers. That's a good idea; being able to prevent back-tracking attacks is a good thing. I'll add this in the next version. Cheers, - Ted -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
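Pulling together Ted's description of the state words, the layout of primary_crng.state[] can be summarized as follows; the struct itself is from the patch, only the annotations restate the explanation above:

```c
/* struct crng_state as defined in the patch, annotated with the word
 * usage described in the reply above.
 */
struct crng_state {
	__u32		state[16];	/* [0..3]  "expand 32-byte k" constants
					 * [4..11] 256-bit key, XORed with
					 *         input-pool / arch-RNG seed
					 *         material on (re)seed
					 * [12]    block counter, bumped by
					 *         chacha20_block()
					 * [13]    upper half of the counter,
					 *         carried in extract_crng()
					 * [14]    XORed with RDRAND output on
					 *         each extract_crng() call
					 * [15]    NUMA node id in the earlier
					 *         design; currently spare
					 */
	unsigned long	init_time;
	spinlock_t	lock;
};
```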
On Wed, May 04, 2016 at 10:40:20AM -0400, Jeffrey Walton wrote:
> > +static inline u32 rotl32(u32 v, u8 n)
> > +{
> > +	return (v << n) | (v >> (sizeof(v) * 8 - n));
> > +}
>
> That's undefined behavior when n=0.

Sure, but it's never called with n = 0; I've double checked and the compiler seems to do the right thing with the above pattern as well.

Hmm, it looks like there is a "standard" version of rotate left and right defined in include/linux/bitops.h. So I suspect it would make sense to use rol32 as defined in bitops.h --- and this is probably something that we should do for the rest of crypto/*.c, where people seem to be defining their own version of something like rotl32 (I copied the contents of crypto/chacha20_generic.c to lib/chacha20.c, so this pattern of defining one's own version of rol32 isn't new).

> I think the portable way to do a rotate that avoids UB is the
> following. GCC, Clang and ICC recognize the pattern, and emit a rotate
> instruction.
>
>     static const unsigned int MASK=31;
>     return (v<<n)|(v>>(-n&MASK));
>
> You should also avoid the following because it's not constant time due
> to the branch:
>
>     return n == 0 ? v : (v << n) | (v >> (sizeof(v) * 8 - n));

Where is this coming from? I don't see this construct in the patch.

- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
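As a sketch of the direction Ted mentions, the local rotl32() in lib/chacha20.c could be dropped in favour of rol32() from include/linux/bitops.h, for example by factoring the rounds into a quarter-round helper. This is illustrative only; the behaviour is intended to be identical to the open-coded rounds in the patch.

```c
#include <linux/bitops.h>	/* rol32() */
#include <crypto/chacha20.h>

/* Same operation as the open-coded round groups in lib/chacha20.c,
 * expressed with the generic rol32() helper instead of a local rotl32().
 */
static inline void chacha20_quarterround(u32 *x, int a, int b, int c, int d)
{
	x[a] += x[b]; x[d] = rol32(x[d] ^ x[a], 16);
	x[c] += x[d]; x[b] = rol32(x[b] ^ x[c], 12);
	x[a] += x[b]; x[d] = rol32(x[d] ^ x[a], 8);
	x[c] += x[d]; x[b] = rol32(x[b] ^ x[c], 7);
}

/* The loop body in chacha20_block() would then read:
 *
 *	chacha20_quarterround(x, 0, 4,  8, 12);
 *	chacha20_quarterround(x, 1, 5,  9, 13);
 *	chacha20_quarterround(x, 2, 6, 10, 14);
 *	chacha20_quarterround(x, 3, 7, 11, 15);
 *	chacha20_quarterround(x, 0, 5, 10, 15);
 *	chacha20_quarterround(x, 1, 6, 11, 12);
 *	chacha20_quarterround(x, 2, 7,  8, 13);
 *	chacha20_quarterround(x, 3, 4,  9, 14);
 */
```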
On May 4, 2016 10:30:41 AM PDT, tytso@mit.edu wrote: >On Tue, May 03, 2016 at 10:50:25AM +0200, Stephan Mueller wrote: >> > +/* >> > + * crng_init = 0 --> Uninitialized >> > + * 2 --> Initialized >> > + * 3 --> Initialized from input_pool >> > + */ >> > +static int crng_init = 0; >> >> shouldn't that be an atomic_t ? > >The crng_init variable only gets incremented (it progresses from >0->1->2->3) and it's protected by the primary_crng->lock spinlock. >There are a few places where we read it without first taking the lock, >but that's where we depend on it getting incremented and where if we >race with another CPU which has just bumped the crng_init status, it's >not critical. (Or we can take the lock and then recheck the crng_init >if we really need to be sure.) > >> > +static void _initialize_crng(struct crng_state *crng) >> > +{ >> > + int i; >> > + unsigned long rv; >> >> Why do you use unsigned long here? I thought the state[i] is unsigned >int. > >Because it gets filled in via arch_get_random_seed_long(&rv) and >arch_get_random_log(&rv). If that means we get 64 bits and we only >use 32 bits, that's OK.... > >> Would it make sense to add the ChaCha20 self test vectors from >RFC7539 here to >> test that the ChaCha20 works? > >We're not doing that for any of the other ciphers and hashes. We >didn't do that for SHA1, and the chacha20 code where I took this from >didn't check for this as well. What threat are you most worried >about. We don't care about chacha20 being exactly chacha20, so long >as it's cryptographically strong. In fact I think I removed a >potential host order byteswap in the set key operation specifically >because we don't care and interop. > >If this is required for FIPS mode, we can add that later. I was >primarily trying to keep the patch small to be easier for people to >look at it, so I've deliberately left off bells and whistles that >aren't strictly needed to show that the new approach is sound. > >> > + if (crng_init++ >= 2) >> > + wake_up_interruptible(&crng_init_wait); >> >> Don't we have a race here with the crng_init < 3 check in crng_reseed > >> considering multi-core systems? > >No, because we are holding the primary_crng->lock spinlock. I'll add >a comment explaining the locking protections which is intended for >crng_init where we declare it. > > >> > + if (num < 16 || num > 32) { >> > + WARN_ON(1); >> > + pr_err("crng_reseed: num is %d?!?\n", num); >> > + } >> > + num_words = (num + 3) / 4; >> > + for (i = 0; i < num_words; i++) >> > + primary_crng.state[i+4] ^= tmp[i]; >> > + primary_crng.init_time = jiffies; >> > + if (crng_init < 3) { >> >> Shouldn't that one be if (crng_init < 3 && num >= 16) ? > >I'll just change the above WRN_ON test to be: > > BUG_ON(num < 16 || num > 32); > >This really can't happen, and I had it as a WARN_ON with a printk for >debugging purpose in case I was wrong about how the code works. > >> > + crng_init = 3; >> > + process_random_ready_list(); >> > + wake_up_interruptible(&crng_init_wait); >> > + pr_notice("random: crng_init 3\n"); >> >> Would it make sense to be more descriptive here to allow readers of >dmesg to >> understand the output? > >Yes, what we're currently printing during the initialization: > >random: crng_init 1 >random: crng_init 2 >random: crng_init 3 > >was again mostly for debugging purposes. 
What I'm thinking about >doing is changing crng_init 2 and 3 messages to be: > >random: crng fast init done >random: crng conservative init done > >> > + } >> > + ret = 1; >> > +out: >> > + spin_unlock_irqrestore(&primary_crng.lock, flags); >> >> memzero_explicit of tmp? > >Good point, I've added a call to memzero_explicit(). > >> > +static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE]) >> > +{ >> > + unsigned long v, flags; >> > + struct crng_state *crng = &primary_crng; >> > + >> > + if (crng_init > 2 && >> > + time_after(jiffies, crng->init_time + CRNG_RESEED_INTERVAL)) >> > + crng_reseed(&input_pool); >> > + spin_lock_irqsave(&crng->lock, flags); >> > + if (arch_get_random_long(&v)) >> > + crng->state[14] ^= v; >> > + chacha20_block(&crng->state[0], out); >> > + if (crng->state[12] == 0) >> > + crng->state[13]++; > >> What is the purpose to only cover the 2nd 32 bit value of the nonce >with >> arch_get_random? >> >> state[12]++? Or why do you increment the nonce? > >In Chacha20, its output is a funcrtion of the state array, where >state[0..3] is a constant specified by the Chacha20 definition, >state[4..11] is the Key, and state[12..15] is the IV. The >chacha20_block() function increments state[12] each time it is called, >so state[12] is being used as the block counter. The increment of >state[13] is used to make state[13] to be the upper 32-bits of the >block counter. We *should* be reseeding more often than 2**32 calls >to chacha20_block(), and the increment is just to be safe in case >something goes wronng and we're not reseeding. > >We're using crng[14] to be contain the output of RDRAND, so this is >where we mix in the contribution from a CPU-level hwrng. Previously >we called RDRAND multiple times and XOR'ed the results into the >output. This is a bit faster and is more comforting for paranoiacs >who are concerned that Intel might have down something shifty enough >to be able to read from memory and change the output of RDRAND to >force a particular output from the RNG. > >Ware using state[15] to contain the NUMA node id. This was because >originally the used used the same key bytes (state[4..11]) between >different NUMA state arrays, so state[15] was used to guarantee that >state arrays would be different between different NUMA node state >arrays. Now that we are deriving the key from a primary_crng, we >could use state[15] for something else, including as a second >destination from RDRAND. > >> Now I have to wear my (ugly) FIPS heat: we need that code from the >current >> implementation here: >> >> if (fips_enabled) { >> spin_lock_irqsave(&r->lock, flags); >> if (!memcmp(tmp, r->last_data, EXTRACT_SIZE)) >> panic("Hardware RNG duplicated >output!\n"); >> memcpy(r->last_data, tmp, EXTRACT_SIZE); >> spin_unlock_irqrestore(&r->lock, flags); >> } >> > >I'll add FIPS support as a separate patch. I personally consider FIPS >support to be a distraction, and it's only useful for people who need >to sell to governments (mostly US governments). > >> > - if (unlikely(nonblocking_pool.initialized == 0)) >> > - printk_once(KERN_NOTICE "random: %s urandom read " >> > - "with %d bits of entropy available\n", >> > - current->comm, nonblocking_pool.entropy_total); >> > - >> > + crng_wait_ready(); >> >> Just for clarification: are you now blocking /dev/urandom until the >CRNG is >> filled? That would be a big win. > >Until the the fast init state, yes. 
In practice we are blocking until >128 interrupts have occurred, which during boot is hapens quickly >enough that even on a simple KVM system, this happens before userspace >starts up. There *is* a risk here, though. Imagine a embedded >platform with very few interrupt-driven devices so device probing >isn't enough to make the CRNG ready when the initial ramdisk starts >up, and the init program tries to read from /dev/urandom --- which >then blocks, potentially indefinitely. > >If that happens, then we will have broken userspace, and we may need >to revert this part of the change. But I'm willing to take the risk, >in hopes that such super-simplisitic devices don't exist in real life, >and if they do, the IOT devices will probably be blithely ignoring >cryptographic concerns so much that they aren't using /dev/urandom >anyway. :-) > >>Would it make sense to add another chacha20_block() call here at the >end? >>Note, the one thing about the SP800-90A DRBG I really like is the >enhanced >>backward secrecy support which is implemented by "updating" the >internal state >>(the key / state) used for one or more random number generation rounds >after >>one request for random numbers is satisfied. >> >>This means that even if the state becomes known or the subsequent >caller >>manages to deduce the state of the RNG to some degree of confidence, >he cannot >>backtrack the already generated random numbers. > >That's a good idea; being able to prevent back-tracking attacks is >a good thing. I'll add this in the next version. > >Cheers, > > - Ted Why not use arch_get_random*_int()
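Presumably this refers to the 32-bit arch hook.  A hedged sketch of what that variant of the extract_crng() mixing step might look like, assuming arch_get_random_int() is available with its usual bool-returning, unsigned-int-pointer signature; the wrapper name is hypothetical.

/* Mix a full 32-bit arch RNG word directly into the 32-bit state
 * word, rather than XOR'ing in the low half of an unsigned long. */
static void crng_mix_arch_int(struct crng_state *crng)
{
	unsigned int v;

	if (arch_get_random_int(&v))
		crng->state[14] ^= v;
}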
On Wed, May 4, 2016 at 1:49 PM, <tytso@mit.edu> wrote:
> On Wed, May 04, 2016 at 10:40:20AM -0400, Jeffrey Walton wrote:
>> > +static inline u32 rotl32(u32 v, u8 n)
>> > +{
>> > +	return (v << n) | (v >> (sizeof(v) * 8 - n));
>> > +}
>>
>> That's undefined behavior when n=0.
>
> Sure, but it's never called with n = 0; I've double checked and the
> compiler seems to do the right thing with the above pattern as well.
> Hmm, it looks like there is a "standard" version rotate left and right
> defined in include/linux/bitops.h.  So I suspect it would make sense
> to use rol32 as defined in bitops.h --- and this is probably something

bitops.h could work in this case, but it's not an ideal solution.  GCC does not optimize the code below as expected under all use cases, because GCC does not recognize it as a rotate (see http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57157):

    return (v << n) | (v >> (sizeof(v) * 8 - n));

And outside of special cases like Salsa, ChaCha and BLAKE2, the code provided in bitops.h suffers UB on arbitrary data.  So I think care needs to be taken when selecting functions from bitops.h.

> that we should do for the rest of crypto/*.c, where people seem to be
> defininig their own version of something like rotl32 (I copied the
> contents of crypto/chacha20_generic.c to lib/chacha20, so this pattern
> of defining one's own version of rol32 isn't new).

Yeah, I kind of thought there was some duplication going on.

But I think bitops.h should be fixed.  Many folks don't realize the lurking UB, and many folks don't realize it's not always optimized well.

>> I think the portable way to do a rotate that avoids UB is the
>> following. GCC, Clang and ICC recognize the pattern, and emit a rotate
>> instruction.
>>
>> static const unsigned int MASK=31;
>> return (v<<n)|(v>>(-n&MASK));
>>
>> You should also avoid the following because its not constant time due
>> to the branch:
>>
>> return n == 0 ? v : (v << n) | (v >> (sizeof(v) * 8 - n));
>>
>
> Where is this coming from?  I don't see this construct in the patch.

My bad... It was a general observation.  I've seen folks try to correct the UB by turning to something like that.

Jeff
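A standalone (userspace, not kernel) demonstration of the mask pattern described above, for readers who want to try it; the example values and file are illustrative only.

#include <stdint.h>
#include <stdio.h>

/* The right-shift count is computed as (-n & 31), which is 0 when
 * n == 0, so for 0 <= n <= 31 the expression never shifts a 32-bit
 * value by the full word width (undefined behaviour in C).  GCC,
 * Clang and ICC recognize the pattern and emit a rotate instruction. */
static inline uint32_t rotl32_masked(uint32_t v, unsigned int n)
{
	const unsigned int mask = 31;	/* word width - 1 */

	return (v << n) | (v >> (-n & mask));
}

int main(void)
{
	printf("%08x\n", rotl32_masked(0x80000001u, 0));  /* 80000001 */
	printf("%08x\n", rotl32_masked(0x80000001u, 1));  /* 00000003 */
	printf("%08x\n", rotl32_masked(0x12345678u, 8));  /* 34567812 */
	return 0;
}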
On May 4, 2016 11:22:25 AM PDT, Jeffrey Walton <noloader@gmail.com> wrote: >On Wed, May 4, 2016 at 1:49 PM, <tytso@mit.edu> wrote: >> On Wed, May 04, 2016 at 10:40:20AM -0400, Jeffrey Walton wrote: >>> > +static inline u32 rotl32(u32 v, u8 n) >>> > +{ >>> > + return (v << n) | (v >> (sizeof(v) * 8 - n)); >>> > +} >>> >>> That's undefined behavior when n=0. >> >> Sure, but it's never called with n = 0; I've double checked and the >> compiler seems to do the right thing with the above pattern as well. > >> Hmm, it looks like there is a "standard" version rotate left and >right >> defined in include/linux/bitops.h. So I suspect it would make sense >> to use rol32 as defined in bitops.h --- and this is probably >something > >bitops.h could work in this case, but its not an ideal solution. GCC >does not optimize the code below as expected under all use cases >because GCC does not recognize it as a rotate (see >http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57157): > > return (v << n) | (v >> (sizeof(v) * 8 - n)); > >And outside of special cases like Salsa, ChaCha and BLAKE2, the code >provided in bitops.h suffers UB on arbitrary data. So I think care >needs to be taken when selecting functions from bitops.h. > >> that we should do for the rest of crypto/*.c, where people seem to be >> defininig their own version of something like rotl32 (I copied the >> contents of crypto/chacha20_generic.c to lib/chacha20, so this >pattern >> of defining one's own version of rol32 isn't new). > >Yeah, I kind of thought there was some duplication going on. > >But I think bitops.h should be fixed. Many folks don't realize the >lurking UB, and many folks don't realize its not always optimized >well. > >>> I think the portable way to do a rotate that avoids UB is the >>> following. GCC, Clang and ICC recognize the pattern, and emit a >rotate >>> instruction. >>> >>> static const unsigned int MASK=31; >>> return (v<<n)|(v>>(-n&MASK)); >>> >>> You should also avoid the following because its not constant time >due >>> to the branch: >>> >>> return n == 0 ? v : (v << n) | (v >> (sizeof(v) * 8 - n)); >>> >> >> Where is this coming from? I don't see this construct in the patch. > >My bad... It was a general observation. I've seen folks try to correct >the UB by turning to something like that. > >Jeff We don't care about UB, we care about gcc, and to a lesser extent LLVM and ICC. If bitops.h doesn't do the right thing, we need to fix bitops.h.
On Wed, May 04, 2016 at 11:29:57AM -0700, H. Peter Anvin wrote:
>
> We don't care about UB, we care about gcc, and to a lesser extent
> LLVM and ICC. If bitops.h doesn't do the right thing, we need to
> fix bitops.h.

I'm going to suggest that we treat the ro[rl]{32,64}() question as separable from the /dev/random case.  I've sanity checked gcc 5.3.1, and it does the right thing given the small number of constant arguments given to rotl32() in lib/chacha20.c, and it doesn't hit the UB case which Jeffrey was concerned about.  This is also code that was previously in crypto/chacha20_generic.c, and so if there is a bug with some obscure compiler not properly compiling it down to a rotate instruction, (a) no one is paying me to make sure the kernel code compiles well on ICC, and (b) it's not scalable to have each kernel developer try to deal with the vagaries of compilers.

So from my perspective, the only interesting options are (a) using what had been used before with crypto/chacha20_generic.c, or (b) reusing what is in bitops.h and letting it be someone else's problem if some obscure compiler isn't happy with what is in bitops.h.

If we are all agreed that what is in bitops.h is considered valid, then we can start converting people over to using the version defined in bitops.h, and if there is some compiler issue we need to work around, at least we only need to put the workaround in a single header file.

Cheers,

					- Ted
On 05/04/2016 12:07 PM, tytso@thunk.org wrote:
>
> If we are all agreed that what is in bitops.h is considered valid,
> then we can start converting people over to using the version defined
> in bitops.h, and if there is some compiler issue we need to work
> around, at least we only need to put the workaround in a single header
> file.
>

Yes, please.

	-hpa
diff --git a/crypto/chacha20_generic.c b/crypto/chacha20_generic.c index da9c899..1cab831 100644 --- a/crypto/chacha20_generic.c +++ b/crypto/chacha20_generic.c @@ -15,72 +15,11 @@ #include <linux/module.h> #include <crypto/chacha20.h> -static inline u32 rotl32(u32 v, u8 n) -{ - return (v << n) | (v >> (sizeof(v) * 8 - n)); -} - static inline u32 le32_to_cpuvp(const void *p) { return le32_to_cpup(p); } -static void chacha20_block(u32 *state, void *stream) -{ - u32 x[16], *out = stream; - int i; - - for (i = 0; i < ARRAY_SIZE(x); i++) - x[i] = state[i]; - - for (i = 0; i < 20; i += 2) { - x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 16); - x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 16); - x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 16); - x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 16); - - x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 12); - x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 12); - x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 12); - x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 12); - - x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 8); - x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 8); - x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 8); - x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 8); - - x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 7); - x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 7); - x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 7); - x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 7); - - x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 16); - x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 16); - x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 16); - x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 16); - - x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 12); - x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 12); - x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 12); - x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 12); - - x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 8); - x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 8); - x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 8); - x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 8); - - x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 7); - x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 7); - x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 7); - x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 7); - } - - for (i = 0; i < ARRAY_SIZE(x); i++) - out[i] = cpu_to_le32(x[i] + state[i]); - - state[12]++; -} - static void chacha20_docrypt(u32 *state, u8 *dst, const u8 *src, unsigned int bytes) { diff --git a/drivers/char/random.c b/drivers/char/random.c index b583e53..95f4451 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -260,6 +260,7 @@ #include <linux/irq.h> #include <linux/syscalls.h> #include <linux/completion.h> +#include <crypto/chacha20.h> #include <asm/processor.h> #include <asm/uaccess.h> @@ -412,6 +413,15 @@ static struct fasync_struct *fasync; static DEFINE_SPINLOCK(random_ready_list_lock); static LIST_HEAD(random_ready_list); +/* + * crng_init = 0 --> Uninitialized + * 2 --> Initialized + * 3 --> Initialized from input_pool + */ +static int crng_init = 0; +#define crng_ready() (likely(crng_init >= 2)) +static void process_random_ready_list(void); + /********************************************************************** * * OS independent entropy store. 
Here are the functions which handle @@ -441,10 +451,13 @@ struct entropy_store { __u8 last_data[EXTRACT_SIZE]; }; +static ssize_t extract_entropy(struct entropy_store *r, void *buf, + size_t nbytes, int min, int rsvd); + +static int crng_reseed(struct entropy_store *r); static void push_to_pool(struct work_struct *work); static __u32 input_pool_data[INPUT_POOL_WORDS]; static __u32 blocking_pool_data[OUTPUT_POOL_WORDS]; -static __u32 nonblocking_pool_data[OUTPUT_POOL_WORDS]; static struct entropy_store input_pool = { .poolinfo = &poolinfo_table[0], @@ -465,16 +478,6 @@ static struct entropy_store blocking_pool = { push_to_pool), }; -static struct entropy_store nonblocking_pool = { - .poolinfo = &poolinfo_table[1], - .name = "nonblocking", - .pull = &input_pool, - .lock = __SPIN_LOCK_UNLOCKED(nonblocking_pool.lock), - .pool = nonblocking_pool_data, - .push_work = __WORK_INITIALIZER(nonblocking_pool.push_work, - push_to_pool), -}; - static __u32 const twist_table[8] = { 0x00000000, 0x3b6e20c8, 0x76dc4190, 0x4db26158, 0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278 }; @@ -677,12 +680,6 @@ retry: if (!r->initialized && r->entropy_total > 128) { r->initialized = 1; r->entropy_total = 0; - if (r == &nonblocking_pool) { - prandom_reseed_late(); - process_random_ready_list(); - wake_up_all(&urandom_init_wait); - pr_notice("random: %s pool is initialized\n", r->name); - } } trace_credit_entropy_bits(r->name, nbits, @@ -692,30 +689,27 @@ retry: if (r == &input_pool) { int entropy_bits = entropy_count >> ENTROPY_SHIFT; + if (crng_init < 3 && entropy_bits >= 128) { + (void) crng_reseed(r); + entropy_bits = r->entropy_count >> ENTROPY_SHIFT; + } + /* should we wake readers? */ if (entropy_bits >= random_read_wakeup_bits) { wake_up_interruptible(&random_read_wait); kill_fasync(&fasync, SIGIO, POLL_IN); } /* If the input pool is getting full, send some - * entropy to the two output pools, flipping back and - * forth between them, until the output pools are 75% - * full. + * entropy to the blocking pool until it is 75% full. 
*/ if (entropy_bits > random_write_wakeup_bits && r->initialized && r->entropy_total >= 2*random_read_wakeup_bits) { - static struct entropy_store *last = &blocking_pool; struct entropy_store *other = &blocking_pool; - if (last == &blocking_pool) - other = &nonblocking_pool; if (other->entropy_count <= - 3 * other->poolinfo->poolfracbits / 4) - last = other; - if (last->entropy_count <= - 3 * last->poolinfo->poolfracbits / 4) { - schedule_work(&last->push_work); + 3 * other->poolinfo->poolfracbits / 4) { + schedule_work(&other->push_work); r->entropy_total = 0; } } @@ -735,6 +729,158 @@ static void credit_entropy_bits_safe(struct entropy_store *r, int nbits) /********************************************************************* * + * CRNG using CHACHA20 + * + *********************************************************************/ + +#define CRNG_RESEED_INTERVAL (300*HZ) + +struct crng_state { + __u32 state[16]; + unsigned long init_time; + spinlock_t lock; +}; + +struct crng_state primary_crng = { + .lock = __SPIN_LOCK_UNLOCKED(primary_crng.lock), +}; +static DECLARE_WAIT_QUEUE_HEAD(crng_init_wait); + +static void _initialize_crng(struct crng_state *crng) +{ + int i; + unsigned long rv; + + memcpy(&crng->state[0], "expand 32-byte k", 16); + for (i = 4; i < 16; i++) { + if (!arch_get_random_seed_long(&rv) && + !arch_get_random_long(&rv)) + rv = random_get_entropy(); + crng->state[i] ^= rv; + } + crng->init_time = jiffies - CRNG_RESEED_INTERVAL; +} + +static void initialize_crng(struct crng_state *crng) +{ + _initialize_crng(crng); + spin_lock_init(&crng->lock); +} + +static int crng_fast_load(__u32 pool[4]) +{ + int i; + __u32 *p; + + if (!spin_trylock(&primary_crng.lock)) + return 0; + if (crng_ready()) { + spin_unlock(&primary_crng.lock); + return 0; + } + p = &primary_crng.state[4]; + if (crng_init == 1) + p += 4; + for (i=0; i < 4; i++) + *p ^= pool[i]; + if (crng_init++ >= 2) + wake_up_interruptible(&crng_init_wait); + pr_notice("random: crng_init %d\n", crng_init); + spin_unlock(&primary_crng.lock); + return 1; +} + +/* Returns 1 on success */ +static int crng_reseed(struct entropy_store *r) +{ + unsigned long flags; + int ret = 0; + int i, num, num_words; + __u32 tmp[16]; + + spin_lock_irqsave(&primary_crng.lock, flags); + num = extract_entropy(r, tmp, 32, 16, 0); + if (num == 0) + goto out; + if (num < 16 || num > 32) { + WARN_ON(1); + pr_err("crng_reseed: num is %d?!?\n", num); + } + num_words = (num + 3) / 4; + for (i = 0; i < num_words; i++) + primary_crng.state[i+4] ^= tmp[i]; + primary_crng.init_time = jiffies; + if (crng_init < 3) { + crng_init = 3; + process_random_ready_list(); + wake_up_interruptible(&crng_init_wait); + pr_notice("random: crng_init 3\n"); + } + ret = 1; +out: + spin_unlock_irqrestore(&primary_crng.lock, flags); + return ret; +} + +static inline void crng_wait_ready(void) +{ + wait_event_interruptible(crng_init_wait, crng_ready()); +} + +static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE]) +{ + unsigned long v, flags; + struct crng_state *crng = &primary_crng; + + if (crng_init > 2 && + time_after(jiffies, crng->init_time + CRNG_RESEED_INTERVAL)) + crng_reseed(&input_pool); + spin_lock_irqsave(&crng->lock, flags); + if (arch_get_random_long(&v)) + crng->state[14] ^= v; + chacha20_block(&crng->state[0], out); + if (crng->state[12] == 0) + crng->state[13]++; + spin_unlock_irqrestore(&crng->lock, flags); +} + +static ssize_t extract_crng_user(void __user *buf, size_t nbytes) +{ + ssize_t ret = 0, i; + __u8 tmp[CHACHA20_BLOCK_SIZE]; + int large_request = 
(nbytes > 256); + + while (nbytes) { + if (large_request && need_resched()) { + if (signal_pending(current)) { + if (ret == 0) + ret = -ERESTARTSYS; + break; + } + schedule(); + } + + extract_crng(tmp); + i = min_t(int, nbytes, CHACHA20_BLOCK_SIZE); + if (copy_to_user(buf, tmp, i)) { + ret = -EFAULT; + break; + } + + nbytes -= i; + buf += i; + ret += i; + } + + /* Wipe data just written to memory */ + memzero_explicit(tmp, sizeof(tmp)); + + return ret; +} + + +/********************************************************************* + * * Entropy input management * *********************************************************************/ @@ -749,12 +895,12 @@ struct timer_rand_state { #define INIT_TIMER_RAND_STATE { INITIAL_JIFFIES, }; /* - * Add device- or boot-specific data to the input and nonblocking - * pools to help initialize them to unique values. + * Add device- or boot-specific data to the input pool to help + * initialize it. * - * None of this adds any entropy, it is meant to avoid the - * problem of the nonblocking pool having similar initial state - * across largely identical devices. + * None of this adds any entropy; it is meant to avoid the problem of + * the entropy pool having similar initial state across largely + * identical devices. */ void add_device_randomness(const void *buf, unsigned int size) { @@ -766,11 +912,6 @@ void add_device_randomness(const void *buf, unsigned int size) _mix_pool_bytes(&input_pool, buf, size); _mix_pool_bytes(&input_pool, &time, sizeof(time)); spin_unlock_irqrestore(&input_pool.lock, flags); - - spin_lock_irqsave(&nonblocking_pool.lock, flags); - _mix_pool_bytes(&nonblocking_pool, buf, size); - _mix_pool_bytes(&nonblocking_pool, &time, sizeof(time)); - spin_unlock_irqrestore(&nonblocking_pool.lock, flags); } EXPORT_SYMBOL(add_device_randomness); @@ -801,7 +942,7 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num) sample.jiffies = jiffies; sample.cycles = random_get_entropy(); sample.num = num; - r = nonblocking_pool.initialized ? &input_pool : &nonblocking_pool; + r = &input_pool; mix_pool_bytes(r, &sample, sizeof(sample)); /* @@ -921,7 +1062,13 @@ void add_interrupt_randomness(int irq, int irq_flags) !time_after(now, fast_pool->last + HZ)) return; - r = nonblocking_pool.initialized ? &input_pool : &nonblocking_pool; + if (!crng_ready() && crng_fast_load(fast_pool->pool)) { + fast_pool->count = 0; + fast_pool->last = now; + return; + } + + r = &input_pool; if (!spin_trylock(&r->lock)) return; @@ -964,9 +1111,6 @@ EXPORT_SYMBOL_GPL(add_disk_randomness); * *********************************************************************/ -static ssize_t extract_entropy(struct entropy_store *r, void *buf, - size_t nbytes, int min, int rsvd); - /* * This utility inline function is responsible for transferring entropy * from the primary pool to the secondary extraction pool. 
We make @@ -1252,15 +1396,26 @@ static ssize_t extract_entropy_user(struct entropy_store *r, void __user *buf, */ void get_random_bytes(void *buf, int nbytes) { + __u8 tmp[CHACHA20_BLOCK_SIZE]; + #if DEBUG_RANDOM_BOOT > 0 - if (unlikely(nonblocking_pool.initialized == 0)) + if (!crng_ready()) printk(KERN_NOTICE "random: %pF get_random_bytes called " - "with %d bits of entropy available\n", - (void *) _RET_IP_, - nonblocking_pool.entropy_total); + "with crng_init = %d\n", (void *) _RET_IP_, crng_init); #endif trace_get_random_bytes(nbytes, _RET_IP_); - extract_entropy(&nonblocking_pool, buf, nbytes, 0, 0); + + while (nbytes >= CHACHA20_BLOCK_SIZE) { + extract_crng(buf); + buf += CHACHA20_BLOCK_SIZE; + nbytes -= CHACHA20_BLOCK_SIZE; + } + + if (nbytes > 0) { + extract_crng(tmp); + memcpy(buf, tmp, nbytes); + memzero_explicit(tmp, nbytes); + } } EXPORT_SYMBOL(get_random_bytes); @@ -1278,7 +1433,7 @@ int add_random_ready_callback(struct random_ready_callback *rdy) unsigned long flags; int err = -EALREADY; - if (likely(nonblocking_pool.initialized)) + if (crng_ready()) return err; owner = rdy->owner; @@ -1286,7 +1441,7 @@ int add_random_ready_callback(struct random_ready_callback *rdy) return -ENOENT; spin_lock_irqsave(&random_ready_list_lock, flags); - if (nonblocking_pool.initialized) + if (crng_ready()) goto out; owner = NULL; @@ -1350,7 +1505,7 @@ void get_random_bytes_arch(void *buf, int nbytes) } if (nbytes) - extract_entropy(&nonblocking_pool, p, nbytes, 0, 0); + get_random_bytes(p, nbytes); } EXPORT_SYMBOL(get_random_bytes_arch); @@ -1395,7 +1550,7 @@ static int rand_initialize(void) { init_std_data(&input_pool); init_std_data(&blocking_pool); - init_std_data(&nonblocking_pool); + _initialize_crng(&primary_crng); return 0; } early_initcall(rand_initialize); @@ -1459,16 +1614,10 @@ urandom_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { int ret; - if (unlikely(nonblocking_pool.initialized == 0)) - printk_once(KERN_NOTICE "random: %s urandom read " - "with %d bits of entropy available\n", - current->comm, nonblocking_pool.entropy_total); - + crng_wait_ready(); nbytes = min_t(size_t, nbytes, INT_MAX >> (ENTROPY_SHIFT + 3)); - ret = extract_entropy_user(&nonblocking_pool, buf, nbytes); - - trace_urandom_read(8 * nbytes, ENTROPY_BITS(&nonblocking_pool), - ENTROPY_BITS(&input_pool)); + ret = extract_crng_user(buf, nbytes); + trace_urandom_read(8 * nbytes, 0, ENTROPY_BITS(&input_pool)); return ret; } @@ -1514,10 +1663,7 @@ static ssize_t random_write(struct file *file, const char __user *buffer, { size_t ret; - ret = write_pool(&blocking_pool, buffer, count); - if (ret) - return ret; - ret = write_pool(&nonblocking_pool, buffer, count); + ret = write_pool(&input_pool, buffer, count); if (ret) return ret; @@ -1568,7 +1714,6 @@ static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; input_pool.entropy_count = 0; - nonblocking_pool.entropy_count = 0; blocking_pool.entropy_count = 0; return 0; default: @@ -1610,11 +1755,10 @@ SYSCALL_DEFINE3(getrandom, char __user *, buf, size_t, count, if (flags & GRND_RANDOM) return _random_read(flags & GRND_NONBLOCK, buf, count); - if (unlikely(nonblocking_pool.initialized == 0)) { + if (!crng_ready()) { if (flags & GRND_NONBLOCK) return -EAGAIN; - wait_event_interruptible(urandom_init_wait, - nonblocking_pool.initialized); + crng_wait_ready(); if (signal_pending(current)) return -ERESTARTSYS; } diff --git a/include/crypto/chacha20.h b/include/crypto/chacha20.h index 
274bbae..20d20f68 100644 --- a/include/crypto/chacha20.h +++ b/include/crypto/chacha20.h @@ -16,6 +16,7 @@ struct chacha20_ctx { u32 key[8]; }; +void chacha20_block(u32 *state, void *stream); void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv); int crypto_chacha20_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keysize); diff --git a/lib/Makefile b/lib/Makefile index 7bd6fd4..9ba27cd 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -22,7 +22,7 @@ KCOV_INSTRUMENT_hweight.o := n lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o dump_stack.o timerqueue.o\ idr.o int_sqrt.o extable.o \ - sha1.o md5.o irq_regs.o argv_split.o \ + sha1.o chacha20.o md5.o irq_regs.o argv_split.o \ proportions.o flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o seq_buf.o nmi_backtrace.o diff --git a/lib/chacha20.c b/lib/chacha20.c new file mode 100644 index 0000000..250ceed --- /dev/null +++ b/lib/chacha20.c @@ -0,0 +1,79 @@ +/* + * ChaCha20 256-bit cipher algorithm, RFC7539 + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/bitops.h> +#include <linux/cryptohash.h> +#include <asm/unaligned.h> +#include <crypto/chacha20.h> + +static inline u32 rotl32(u32 v, u8 n) +{ + return (v << n) | (v >> (sizeof(v) * 8 - n)); +} + +extern void chacha20_block(u32 *state, void *stream) +{ + u32 x[16], *out = stream; + int i; + + for (i = 0; i < ARRAY_SIZE(x); i++) + x[i] = state[i]; + + for (i = 0; i < 20; i += 2) { + x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 16); + x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 16); + x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 16); + x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 16); + + x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 12); + x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 12); + x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 12); + x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 12); + + x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 8); + x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 8); + x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 8); + x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 8); + + x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 7); + x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 7); + x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 7); + x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 7); + + x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 16); + x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 16); + x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 16); + x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 16); + + x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 12); + x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 12); + x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 12); + x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 12); + + x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 8); + x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 8); + x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 8); + x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 8); + + x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 7); + x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 7); + x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 7); + x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 7); + } + + for (i = 0; i < ARRAY_SIZE(x); i++) + out[i] = cpu_to_le32(x[i] + state[i]); + + state[12]++; +} 
+EXPORT_SYMBOL(chacha20_block);
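To make the new library interface concrete, here is a hedged usage sketch of the exported helper: set up the sixteen state words as described earlier in the thread (constants, key, 64-bit block counter, 64-bit nonce), then pull one 64-byte keystream block.  The demo function and its key/nonce handling are illustrative assumptions, not part of the patch.

#include <linux/string.h>
#include <linux/types.h>
#include <asm/unaligned.h>
#include <crypto/chacha20.h>

/* Hypothetical caller of the exported chacha20_block() helper. */
static void chacha20_block_demo(const u8 key[32], const u8 nonce[8])
{
	u32 state[16];
	u8 stream[CHACHA20_BLOCK_SIZE];
	int i;

	memcpy(&state[0], "expand 32-byte k", 16);	/* state[0..3]  */
	for (i = 0; i < 8; i++)				/* state[4..11] */
		state[4 + i] = get_unaligned_le32(key + 4 * i);
	state[12] = 0;					/* block counter */
	state[13] = 0;
	state[14] = get_unaligned_le32(nonce);		/* 64-bit nonce  */
	state[15] = get_unaligned_le32(nonce + 4);

	chacha20_block(state, stream);	/* also bumps state[12] */
	/* 'stream' now holds 64 bytes of keystream for block 0. */
}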
The CRNG is faster, and we don't pretend to track entropy usage in the CRNG any more. Signed-off-by: Theodore Ts'o <tytso@mit.edu> --- crypto/chacha20_generic.c | 61 ---------- drivers/char/random.c | 282 ++++++++++++++++++++++++++++++++++------------ include/crypto/chacha20.h | 1 + lib/Makefile | 2 +- lib/chacha20.c | 79 +++++++++++++ 5 files changed, 294 insertions(+), 131 deletions(-) create mode 100644 lib/chacha20.c