Message ID | 1464586765-14436-3-git-send-email-tytso@mit.edu (mailing list archive) |
---|---|
State | Not Applicable |
Delegated to: | Herbert Xu |
Headers | show |
Am Montag, 30. Mai 2016, 01:39:22 schrieb Theodore Ts'o: Hi Theodore, > On a system with a 4 socket (NUMA) system where a large number of > application threads were all trying to read from /dev/urandom, this > can result in the system spending 80% of its time contending on the > global urandom spinlock. The application should have used its own > PRNG, but let's try to help it from running, lemming-like, straight > over the locking cliff. > > Reported-by: Andi Kleen <ak@linux.intel.com> > Signed-off-by: Theodore Ts'o <tytso@mit.edu> > --- > drivers/char/random.c | 57 > +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 53 > insertions(+), 4 deletions(-) > > diff --git a/drivers/char/random.c b/drivers/char/random.c > index 3a4d865..1778c84 100644 > --- a/drivers/char/random.c > +++ b/drivers/char/random.c > @@ -434,6 +434,8 @@ struct crng_state primary_crng = { > */ > static int crng_init = 0; > #define crng_ready() (likely(crng_init >= 2)) > +static void _extract_crng(struct crng_state *crng, > + __u8 out[CHACHA20_BLOCK_SIZE]); > static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE]); > static void process_random_ready_list(void); > > @@ -754,6 +756,17 @@ static void credit_entropy_bits_safe(struct > entropy_store *r, int nbits) > > static DECLARE_WAIT_QUEUE_HEAD(crng_init_wait); > > +#ifdef CONFIG_NUMA > +/* > + * Hack to deal with crazy userspace progams when they are all trying > + * to access /dev/urandom in parallel. The programs are almost > + * certainly doing something terribly wrong, but we'll work around > + * their brain damage. > + */ > +static struct crng_state **crng_node_pool __read_mostly; > +#endif > + > + > static void _initialize_crng(struct crng_state *crng) > { > int i; > @@ -774,11 +787,13 @@ static void _initialize_crng(struct crng_state *crng) > crng->init_time = jiffies - CRNG_RESEED_INTERVAL - 1; > } > > +#ifdef CONFIG_NUMA > static void initialize_crng(struct crng_state *crng) > { > _initialize_crng(crng); > spin_lock_init(&crng->lock); > } > +#endif > > static int crng_fast_load(__u32 pool[4]) > { > @@ -818,7 +833,7 @@ static void crng_reseed(struct crng_state *crng, struct > entropy_store *r) if (num == 0) > return; > } else > - extract_crng(buf.block); > + _extract_crng(&primary_crng, buf.block); > spin_lock_irqsave(&primary_crng.lock, flags); > for (i = 0; i < 8; i++) { > unsigned long rv; > @@ -838,19 +853,26 @@ static void crng_reseed(struct crng_state *crng, > struct entropy_store *r) spin_unlock_irqrestore(&primary_crng.lock, flags); > } > > +static inline void maybe_reseed_primary_crng(void) > +{ > + if (crng_init > 2 && > + time_after(jiffies, primary_crng.init_time + CRNG_RESEED_INTERVAL)) > + crng_reseed(&primary_crng, &input_pool); > +} > + > static inline void crng_wait_ready(void) > { > wait_event_interruptible(crng_init_wait, crng_ready()); > } > > -static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE]) > +static void _extract_crng(struct crng_state *crng, > + __u8 out[CHACHA20_BLOCK_SIZE]) > { > unsigned long v, flags; > - struct crng_state *crng = &primary_crng; > > if (crng_init > 2 && > time_after(jiffies, crng->init_time + CRNG_RESEED_INTERVAL)) > - crng_reseed(crng, &input_pool); > + crng_reseed(crng, crng == &primary_crng ? &input_pool : NULL); > spin_lock_irqsave(&crng->lock, flags); > if (arch_get_random_long(&v)) > crng->state[14] ^= v; > @@ -860,6 +882,17 @@ static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE]) > spin_unlock_irqrestore(&crng->lock, flags); > } > > +static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE]) > +{ > +#ifndef CONFIG_NUMA > + struct crng_state *crng = &primary_crng; > +#else > + struct crng_state *crng = crng_node_pool[numa_node_id()]; > +#endif > + > + _extract_crng(crng, out); > +} > + > static ssize_t extract_crng_user(void __user *buf, size_t nbytes) > { > ssize_t ret = 0, i; > @@ -1573,6 +1606,22 @@ static void init_std_data(struct entropy_store *r) > */ > static int rand_initialize(void) > { > +#ifdef CONFIG_NUMA > + int i; > + int num_nodes = num_possible_nodes(); > + struct crng_state *crng; > + > + crng_node_pool = kmalloc(num_nodes * sizeof(void *), > + GFP_KERNEL|__GFP_NOFAIL); > + > + for (i=0; i < num_nodes; i++) { > + crng = kmalloc(sizeof(struct crng_state), > + GFP_KERNEL | __GFP_NOFAIL); > + initialize_crng(crng); Could you please help me understand the logic flow here: The NUMA secondary DRNGs are initialized before the input/blocking pools and the primary DRNG. The initialization call uses get_random_bytes() for the secondary DRNGs. But since the primary DRNG is not yet initialized, where does the get_random_bytes gets its randomness from? > + crng_node_pool[i] = crng; > + > + } > +#endif > init_std_data(&input_pool); > init_std_data(&blocking_pool); > _initialize_crng(&primary_crng); Ciao Stephan -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, May 30, 2016 at 08:03:59AM +0200, Stephan Mueller wrote: > > static int rand_initialize(void) > > { > > +#ifdef CONFIG_NUMA > > + int i; > > + int num_nodes = num_possible_nodes(); > > + struct crng_state *crng; > > + > > + crng_node_pool = kmalloc(num_nodes * sizeof(void *), > > + GFP_KERNEL|__GFP_NOFAIL); > > + > > + for (i=0; i < num_nodes; i++) { > > + crng = kmalloc(sizeof(struct crng_state), > > + GFP_KERNEL | __GFP_NOFAIL); > > + initialize_crng(crng); > > Could you please help me understand the logic flow here: The NUMA secondary > DRNGs are initialized before the input/blocking pools and the primary DRNG. > > The initialization call uses get_random_bytes() for the secondary DRNGs. But > since the primary DRNG is not yet initialized, where does the get_random_bytes > gets its randomness from? Yeah, I screwed up. The hunk of code staring with "crng_node_pool = kmalloc(..." and the for loop afterwards should be moved to after _initialize_crng(). Serves me right for not testing CONFIG_NUMA before sending out the patches. This is *not* about adding entropy; as you've noted, this is done very early in boot up, before there has been any chance for any kind of entropy to be collected in any of the input pools. It's more of a insurance policy just in case on some platform, if it turns out that assuming a bit's worth of entropy per interrupt was hopelessly over-optimistic, at least the starting point will be different across different kernels (and maybe different boot times, but on the sorts of platforms where I'm most concerned, there may not be a real-time clock and almost certainly not a architectural hwrng in the CPU). - Ted -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/drivers/char/random.c b/drivers/char/random.c index 3a4d865..1778c84 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -434,6 +434,8 @@ struct crng_state primary_crng = { */ static int crng_init = 0; #define crng_ready() (likely(crng_init >= 2)) +static void _extract_crng(struct crng_state *crng, + __u8 out[CHACHA20_BLOCK_SIZE]); static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE]); static void process_random_ready_list(void); @@ -754,6 +756,17 @@ static void credit_entropy_bits_safe(struct entropy_store *r, int nbits) static DECLARE_WAIT_QUEUE_HEAD(crng_init_wait); +#ifdef CONFIG_NUMA +/* + * Hack to deal with crazy userspace progams when they are all trying + * to access /dev/urandom in parallel. The programs are almost + * certainly doing something terribly wrong, but we'll work around + * their brain damage. + */ +static struct crng_state **crng_node_pool __read_mostly; +#endif + + static void _initialize_crng(struct crng_state *crng) { int i; @@ -774,11 +787,13 @@ static void _initialize_crng(struct crng_state *crng) crng->init_time = jiffies - CRNG_RESEED_INTERVAL - 1; } +#ifdef CONFIG_NUMA static void initialize_crng(struct crng_state *crng) { _initialize_crng(crng); spin_lock_init(&crng->lock); } +#endif static int crng_fast_load(__u32 pool[4]) { @@ -818,7 +833,7 @@ static void crng_reseed(struct crng_state *crng, struct entropy_store *r) if (num == 0) return; } else - extract_crng(buf.block); + _extract_crng(&primary_crng, buf.block); spin_lock_irqsave(&primary_crng.lock, flags); for (i = 0; i < 8; i++) { unsigned long rv; @@ -838,19 +853,26 @@ static void crng_reseed(struct crng_state *crng, struct entropy_store *r) spin_unlock_irqrestore(&primary_crng.lock, flags); } +static inline void maybe_reseed_primary_crng(void) +{ + if (crng_init > 2 && + time_after(jiffies, primary_crng.init_time + CRNG_RESEED_INTERVAL)) + crng_reseed(&primary_crng, &input_pool); +} + static inline void crng_wait_ready(void) { wait_event_interruptible(crng_init_wait, crng_ready()); } -static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE]) +static void _extract_crng(struct crng_state *crng, + __u8 out[CHACHA20_BLOCK_SIZE]) { unsigned long v, flags; - struct crng_state *crng = &primary_crng; if (crng_init > 2 && time_after(jiffies, crng->init_time + CRNG_RESEED_INTERVAL)) - crng_reseed(crng, &input_pool); + crng_reseed(crng, crng == &primary_crng ? &input_pool : NULL); spin_lock_irqsave(&crng->lock, flags); if (arch_get_random_long(&v)) crng->state[14] ^= v; @@ -860,6 +882,17 @@ static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE]) spin_unlock_irqrestore(&crng->lock, flags); } +static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE]) +{ +#ifndef CONFIG_NUMA + struct crng_state *crng = &primary_crng; +#else + struct crng_state *crng = crng_node_pool[numa_node_id()]; +#endif + + _extract_crng(crng, out); +} + static ssize_t extract_crng_user(void __user *buf, size_t nbytes) { ssize_t ret = 0, i; @@ -1573,6 +1606,22 @@ static void init_std_data(struct entropy_store *r) */ static int rand_initialize(void) { +#ifdef CONFIG_NUMA + int i; + int num_nodes = num_possible_nodes(); + struct crng_state *crng; + + crng_node_pool = kmalloc(num_nodes * sizeof(void *), + GFP_KERNEL|__GFP_NOFAIL); + + for (i=0; i < num_nodes; i++) { + crng = kmalloc(sizeof(struct crng_state), + GFP_KERNEL | __GFP_NOFAIL); + initialize_crng(crng); + crng_node_pool[i] = crng; + + } +#endif init_std_data(&input_pool); init_std_data(&blocking_pool); _initialize_crng(&primary_crng);
On a system with a 4 socket (NUMA) system where a large number of application threads were all trying to read from /dev/urandom, this can result in the system spending 80% of its time contending on the global urandom spinlock. The application should have used its own PRNG, but let's try to help it from running, lemming-like, straight over the locking cliff. Reported-by: Andi Kleen <ak@linux.intel.com> Signed-off-by: Theodore Ts'o <tytso@mit.edu> --- drivers/char/random.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 53 insertions(+), 4 deletions(-)