Message ID: 202003281643.02SGhOi3016886@sdf.org (mailing list archive)
State: New, archived
Hi George,

On Tue, Dec 10, 2019 at 07:15:55AM -0500, George Spelvin wrote:
> Since these are authentication keys, stored in the kernel as long
> as they're important, get_random_u64 is fine. In particular,
> get_random_bytes has significant per-call overhead, so five
> separate calls is painful.

As I am unaware, how does the cost of get_random_bytes() compare to the
cost of get_random_u64()?

> This ended up being a more extensive change, since the previous
> code was unrolled and 10 calls to get_random_u64() seems excessive.
> So the code was rearranged to have smaller object size.

It's not really "unrolled", but rather "not a loop", so I'd prefer not
to artificially make it look like one.

Could you please quantify the size difference when going from
get_random_bytes() to get_random_u64(), and whether that is excessive
enough to warrant changing the structure of the code? Otherwise please
leave the structure as-is, given it is much easier to reason about --
suggestion below on how to do that neatly.

> Currently fields[i] = { 1 << i, 16 * i } for all i could be computed
> rather than looked up, but the table seemed more future-proof.
>
> For ptrauth_keys_switch(), the MSR instructions must be unrolled and
> are much faster than get_random, so although a similar flags-based
> interface is possible, it's probably not worth it.
>
> Signed-off-by: George Spelvin <lkml@sdf.org>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: linux-arm-kernel@lists.infradead.org
> ---
>  arch/arm64/include/asm/pointer_auth.h | 20 +++++----
>  arch/arm64/kernel/pointer_auth.c      | 62 +++++++++++++++------------
>  2 files changed, 46 insertions(+), 36 deletions(-)
>
> diff --git a/arch/arm64/include/asm/pointer_auth.h b/arch/arm64/include/asm/pointer_auth.h
> index 7a24bad1a58b8..b7ef71362a3ae 100644
> --- a/arch/arm64/include/asm/pointer_auth.h
> +++ b/arch/arm64/include/asm/pointer_auth.h
> @@ -30,17 +30,19 @@ struct ptrauth_keys {
>  	struct ptrauth_key apga;
>  };
>
> +static inline unsigned long ptrauth_keys_supported(void)
> +{
> +	return (system_supports_address_auth() ?
> +		PR_PAC_APIAKEY | PR_PAC_APIBKEY |
> +		PR_PAC_APDAKEY | PR_PAC_APDBKEY : 0) |
> +	       (system_supports_generic_auth() ? PR_PAC_APGAKEY : 0);
> +}
> +
> +void ptrauth_keys_generate(struct ptrauth_keys *keys, unsigned long flags);
> +
>  static inline void ptrauth_keys_init(struct ptrauth_keys *keys)
>  {
> -	if (system_supports_address_auth()) {
> -		get_random_bytes(&keys->apia, sizeof(keys->apia));
> -		get_random_bytes(&keys->apib, sizeof(keys->apib));
> -		get_random_bytes(&keys->apda, sizeof(keys->apda));
> -		get_random_bytes(&keys->apdb, sizeof(keys->apdb));
> -	}
> -
> -	if (system_supports_generic_auth())
> -		get_random_bytes(&keys->apga, sizeof(keys->apga));
> +	ptrauth_keys_generate(keys, ptrauth_keys_supported());
>  }
>
>  #define __ptrauth_key_install(k, v)				\
> diff --git a/arch/arm64/kernel/pointer_auth.c b/arch/arm64/kernel/pointer_auth.c
> index c507b584259d0..1604ed246128c 100644
> --- a/arch/arm64/kernel/pointer_auth.c
> +++ b/arch/arm64/kernel/pointer_auth.c
> @@ -7,40 +7,48 @@
>  #include <asm/cpufeature.h>
>  #include <asm/pointer_auth.h>
>
> +/*
> + * Generating crypto-quality random numbers is expensive enough that
> + * there's no point unrolling this.
> + */
> +void ptrauth_keys_generate(struct ptrauth_keys *keys, unsigned long flags)
> +{
> +	size_t i;
> +	static const struct {
> +		/*
> +		 * 8 bits is enough for now. Compiler will complain
> +		 * if/when we need more.
> +		 */
> +		unsigned char flag, offset;
> +	} fields[] = {
> +		{ PR_PAC_APIAKEY, offsetof(struct ptrauth_keys, apia) },
> +		{ PR_PAC_APIBKEY, offsetof(struct ptrauth_keys, apib) },
> +		{ PR_PAC_APDAKEY, offsetof(struct ptrauth_keys, apda) },
> +		{ PR_PAC_APDBKEY, offsetof(struct ptrauth_keys, apdb) },
> +		{ PR_PAC_APGAKEY, offsetof(struct ptrauth_keys, apga) }
> +	};
> +
> +	for (i = 0; i < ARRAY_SIZE(fields); i++) {
> +		if (flags & fields[i].flag) {
> +			struct ptrauth_key *k = (void *)keys + fields[i].offset;
> +			k->lo = get_random_u64();
> +			k->hi = get_random_u64();
> +		}
> +	}
> +}

If we must use get_random_u64() in place of get_random_bytes(), please
add a per-key helper and keep the structure largely as-is:

	// Note: we can drop inline if there is a real size issue
	static inline void __ptrauth_key_init(struct ptrauth_key *key)
	{
		key->lo = get_random_u64();
		key->hi = get_random_u64();
	}

	static inline void ptrauth_keys_init(struct ptrauth_keys *keys)
	{
		if (system_supports_address_auth()) {
			__ptrauth_key_init(&keys->apia);
			__ptrauth_key_init(&keys->apib);
			__ptrauth_key_init(&keys->apda);
			__ptrauth_key_init(&keys->apdb);
		}

		if (system_supports_generic_auth())
			__ptrauth_key_init(&keys->apga);
	}

> +
>  int ptrauth_prctl_reset_keys(struct task_struct *tsk, unsigned long arg)
>  {
> +	unsigned long supported = ptrauth_keys_supported();
>  	struct ptrauth_keys *keys = &tsk->thread.keys_user;
> -	unsigned long addr_key_mask = PR_PAC_APIAKEY | PR_PAC_APIBKEY |
> -				      PR_PAC_APDAKEY | PR_PAC_APDBKEY;
> -	unsigned long key_mask = addr_key_mask | PR_PAC_APGAKEY;
>
> -	if (!system_supports_address_auth() && !system_supports_generic_auth())
> +	if (!supported || arg & ~supported)
>  		return -EINVAL;
>
> -	if (!arg) {
> -		ptrauth_keys_init(keys);
> -		ptrauth_keys_switch(keys);
> -		return 0;
> -	}
> -
> -	if (arg & ~key_mask)
> -		return -EINVAL;
> -
> -	if (((arg & addr_key_mask) && !system_supports_address_auth()) ||
> -	    ((arg & PR_PAC_APGAKEY) && !system_supports_generic_auth()))
> -		return -EINVAL;
> -
> -	if (arg & PR_PAC_APIAKEY)
> -		get_random_bytes(&keys->apia, sizeof(keys->apia));
> -	if (arg & PR_PAC_APIBKEY)
> -		get_random_bytes(&keys->apib, sizeof(keys->apib));
> -	if (arg & PR_PAC_APDAKEY)
> -		get_random_bytes(&keys->apda, sizeof(keys->apda));
> -	if (arg & PR_PAC_APDBKEY)
> -		get_random_bytes(&keys->apdb, sizeof(keys->apdb));
> -	if (arg & PR_PAC_APGAKEY)
> -		get_random_bytes(&keys->apga, sizeof(keys->apga));
> +	if (!arg)
> +		arg = supported;

... and similarly this should only need to change to use
__ptrauth_key_init(), as above.

Thanks,
Mark.

>
> +	ptrauth_keys_generate(keys, arg);
>  	ptrauth_keys_switch(keys);
>
>  	return 0;
> --
> 2.26.0
>
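[For readers coming to the thread cold: the userspace-facing side of
ptrauth_prctl_reset_keys() is the PR_PAC_RESET_KEYS prctl. A minimal
usage sketch -- a hypothetical example program, not code from this
thread, assuming a PAC-capable arm64 kernel and headers:

	#include <stdio.h>
	#include <sys/prctl.h>
	#include <linux/prctl.h>

	int main(void)
	{
		/* arg2 == 0 means "reset all keys this kernel supports". */
		if (prctl(PR_PAC_RESET_KEYS, 0, 0, 0, 0))
			perror("reset all keys");

		/* Or reset a single key, here the instruction A-key. */
		if (prctl(PR_PAC_RESET_KEYS, PR_PAC_APIAKEY, 0, 0, 0))
			perror("reset APIA key");
		return 0;
	}

Either call lands in ptrauth_prctl_reset_keys(), which is why the flag
validation and the !arg special case discussed above matter.]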
Sorry for the delay responding; I had to re-set-up my arm64
cross-compilation environment.

On Mon, Mar 30, 2020 at 11:57:45AM +0100, Mark Rutland wrote:
> On Tue, Dec 10, 2019 at 07:15:55AM -0500, George Spelvin wrote:
>> Since these are authentication keys, stored in the kernel as long
>> as they're important, get_random_u64 is fine. In particular,
>> get_random_bytes has significant per-call overhead, so five
>> separate calls is painful.
>
> As I am unaware, how does the cost of get_random_bytes() compare to the
> cost of get_random_u64()?

It's approximately 8 times the cost.

Because get_random_bytes() implements anti-backtracking, it's a minimum
of one global lock and one ChaCha20 operation per call. Even though
chacha_block_generic() returns 64 bytes, for anti-backtracking we use
32 of them to generate a new key and discard the remainder.

get_random_u64() uses the exact same generator, but amortizes the cost
by storing the output in a per-CPU buffer which it only has to refill
every 64 bytes generated. 7/8 of the time, it's just a fetch from a
per-CPU data structure.

>> This ended up being a more extensive change, since the previous
>> code was unrolled and 10 calls to get_random_u64() seems excessive.
>> So the code was rearranged to have smaller object size.
>
> It's not really "unrolled", but rather "not a loop", so I'd prefer not
> to artificially make it look like one.

I intended that to mean "not in a loop, but could be". I guess this
entire exchange is about the distinction between "could be" and
"should be". ;-)

Yes, I went overboard, and your proposed change below is perfectly
fine with me.

> Could you please quantify the size difference when going from
> get_random_bytes() to get_random_u64(), and whether that is excessive
> enough to warrant changing the structure of the code? Otherwise please
> leave the structure as-is, given it is much easier to reason about --
> suggestion below on how to do that neatly.

Here are the various code sizes:

   text	   data	    bss	    dec	    hex	filename
   1480	      0	      0	   1480	    5c8	arch/arm64/kernel/pointer_auth.o.old
    862	      0	      0	    862	    35e	arch/arm64/kernel/pointer_auth.o.new
   1492	      0	      0	   1492	    5d4	arch/arm64/kernel/pointer_auth.o.new2
   1560	      0	      0	   1560	    618	arch/arm64/kernel/pointer_auth.o.new3

"old" is the existing code. "new" is my restructured code.
"new2" is your simple change with a __ptrauth_key_init() helper.
"new3" is with the helper forced noinline.

I shrank the code significantly, but deciding whether that's a net
improvement is your prerogative.

I should mention that at the end of my patch series, I added a function
(currently called get_random_nonce(), but that's subject to revision)
which uses the get_random_u64 internals with the same interface as
get_random_bytes(). We could postpone this whole thing until that gets
a final name and merged.

(BTW, somehow in my patch a "#include <linux/prctl.h>" needed in the
revised <asm/pointer_auth.h> got omitted. I probably did something
stupid like added it in my cross-compilation tree but didn't push it
back to my main development tree. Sorry.)
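[To make the amortization George describes concrete, here is a heavily
simplified sketch of the shape of get_random_u64()'s batching, based on
the circa-2020 drivers/char/random.c internals; locking details, batch
invalidation on reseed, and the crng itself are omitted, and
extract_crng() stands for the refill primitive that is private to
drivers/char/random.c:

	#include <linux/percpu.h>
	#include <linux/irqflags.h>
	#include <crypto/chacha.h>	/* CHACHA_BLOCK_SIZE == 64 */

	struct batched_u64 {
		u64 entropy[CHACHA_BLOCK_SIZE / sizeof(u64)];	/* 64 bytes */
		unsigned int position;
	};
	static DEFINE_PER_CPU(struct batched_u64, batched_entropy_u64);

	u64 sketch_get_random_u64(void)
	{
		struct batched_u64 *batch;
		unsigned long flags;
		u64 ret;

		local_irq_save(flags);
		batch = raw_cpu_ptr(&batched_entropy_u64);
		if (batch->position % ARRAY_SIZE(batch->entropy) == 0) {
			/* Slow path: one ChaCha20 block per 64 bytes. */
			extract_crng((u8 *)batch->entropy);
			batch->position = 0;
		}
		/* Fast path, taken 7 calls out of 8: a per-CPU fetch. */
		ret = batch->entropy[batch->position++];
		local_irq_restore(flags);
		return ret;
	}

get_random_bytes(), by contrast, pays for a ChaCha20 block on every
call and sacrifices half of its output to the anti-backtracking rekey,
which is where the ~8x figure comes from.]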
On Mon, Mar 30, 2020 at 07:32:37PM +0000, George Spelvin wrote:
> On Mon, Mar 30, 2020 at 11:57:45AM +0100, Mark Rutland wrote:
>> As I am unaware, how does the cost of get_random_bytes() compare to
>> the cost of get_random_u64()?
>
> It's approximately 8 times the cost.

Just to expand on a point I may have left unclear: one
get_random_bytes(), for a length up to 32 bytes, costs approximately 8x
one get_random_u64(). (It then jumps to 16x for up to 96 bytes.)

Since we're using *two* get_random_u64() calls to replace one
get_random_bytes(), it's a 4x cost difference between the two
alternative ways of generating a 128-bit key.
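[Restated as code, the two alternatives whose costs are compared above
are simply the following; struct ptrauth_key mirrors the definition in
arch/arm64/include/asm/pointer_auth.h, and the unit costs in the
comments are the thread's estimates, not measurements from this sketch:

	#include <linux/random.h>

	struct ptrauth_key { unsigned long lo, hi; };	/* 128 bits */

	static void key_via_bytes(struct ptrauth_key *k)
	{
		/* ~8 units: a ChaCha20 block plus anti-backtracking rekey. */
		get_random_bytes(k, sizeof(*k));
	}

	static void key_via_u64(struct ptrauth_key *k)
	{
		/* ~2 units: two batched fetches at ~1 unit each. */
		k->lo = get_random_u64();
		k->hi = get_random_u64();
	}
]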
On Mon, Mar 30, 2020 at 07:32:37PM +0000, George Spelvin wrote:
> Sorry for the delay responding; I had to re-set-up my arm64
> cross-compilation environment.
>
> On Mon, Mar 30, 2020 at 11:57:45AM +0100, Mark Rutland wrote:
> > On Tue, Dec 10, 2019 at 07:15:55AM -0500, George Spelvin wrote:
> >> Since these are authentication keys, stored in the kernel as long
> >> as they're important, get_random_u64 is fine. In particular,
> >> get_random_bytes has significant per-call overhead, so five
> >> separate calls is painful.
> >
> > As I am unaware, how does the cost of get_random_bytes() compare to the
> > cost of get_random_u64()?
>
> It's approximately 8 times the cost.
>
> Because get_random_bytes() implements anti-backtracking, it's a minimum
> of one global lock and one ChaCha20 operation per call. Even though
> chacha_block_generic() returns 64 bytes, for anti-backtracking we use
> 32 of them to generate a new key and discard the remainder.
>
> get_random_u64() uses the exact same generator, but amortizes the cost
> by storing the output in a per-CPU buffer which it only has to refill
> every 64 bytes generated. 7/8 of the time, it's just a fetch from a
> per-CPU data structure.

I see; thanks for this explanation. It would be helpful to mention the
backtracking distinction explicitly in the commit message, since it
currently only alludes to it in the first sentence.

It's worth noting that the key values *can* be exposed to userspace when
CONFIG_CHECKPOINT_RESTORE is selected. On such kernels, a user could
regenerate and read the keys an arbitrary number of times on a CPU of
their choice. From my limited understanding I presume backtracking may
be a concern there?

> >> This ended up being a more extensive change, since the previous
> >> code was unrolled and 10 calls to get_random_u64() seems excessive.
> >> So the code was rearranged to have smaller object size.
> >
> > It's not really "unrolled", but rather "not a loop", so I'd prefer not
> > to artificially make it look like one.
>
> I intended that to mean "not in a loop, but could be". I guess this
> entire exchange is about the distinction between "could be" and
> "should be". ;-)
>
> Yes, I went overboard, and your proposed change below is perfectly
> fine with me.

Great. That's what I'd prefer due to the clarity of the code, and I'm
not too concerned by the figures below, given that the helper only adds
12 bytes to the contemporary text size.

Thanks,
Mark.

> > Could you please quantify the size difference when going from
> > get_random_bytes() to get_random_u64(), and whether that is excessive
> > enough to warrant changing the structure of the code? Otherwise please
> > leave the structure as-is, given it is much easier to reason about --
> > suggestion below on how to do that neatly.
>
> Here are the various code sizes:
>
>    text	   data	    bss	    dec	    hex	filename
>    1480	      0	      0	   1480	    5c8	arch/arm64/kernel/pointer_auth.o.old
>     862	      0	      0	    862	    35e	arch/arm64/kernel/pointer_auth.o.new
>    1492	      0	      0	   1492	    5d4	arch/arm64/kernel/pointer_auth.o.new2
>    1560	      0	      0	   1560	    618	arch/arm64/kernel/pointer_auth.o.new3
>
> "old" is the existing code. "new" is my restructured code.
> "new2" is your simple change with a __ptrauth_key_init() helper.
> "new3" is with the helper forced noinline.
>
> I shrank the code significantly, but deciding whether that's a net
> improvement is your prerogative.
>
> I should mention that at the end of my patch series, I added a function
> (currently called get_random_nonce(), but that's subject to revision)
> which uses the get_random_u64 internals with the same interface as
> get_random_bytes(). We could postpone this whole thing until that gets
> a final name and merged.
>
> (BTW, somehow in my patch a "#include <linux/prctl.h>" needed in the
> revised <asm/pointer_auth.h> got omitted. I probably did something
> stupid like added it in my cross-compilation tree but didn't push it
> back to my main development tree. Sorry.)
On Tue, Mar 31, 2020 at 11:14:12AM +0100, Mark Rutland wrote:
> On Mon, Mar 30, 2020 at 07:32:37PM +0000, George Spelvin wrote:
>> On Mon, Mar 30, 2020 at 11:57:45AM +0100, Mark Rutland wrote:
>>> As I am unaware, how does the cost of get_random_bytes() compare to
>>> the cost of get_random_u64()?
>>
>> It's approximately 8 times the cost. [Of *one* get_random_u64()
>> call; 4x the cost of the two needed to generate a 128-bit key.]
>>
>> Because get_random_bytes() implements anti-backtracking, it's a minimum
>> of one global lock and one ChaCha20 operation per call. Even though
>> chacha_block_generic() returns 64 bytes, for anti-backtracking we use
>> 32 of them to generate a new key and discard the remainder.
>>
>> get_random_u64() uses the exact same generator, but amortizes the cost
>> by storing the output in a per-CPU buffer which it only has to refill
>> every 64 bytes generated. 7/8 of the time, it's just a fetch from a
>> per-CPU data structure.
>
> I see; thanks for this explanation. It would be helpful to mention the
> backtracking distinction explicitly in the commit message, since it
> currently only alludes to it in the first sentence.

Easily done, but I need to find a centralized place to say it, or I'd
be repeating myself a *lot* in the series.

That said, thanks for prompting me to quantify the cost ratio. I knew
it, but never actually wrote it down.

> It's worth noting that the key values *can* be exposed to userspace when
> CONFIG_CHECKPOINT_RESTORE is selected. On such kernels, a user could
> regenerate and read the keys an arbitrary number of times on a CPU of
> their choice. From my limited understanding I presume backtracking may
> be a concern there?

No. Backtracking is an issue if the keys must remain secret *after*
they are wiped from kernel memory. This applies to session *encryption*
keys (assuming the plaintext should remain secret after the session is
over), and to any long-lived keys which are stored encrypted or
otherwise inaccessible (e.g. in dedicated hardware). The latter
includes most asymmetric private keys.

Since the pointer authentication keys are authentication keys, and
valueless to an attacker once the kernel is done using them, there is
no need for backtracking protection.

Basically, do you need to wipe the key (with memzero_explicit) when you
are done with it? If that is important, you also want to know that the
key cannot be reconstructed from the CRNG state.

>> Yes, I went overboard, and your proposed change below is perfectly
>> fine with me.
>
> Great. That's what I'd prefer due to the clarity of the code, and I'm
> not too concerned by the figures below, given that the helper only adds
> 12 bytes to the contemporary text size.

A modified patch will follow. Thanks for your patience.
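[George's rule of thumb translates into code like the following sketch;
encrypt_session() is a made-up placeholder for whatever consumes the
key, and the point is only the choice of generator and the wipe:

	#include <linux/random.h>
	#include <linux/string.h>	/* memzero_explicit() */

	static int run_session(const u8 *msg, size_t len)
	{
		u8 session_key[32];
		int ret;

		/*
		 * The plaintext must stay secret after the session ends,
		 * so the generator must not be rewindable to recover this
		 * key: use get_random_bytes(), not get_random_u64().
		 */
		get_random_bytes(session_key, sizeof(session_key));
		ret = encrypt_session(session_key, msg, len);	/* hypothetical */

		/* If wiping the key matters, anti-backtracking matters too. */
		memzero_explicit(session_key, sizeof(session_key));
		return ret;
	}

The ptrauth keys fail this test: once the kernel stops using them,
learning them gains an attacker nothing, so the cheaper generator is
fine.]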
On Tue, Mar 31, 2020 at 02:49:15PM +0000, George Spelvin wrote:
> On Tue, Mar 31, 2020 at 11:14:12AM +0100, Mark Rutland wrote:
> > On Mon, Mar 30, 2020 at 07:32:37PM +0000, George Spelvin wrote:
> >> On Mon, Mar 30, 2020 at 11:57:45AM +0100, Mark Rutland wrote:
> >> Because get_random_bytes() implements anti-backtracking, it's a minimum
> >> of one global lock and one ChaCha20 operation per call. Even though
> >> chacha_block_generic() returns 64 bytes, for anti-backtracking we use
> >> 32 of them to generate a new key and discard the remainder.
> >>
> >> get_random_u64() uses the exact same generator, but amortizes the cost
> >> by storing the output in a per-CPU buffer which it only has to refill
> >> every 64 bytes generated. 7/8 of the time, it's just a fetch from a
> >> per-CPU data structure.
> >
> > I see; thanks for this explanation. It would be helpful to mention the
> > backtracking distinction explicitly in the commit message, since it
> > currently only alludes to it in the first sentence.
>
> Easily done, but I need to find a centralized place to say it, or I'd
> be repeating myself a *lot* in the series.

Sure, but in the interests of optimizing for review, it's worth doing a
copy+paste of the key detail into each commit message. That way, even
if the reviewer only looks at the patch in isolation, they have all the
necessary context, and you don't have to reply to the same question on
each patch.

> > It's worth noting that the key values *can* be exposed to userspace when
> > CONFIG_CHECKPOINT_RESTORE is selected. On such kernels, a user could
> > regenerate and read the keys an arbitrary number of times on a CPU of
> > their choice. From my limited understanding I presume backtracking may
> > be a concern there?
>
> No. Backtracking is an issue if the keys must remain secret *after*
> they are wiped from kernel memory. This applies to session
> *encryption* keys (assuming the plaintext should remain secret after
> the session is over), and to any long-lived keys which are stored
> encrypted or otherwise inaccessible (e.g. in dedicated hardware).
> The latter includes most asymmetric private keys.
>
> Basically, do you need to wipe the key (with memzero_explicit) when
> you are done with it? If that is important, you also want to know
> that the key cannot be reconstructed from the CRNG state.

I see, thanks for the explanation. I had misunderstood what
backtracking was in this context.

> A modified patch will follow. Thanks for your patience.

I've given that an Ack as it looked sound to me.

Thanks,
Mark.
diff --git a/arch/arm64/include/asm/pointer_auth.h b/arch/arm64/include/asm/pointer_auth.h
index 7a24bad1a58b8..b7ef71362a3ae 100644
--- a/arch/arm64/include/asm/pointer_auth.h
+++ b/arch/arm64/include/asm/pointer_auth.h
@@ -30,17 +30,19 @@ struct ptrauth_keys {
 	struct ptrauth_key apga;
 };
 
+static inline unsigned long ptrauth_keys_supported(void)
+{
+	return (system_supports_address_auth() ?
+		PR_PAC_APIAKEY | PR_PAC_APIBKEY |
+		PR_PAC_APDAKEY | PR_PAC_APDBKEY : 0) |
+	       (system_supports_generic_auth() ? PR_PAC_APGAKEY : 0);
+}
+
+void ptrauth_keys_generate(struct ptrauth_keys *keys, unsigned long flags);
+
 static inline void ptrauth_keys_init(struct ptrauth_keys *keys)
 {
-	if (system_supports_address_auth()) {
-		get_random_bytes(&keys->apia, sizeof(keys->apia));
-		get_random_bytes(&keys->apib, sizeof(keys->apib));
-		get_random_bytes(&keys->apda, sizeof(keys->apda));
-		get_random_bytes(&keys->apdb, sizeof(keys->apdb));
-	}
-
-	if (system_supports_generic_auth())
-		get_random_bytes(&keys->apga, sizeof(keys->apga));
+	ptrauth_keys_generate(keys, ptrauth_keys_supported());
 }
 
 #define __ptrauth_key_install(k, v)				\
diff --git a/arch/arm64/kernel/pointer_auth.c b/arch/arm64/kernel/pointer_auth.c
index c507b584259d0..1604ed246128c 100644
--- a/arch/arm64/kernel/pointer_auth.c
+++ b/arch/arm64/kernel/pointer_auth.c
@@ -7,40 +7,48 @@
 #include <asm/cpufeature.h>
 #include <asm/pointer_auth.h>
 
+/*
+ * Generating crypto-quality random numbers is expensive enough that
+ * there's no point unrolling this.
+ */
+void ptrauth_keys_generate(struct ptrauth_keys *keys, unsigned long flags)
+{
+	size_t i;
+	static const struct {
+		/*
+		 * 8 bits is enough for now. Compiler will complain
+		 * if/when we need more.
+		 */
+		unsigned char flag, offset;
+	} fields[] = {
+		{ PR_PAC_APIAKEY, offsetof(struct ptrauth_keys, apia) },
+		{ PR_PAC_APIBKEY, offsetof(struct ptrauth_keys, apib) },
+		{ PR_PAC_APDAKEY, offsetof(struct ptrauth_keys, apda) },
+		{ PR_PAC_APDBKEY, offsetof(struct ptrauth_keys, apdb) },
+		{ PR_PAC_APGAKEY, offsetof(struct ptrauth_keys, apga) }
+	};
+
+	for (i = 0; i < ARRAY_SIZE(fields); i++) {
+		if (flags & fields[i].flag) {
+			struct ptrauth_key *k = (void *)keys + fields[i].offset;
+			k->lo = get_random_u64();
+			k->hi = get_random_u64();
+		}
+	}
+}
+
 int ptrauth_prctl_reset_keys(struct task_struct *tsk, unsigned long arg)
 {
+	unsigned long supported = ptrauth_keys_supported();
 	struct ptrauth_keys *keys = &tsk->thread.keys_user;
-	unsigned long addr_key_mask = PR_PAC_APIAKEY | PR_PAC_APIBKEY |
-				      PR_PAC_APDAKEY | PR_PAC_APDBKEY;
-	unsigned long key_mask = addr_key_mask | PR_PAC_APGAKEY;
 
-	if (!system_supports_address_auth() && !system_supports_generic_auth())
+	if (!supported || arg & ~supported)
 		return -EINVAL;
 
-	if (!arg) {
-		ptrauth_keys_init(keys);
-		ptrauth_keys_switch(keys);
-		return 0;
-	}
-
-	if (arg & ~key_mask)
-		return -EINVAL;
-
-	if (((arg & addr_key_mask) && !system_supports_address_auth()) ||
-	    ((arg & PR_PAC_APGAKEY) && !system_supports_generic_auth()))
-		return -EINVAL;
-
-	if (arg & PR_PAC_APIAKEY)
-		get_random_bytes(&keys->apia, sizeof(keys->apia));
-	if (arg & PR_PAC_APIBKEY)
-		get_random_bytes(&keys->apib, sizeof(keys->apib));
-	if (arg & PR_PAC_APDAKEY)
-		get_random_bytes(&keys->apda, sizeof(keys->apda));
-	if (arg & PR_PAC_APDBKEY)
-		get_random_bytes(&keys->apdb, sizeof(keys->apdb));
-	if (arg & PR_PAC_APGAKEY)
-		get_random_bytes(&keys->apga, sizeof(keys->apga));
+	if (!arg)
+		arg = supported;
 
+	ptrauth_keys_generate(keys, arg);
 	ptrauth_keys_switch(keys);
 
 	return 0;
Since these are authentication keys, stored in the kernel as long as
they're important, get_random_u64 is fine. In particular,
get_random_bytes has significant per-call overhead, so five separate
calls is painful.

This ended up being a more extensive change, since the previous code
was unrolled and 10 calls to get_random_u64() seems excessive. So the
code was rearranged to have smaller object size.

Currently fields[i] = { 1 << i, 16 * i } for all i could be computed
rather than looked up, but the table seemed more future-proof.

For ptrauth_keys_switch(), the MSR instructions must be unrolled and
are much faster than get_random, so although a similar flags-based
interface is possible, it's probably not worth it.

Signed-off-by: George Spelvin <lkml@sdf.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
---
 arch/arm64/include/asm/pointer_auth.h | 20 +++++----
 arch/arm64/kernel/pointer_auth.c      | 62 +++++++++++++++------------
 2 files changed, 46 insertions(+), 36 deletions(-)
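[The computed alternative mentioned in the commit message would look
something like the following loop body -- an untested illustration of
that remark, not code from the series. It relies on PR_PAC_APIAKEY
through PR_PAC_APGAKEY being (1UL << 0) through (1UL << 4) and on the
five 16-byte keys being laid out consecutively in struct ptrauth_keys:

	for (i = 0; i < 5; i++) {
		if (flags & (1UL << i)) {
			struct ptrauth_key *k = (void *)keys + 16 * i;

			k->lo = get_random_u64();
			k->hi = get_random_u64();
		}
	}

The posted table version encodes the same mapping explicitly, which is
why the commit message calls it more future-proof: it keeps working if
the flag values or the struct layout ever diverge from this pattern.]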