Message ID | 77c0d1ec143bf2a53d66c4ecb190e7e0a576fbfd.1737511963.git.jpoimboe@kernel.org (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | unwind, perf: sframe user space unwinding | expand |
On 22.01.2025 03:31, Josh Poimboeuf wrote: > diff --git a/include/linux/sframe.h b/include/linux/sframe.h > @@ -3,11 +3,14 @@ > #define _LINUX_SFRAME_H > > #include <linux/mm_types.h> > +#include <linux/srcu.h> > #include <linux/unwind_user_types.h> > > #ifdef CONFIG_HAVE_UNWIND_USER_SFRAME > > struct sframe_section { > + struct rcu_head rcu; > + Nit: You are adding a blank line, that you later remove with "[PATCH v4 25/39] unwind_user/sframe: Show file name in debug output". > unsigned long sframe_start; > unsigned long sframe_end; > unsigned long text_start; Regards, Jens
On Fri, Jan 24, 2025 at 05:36:38PM +0100, Jens Remus wrote: > On 22.01.2025 03:31, Josh Poimboeuf wrote: > > > diff --git a/include/linux/sframe.h b/include/linux/sframe.h > > > @@ -3,11 +3,14 @@ > > #define _LINUX_SFRAME_H > > #include <linux/mm_types.h> > > +#include <linux/srcu.h> > > #include <linux/unwind_user_types.h> > > #ifdef CONFIG_HAVE_UNWIND_USER_SFRAME > > struct sframe_section { > > + struct rcu_head rcu; > > + > > Nit: You are adding a blank line, that you later remove with > "[PATCH v4 25/39] unwind_user/sframe: Show file name in debug output". I suppose that was intentional. The original blank line created visual separation between the rcu head and the sframe values. The later patch instead sort of uses the ifdef to keep some separation? But yeah, I'll keep the blank lines for consistency. <shrug> struct sframe_section { struct rcu_head rcu; #ifdef CONFIG_DYNAMIC_DEBUG const char *filename; #endif unsigned long sframe_start; unsigned long sframe_start; unsigned long sframe_end; unsigned long text_start; unsigned long text_end; unsigned long fdes_start; unsigned long fres_start; unsigned long fres_end; unsigned int num_fdes; signed char ra_off; signed char fp_off; };
On Tue, Jan 21, 2025 at 6:32 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote: > > In preparation for using sframe to unwind user space stacks, add an > sframe_find() interface for finding the sframe information associated > with a given text address. > > For performance, use user_read_access_begin() and the corresponding > unsafe_*() accessors. Note that use of pr_debug() in uaccess-enabled > regions would break noinstr validation, so there aren't any debug > messages yet. That will be added in a subsequent commit. > > Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org> > --- > include/linux/sframe.h | 5 + > kernel/unwind/sframe.c | 295 ++++++++++++++++++++++++++++++++++- > kernel/unwind/sframe_debug.h | 35 +++++ > 3 files changed, 331 insertions(+), 4 deletions(-) > create mode 100644 kernel/unwind/sframe_debug.h > [...] > + > +static __always_inline int __read_fde(struct sframe_section *sec, > + unsigned int fde_num, > + struct sframe_fde *fde) > +{ > + unsigned long fde_addr, ip; > + > + fde_addr = sec->fdes_start + (fde_num * sizeof(struct sframe_fde)); > + unsafe_copy_from_user(fde, (void __user *)fde_addr, > + sizeof(struct sframe_fde), Efault); > + > + ip = sec->sframe_start + fde->start_addr; > + if (ip < sec->text_start || ip > sec->text_end) ip >= sec->test_end ? ip == sec->test_end doesn't make sense, no? > + return -EINVAL; > + > + return 0; > + > +Efault: > + return -EFAULT; > +} > + [...] > +static __always_inline int __read_fre(struct sframe_section *sec, > + struct sframe_fde *fde, > + unsigned long fre_addr, > + struct sframe_fre *fre) > +{ > + unsigned char fde_type = SFRAME_FUNC_FDE_TYPE(fde->info); > + unsigned char fre_type = SFRAME_FUNC_FRE_TYPE(fde->info); > + unsigned char offset_count, offset_size; > + s32 ip_off, cfa_off, ra_off, fp_off; > + unsigned long cur = fre_addr; > + unsigned char addr_size; > + u8 info; > + > + addr_size = fre_type_to_size(fre_type); > + if (!addr_size) > + return -EFAULT; > + > + if (fre_addr + addr_size + 1 > sec->fres_end) nit: isn't this the same as `fre_addr + addr_size >= sec->fres_end` ? > + return -EFAULT; > + > + UNSAFE_GET_USER_INC(ip_off, cur, addr_size, Efault); > + if (fde_type == SFRAME_FDE_TYPE_PCINC && ip_off > fde->func_size) is ip_off == fde->func_size allowable? > + return -EFAULT; > + > + UNSAFE_GET_USER_INC(info, cur, 1, Efault); > + offset_count = SFRAME_FRE_OFFSET_COUNT(info); > + offset_size = offset_size_enum_to_size(SFRAME_FRE_OFFSET_SIZE(info)); > + if (!offset_count || !offset_size) > + return -EFAULT; > + > + if (cur + (offset_count * offset_size) > sec->fres_end) offset_count * offset_size done in u8 can overflow, no? maybe upcast to unsigned long or use check_add_overflow? > + return -EFAULT; > + > + fre->size = addr_size + 1 + (offset_count * offset_size); > + > + UNSAFE_GET_USER_INC(cfa_off, cur, offset_size, Efault); > + offset_count--; > + > + ra_off = sec->ra_off; > + if (!ra_off) { > + if (!offset_count--) > + return -EFAULT; > + > + UNSAFE_GET_USER_INC(ra_off, cur, offset_size, Efault); > + } > + > + fp_off = sec->fp_off; > + if (!fp_off && offset_count) { > + offset_count--; > + UNSAFE_GET_USER_INC(fp_off, cur, offset_size, Efault); > + } > + > + if (offset_count) > + return -EFAULT; > + > + fre->ip_off = ip_off; > + fre->cfa_off = cfa_off; > + fre->ra_off = ra_off; > + fre->fp_off = fp_off; > + fre->info = info; > + > + return 0; > + > +Efault: > + return -EFAULT; > +} > + > +static __always_inline int __find_fre(struct sframe_section *sec, > + struct sframe_fde *fde, unsigned long ip, > + struct unwind_user_frame *frame) > +{ > + unsigned char fde_type = SFRAME_FUNC_FDE_TYPE(fde->info); > + struct sframe_fre *fre, *prev_fre = NULL; > + struct sframe_fre fres[2]; you only need prev_fre->ip_off, so why all this `which` and `fres[2]` business if all you need is prev_fre_off and a bool whether you have prev_fre at all? > + unsigned long fre_addr; > + bool which = false; > + unsigned int i; > + s32 ip_off; > + > + ip_off = (s32)(ip - sec->sframe_start) - fde->start_addr; > + > + if (fde_type == SFRAME_FDE_TYPE_PCMASK) > + ip_off %= fde->rep_size; did you check that fde->rep_size is not zero? > + > + fre_addr = sec->fres_start + fde->fres_off; > + > + for (i = 0; i < fde->fres_num; i++) { why not binary search? seem more logical to guard against cases with lots of FREs and be pretty fast in common case anyways. > + int ret; > + > + /* > + * Alternate between the two fre_addr[] entries for 'fre' and > + * 'prev_fre'. > + */ > + fre = which ? fres : fres + 1; > + which = !which; > + > + ret = __read_fre(sec, fde, fre_addr, fre); > + if (ret) > + return ret; > + > + fre_addr += fre->size; > + > + if (prev_fre && fre->ip_off <= prev_fre->ip_off) > + return -EFAULT; > + > + if (fre->ip_off > ip_off) > + break; > + > + prev_fre = fre; > + } > + > + if (!prev_fre) > + return -EINVAL; > + fre = prev_fre; > + > + frame->cfa_off = fre->cfa_off; > + frame->ra_off = fre->ra_off; > + frame->fp_off = fre->fp_off; > + frame->use_fp = SFRAME_FRE_CFA_BASE_REG_ID(fre->info) == SFRAME_BASE_REG_FP; > + > + return 0; > +} > + [...]
On Fri, Jan 24, 2025 at 10:02:46AM -0800, Andrii Nakryiko wrote: > On Tue, Jan 21, 2025 at 6:32 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote: > > +static __always_inline int __read_fde(struct sframe_section *sec, > > + unsigned int fde_num, > > + struct sframe_fde *fde) > > +{ > > + unsigned long fde_addr, ip; > > + > > + fde_addr = sec->fdes_start + (fde_num * sizeof(struct sframe_fde)); > > + unsafe_copy_from_user(fde, (void __user *)fde_addr, > > + sizeof(struct sframe_fde), Efault); > > + > > + ip = sec->sframe_start + fde->start_addr; > > + if (ip < sec->text_start || ip > sec->text_end) > > ip >= sec->test_end ? ip == sec->test_end doesn't make sense, no? Believe it or not, this may have been intentional: 'text_end' can actually be a valid stack return address if the last instruction of the section is a call to a noreturn function. The unwinder still needs to know the state of the stack at that point. That said, there would be no reason to put an FDE at the end. So yeah, it should be '>='. > > +static __always_inline int __read_fre(struct sframe_section *sec, > > + struct sframe_fde *fde, > > + unsigned long fre_addr, > > + struct sframe_fre *fre) > > +{ > > + unsigned char fde_type = SFRAME_FUNC_FDE_TYPE(fde->info); > > + unsigned char fre_type = SFRAME_FUNC_FRE_TYPE(fde->info); > > + unsigned char offset_count, offset_size; > > + s32 ip_off, cfa_off, ra_off, fp_off; > > + unsigned long cur = fre_addr; > > + unsigned char addr_size; > > + u8 info; > > + > > + addr_size = fre_type_to_size(fre_type); > > + if (!addr_size) > > + return -EFAULT; > > + > > + if (fre_addr + addr_size + 1 > sec->fres_end) > > nit: isn't this the same as `fre_addr + addr_size >= sec->fres_end` ? Yes, but that would further obfuscate the fact that this is validating that the next two reads don't go past sec->fres_end: UNSAFE_GET_USER_INC(ip_off, cur, addr_size, Efault); UNSAFE_GET_USER_INC(info, cur, 1, Efault); The explicit "1" is for that second read. I'll do something like SFRAME_FRE_INFO_SIZE instead of 1 to make that clearer. > > + return -EFAULT; > > + > > + UNSAFE_GET_USER_INC(ip_off, cur, addr_size, Efault); > > + if (fde_type == SFRAME_FDE_TYPE_PCINC && ip_off > fde->func_size) > > is ip_off == fde->func_size allowable? This was another one where I was probably thinking about the whole "last insn could be a call to noreturn" scenario. But even in that case, the frame would be unchanged after the call, so there shouldn't need to be an FRE there. In fact, for the unwinder to deal with that scenario, for a call frame (as opposed to an interrupted one), it should always subtract one from the return address before calling into sframe so that it points to the calling instruction. </me makes note to go do that> So yeah, this looks like another off-by-one, caused by overthinking yet not thinking enough. > > + return -EFAULT; > > + > > + UNSAFE_GET_USER_INC(info, cur, 1, Efault); > > + offset_count = SFRAME_FRE_OFFSET_COUNT(info); > > + offset_size = offset_size_enum_to_size(SFRAME_FRE_OFFSET_SIZE(info)); > > + if (!offset_count || !offset_size) > > + return -EFAULT; > > + > > + if (cur + (offset_count * offset_size) > sec->fres_end) > > offset_count * offset_size done in u8 can overflow, no? maybe upcast > to unsigned long or use check_add_overflow? offset_size is <= 2 as returned by offset_size_enum_to_size(). offset_count is expected to be <= 3, enforced by the !offset_count check at the bottom. An overflow here would be harmless as it would be caught by the !offset_count anyway. Though I also notice offset_count isn't big enough to hold the 2-byte SFRAME_FRE_OFFSET_COUNT() value. Which is harmless for the same reason, but yeah I'll make offset_count an unsigned int. > > +static __always_inline int __find_fre(struct sframe_section *sec, > > + struct sframe_fde *fde, unsigned long ip, > > + struct unwind_user_frame *frame) > > +{ > > + unsigned char fde_type = SFRAME_FUNC_FDE_TYPE(fde->info); > > + struct sframe_fre *fre, *prev_fre = NULL; > > + struct sframe_fre fres[2]; > > you only need prev_fre->ip_off, so why all this `which` and `fres[2]` > business if all you need is prev_fre_off and a bool whether you have > prev_fre at all? So this cleverness probably needs a comment. prev_fre is actually needed after the loop: > > + if (!prev_fre) > > + return -EINVAL; > > + fre = prev_fre; In the body of the loop, prev_fre is a tentative match, unless the next fre also matches, in which case that one becomes the new tentative match. I'll add a comment. Also it'll probably be less confusing if I rename "prev_fre" to "fre", and "fre" to "next_fre". > > + unsigned long fre_addr; > > + bool which = false; > > + unsigned int i; > > + s32 ip_off; > > + > > + ip_off = (s32)(ip - sec->sframe_start) - fde->start_addr; > > + > > + if (fde_type == SFRAME_FDE_TYPE_PCMASK) > > + ip_off %= fde->rep_size; > > did you check that fde->rep_size is not zero? I did not, good catch! > > + fre_addr = sec->fres_start + fde->fres_off; > > + > > + for (i = 0; i < fde->fres_num; i++) { > > why not binary search? seem more logical to guard against cases with > lots of FREs and be pretty fast in common case anyways. That would be nice, but the FREs are variable-sized and you don't know how big one is until you start reading it.
On Fri, Jan 24, 2025 at 1:41 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote: > > On Fri, Jan 24, 2025 at 10:02:46AM -0800, Andrii Nakryiko wrote: > > On Tue, Jan 21, 2025 at 6:32 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote: > > > +static __always_inline int __read_fde(struct sframe_section *sec, > > > + unsigned int fde_num, > > > + struct sframe_fde *fde) > > > +{ > > > + unsigned long fde_addr, ip; > > > + > > > + fde_addr = sec->fdes_start + (fde_num * sizeof(struct sframe_fde)); > > > + unsafe_copy_from_user(fde, (void __user *)fde_addr, > > > + sizeof(struct sframe_fde), Efault); > > > + > > > + ip = sec->sframe_start + fde->start_addr; > > > + if (ip < sec->text_start || ip > sec->text_end) > > > > ip >= sec->test_end ? ip == sec->test_end doesn't make sense, no? > > Believe it or not, this may have been intentional: 'text_end' can > actually be a valid stack return address if the last instruction of the > section is a call to a noreturn function. The unwinder still needs to > know the state of the stack at that point. > TIL... > That said, there would be no reason to put an FDE at the end. So yeah, > it should be '>='. > > > > +static __always_inline int __read_fre(struct sframe_section *sec, > > > + struct sframe_fde *fde, > > > + unsigned long fre_addr, > > > + struct sframe_fre *fre) > > > +{ > > > + unsigned char fde_type = SFRAME_FUNC_FDE_TYPE(fde->info); > > > + unsigned char fre_type = SFRAME_FUNC_FRE_TYPE(fde->info); > > > + unsigned char offset_count, offset_size; > > > + s32 ip_off, cfa_off, ra_off, fp_off; > > > + unsigned long cur = fre_addr; > > > + unsigned char addr_size; > > > + u8 info; > > > + > > > + addr_size = fre_type_to_size(fre_type); > > > + if (!addr_size) > > > + return -EFAULT; > > > + > > > + if (fre_addr + addr_size + 1 > sec->fres_end) > > > > nit: isn't this the same as `fre_addr + addr_size >= sec->fres_end` ? > > Yes, but that would further obfuscate the fact that this is validating > that the next two reads don't go past sec->fres_end: > > UNSAFE_GET_USER_INC(ip_off, cur, addr_size, Efault); > UNSAFE_GET_USER_INC(info, cur, 1, Efault); > > The explicit "1" is for that second read. > > I'll do something like SFRAME_FRE_INFO_SIZE instead of 1 to make that > clearer. yeah, I definitely didn't connect 1 with the size of fre info byte, named #define makes sense, thanks! > > > > + return -EFAULT; > > > + > > > + UNSAFE_GET_USER_INC(ip_off, cur, addr_size, Efault); > > > + if (fde_type == SFRAME_FDE_TYPE_PCINC && ip_off > fde->func_size) > > > > is ip_off == fde->func_size allowable? > > This was another one where I was probably thinking about the whole "last > insn could be a call to noreturn" scenario. > > But even in that case, the frame would be unchanged after the call, so > there shouldn't need to be an FRE there. > > In fact, for the unwinder to deal with that scenario, for a call frame > (as opposed to an interrupted one), it should always subtract one from > the return address before calling into sframe so that it points to the > calling instruction. </me makes note to go do that> > > So yeah, this looks like another off-by-one, caused by overthinking yet > not thinking enough. > > > > + return -EFAULT; > > > + > > > + UNSAFE_GET_USER_INC(info, cur, 1, Efault); > > > + offset_count = SFRAME_FRE_OFFSET_COUNT(info); > > > + offset_size = offset_size_enum_to_size(SFRAME_FRE_OFFSET_SIZE(info)); > > > + if (!offset_count || !offset_size) > > > + return -EFAULT; > > > + > > > + if (cur + (offset_count * offset_size) > sec->fres_end) > > > > offset_count * offset_size done in u8 can overflow, no? maybe upcast > > to unsigned long or use check_add_overflow? > > offset_size is <= 2 as returned by offset_size_enum_to_size(). > > offset_count is expected to be <= 3, enforced by the !offset_count check > at the bottom. > > An overflow here would be harmless as it would be caught by the > !offset_count anyway. Though I also notice offset_count isn't big > enough to hold the 2-byte SFRAME_FRE_OFFSET_COUNT() value. Which is > harmless for the same reason, but yeah I'll make offset_count an > unsigned int. yeah, the less whoever is reading has to worry about overflows, the better, thanks! > > > > +static __always_inline int __find_fre(struct sframe_section *sec, > > > + struct sframe_fde *fde, unsigned long ip, > > > + struct unwind_user_frame *frame) > > > +{ > > > + unsigned char fde_type = SFRAME_FUNC_FDE_TYPE(fde->info); > > > + struct sframe_fre *fre, *prev_fre = NULL; > > > + struct sframe_fre fres[2]; > > > > you only need prev_fre->ip_off, so why all this `which` and `fres[2]` > > business if all you need is prev_fre_off and a bool whether you have > > prev_fre at all? > > So this cleverness probably needs a comment. prev_fre is actually > needed after the loop: > > > > + if (!prev_fre) > > > + return -EINVAL; > > > + fre = prev_fre; > > In the body of the loop, prev_fre is a tentative match, unless the next > fre also matches, in which case that one becomes the new tentative > match. > > I'll add a comment. Also it'll probably be less confusing if I rename > "prev_fre" to "fre", and "fre" to "next_fre". ah, yeah, I missed that, you are right > > > > + unsigned long fre_addr; > > > + bool which = false; > > > + unsigned int i; > > > + s32 ip_off; > > > + > > > + ip_off = (s32)(ip - sec->sframe_start) - fde->start_addr; > > > + > > > + if (fde_type == SFRAME_FDE_TYPE_PCMASK) > > > + ip_off %= fde->rep_size; > > > > did you check that fde->rep_size is not zero? > > I did not, good catch! > > > > + fre_addr = sec->fres_start + fde->fres_off; > > > + > > > + for (i = 0; i < fde->fres_num; i++) { > > > > why not binary search? seem more logical to guard against cases with > > lots of FREs and be pretty fast in common case anyways. > > That would be nice, but the FREs are variable-sized and you don't know > how big one is until you start reading it. ah, another non-obvious thing, yeah... do you think it's worth fixing this and making FREs binary searchable in v3? Indu, do you have some stats on distribution of FRE count per FDE in practice? Tbh, FRE format is bothering me quite a lot... but let's discuss that in another thread with you and Indu > > -- > Josh
On 28.01.2025 01:39, Andrii Nakryiko wrote: > On Fri, Jan 24, 2025 at 1:41 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote: >> On Fri, Jan 24, 2025 at 10:02:46AM -0800, Andrii Nakryiko wrote: >>> On Tue, Jan 21, 2025 at 6:32 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote: >>>> + UNSAFE_GET_USER_INC(info, cur, 1, Efault); >>>> + offset_count = SFRAME_FRE_OFFSET_COUNT(info); >>>> + offset_size = offset_size_enum_to_size(SFRAME_FRE_OFFSET_SIZE(info)); >>>> + if (!offset_count || !offset_size) >>>> + return -EFAULT; >>>> + >>>> + if (cur + (offset_count * offset_size) > sec->fres_end) >>> >>> offset_count * offset_size done in u8 can overflow, no? maybe upcast >>> to unsigned long or use check_add_overflow? The maximum offset_count * offset_size is 15 * 4 = 60 if I am not wrong: >> offset_size is <= 2 as returned by offset_size_enum_to_size(). SFrame V2 FRE offset sizes are either 1, 2, or 4 bytes. This is also reflected in offset_size_enum_to_size(). >> offset_count is expected to be <= 3, enforced by the !offset_count check >> at the bottom. SFrame V2 FRE offset count is 4 bits unsigned, so 0 <= offset_count <= 15. >> An overflow here would be harmless as it would be caught by the >> !offset_count anyway. Though I also notice offset_count isn't big >> enough to hold the 2-byte SFRAME_FRE_OFFSET_COUNT() value. Which is >> harmless for the same reason, but yeah I'll make offset_count an >> unsigned int. As mentioned above the FRE offset count is 4 bits, not 2 bytes. This is also reflected in SFRAME_FRE_OFFSET_COUNT(). Regards, Jens
On 28.01.2025 01:39, Andrii Nakryiko wrote: > On Fri, Jan 24, 2025 at 1:41 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote: >> On Fri, Jan 24, 2025 at 10:02:46AM -0800, Andrii Nakryiko wrote: >>> On Tue, Jan 21, 2025 at 6:32 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote: >>>> + fre_addr = sec->fres_start + fde->fres_off; >>>> + >>>> + for (i = 0; i < fde->fres_num; i++) { >>> >>> why not binary search? seem more logical to guard against cases with >>> lots of FREs and be pretty fast in common case anyways. >> >> That would be nice, but the FREs are variable-sized and you don't know >> how big one is until you start reading it. > > ah, another non-obvious thing, yeah... do you think it's worth fixing > this and making FREs binary searchable in v3? > > Indu, do you have some stats on distribution of FRE count per FDE in practice? > > Tbh, FRE format is bothering me quite a lot... but let's discuss that > in another thread with you and Indu I would be interested to be part of that discussion. I could share some preliminary stats from my s390 work. Thanks and regards, Jens
On Tue, Jan 28, 2025 at 11:50:25AM +0100, Jens Remus wrote: > On 28.01.2025 01:39, Andrii Nakryiko wrote: > > On Fri, Jan 24, 2025 at 1:41 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote: > > > On Fri, Jan 24, 2025 at 10:02:46AM -0800, Andrii Nakryiko wrote: > > > > On Tue, Jan 21, 2025 at 6:32 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote: > > > > > > + UNSAFE_GET_USER_INC(info, cur, 1, Efault); > > > > > + offset_count = SFRAME_FRE_OFFSET_COUNT(info); > > > > > + offset_size = offset_size_enum_to_size(SFRAME_FRE_OFFSET_SIZE(info)); > > > > > + if (!offset_count || !offset_size) > > > > > + return -EFAULT; > > > > > + > > > > > + if (cur + (offset_count * offset_size) > sec->fres_end) > > > > > > > > offset_count * offset_size done in u8 can overflow, no? maybe upcast > > > > to unsigned long or use check_add_overflow? > > The maximum offset_count * offset_size is 15 * 4 = 60 if I am not wrong: > > > > offset_size is <= 2 as returned by offset_size_enum_to_size(). > > SFrame V2 FRE offset sizes are either 1, 2, or 4 bytes. This is also > reflected in offset_size_enum_to_size(). > > > > offset_count is expected to be <= 3, enforced by the !offset_count check > > > at the bottom. > > SFrame V2 FRE offset count is 4 bits unsigned, so 0 <= offset_count <= 15. > > > An overflow here would be harmless as it would be caught by the > > > !offset_count anyway. Though I also notice offset_count isn't big > > > enough to hold the 2-byte SFRAME_FRE_OFFSET_COUNT() value. Which is > > > harmless for the same reason, but yeah I'll make offset_count an > > > unsigned int. > > As mentioned above the FRE offset count is 4 bits, not 2 bytes. This is > also reflected in SFRAME_FRE_OFFSET_COUNT(). You are right on both counts, not sure what I was smoking that day.
On 1/21/25 6:31 PM, Josh Poimboeuf wrote: > In preparation for using sframe to unwind user space stacks, add an > sframe_find() interface for finding the sframe information associated > with a given text address. > > For performance, use user_read_access_begin() and the corresponding > unsafe_*() accessors. Note that use of pr_debug() in uaccess-enabled > regions would break noinstr validation, so there aren't any debug > messages yet. That will be added in a subsequent commit. > > Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org> > --- > include/linux/sframe.h | 5 + > kernel/unwind/sframe.c | 295 ++++++++++++++++++++++++++++++++++- > kernel/unwind/sframe_debug.h | 35 +++++ > 3 files changed, 331 insertions(+), 4 deletions(-) > create mode 100644 kernel/unwind/sframe_debug.h > > diff --git a/include/linux/sframe.h b/include/linux/sframe.h > index ff4b9d1dbd00..2e70085a1e89 100644 > --- a/include/linux/sframe.h > +++ b/include/linux/sframe.h > @@ -3,11 +3,14 @@ > #define _LINUX_SFRAME_H > > #include <linux/mm_types.h> > +#include <linux/srcu.h> > #include <linux/unwind_user_types.h> > > #ifdef CONFIG_HAVE_UNWIND_USER_SFRAME > > struct sframe_section { > + struct rcu_head rcu; > + > unsigned long sframe_start; > unsigned long sframe_end; > unsigned long text_start; > @@ -28,6 +31,7 @@ extern void sframe_free_mm(struct mm_struct *mm); > extern int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end, > unsigned long text_start, unsigned long text_end); > extern int sframe_remove_section(unsigned long sframe_addr); > +extern int sframe_find(unsigned long ip, struct unwind_user_frame *frame); > > static inline bool current_has_sframe(void) > { > @@ -42,6 +46,7 @@ static inline bool current_has_sframe(void) > static inline void sframe_free_mm(struct mm_struct *mm) {} > static inline int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end, unsigned long text_start, unsigned long text_end) { return -ENOSYS; } > static inline int sframe_remove_section(unsigned long sframe_addr) { return -ENOSYS; } > +static inline int sframe_find(unsigned long ip, struct unwind_user_frame *frame) { return -ENOSYS; } > static inline bool current_has_sframe(void) { return false; } > > #endif /* CONFIG_HAVE_UNWIND_USER_SFRAME */ > diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c > index fa7d87ffd00a..1a35615a361e 100644 > --- a/kernel/unwind/sframe.c > +++ b/kernel/unwind/sframe.c > @@ -15,9 +15,287 @@ > #include <linux/unwind_user_types.h> > > #include "sframe.h" > +#include "sframe_debug.h" > > -#define dbg(fmt, ...) \ > - pr_debug("%s (%d): " fmt, current->comm, current->pid, ##__VA_ARGS__) > +struct sframe_fre { > + unsigned int size; > + s32 ip_off; > + s32 cfa_off; > + s32 ra_off; > + s32 fp_off; > + u8 info; > +}; > + > +DEFINE_STATIC_SRCU(sframe_srcu); > + > +static __always_inline unsigned char fre_type_to_size(unsigned char fre_type) > +{ > + if (fre_type > 2) > + return 0; > + return 1 << fre_type; > +} > + > +static __always_inline unsigned char offset_size_enum_to_size(unsigned char off_size) > +{ > + if (off_size > 2) > + return 0; > + return 1 << off_size; > +} > + > +static __always_inline int __read_fde(struct sframe_section *sec, > + unsigned int fde_num, > + struct sframe_fde *fde) > +{ > + unsigned long fde_addr, ip; > + > + fde_addr = sec->fdes_start + (fde_num * sizeof(struct sframe_fde)); > + unsafe_copy_from_user(fde, (void __user *)fde_addr, > + sizeof(struct sframe_fde), Efault); > + > + ip = sec->sframe_start + fde->start_addr; > + if (ip < sec->text_start || ip > sec->text_end) > + return -EINVAL; > + > + return 0; > + > +Efault: > + return -EFAULT; > +} > + > +static __always_inline int __find_fde(struct sframe_section *sec, > + unsigned long ip, > + struct sframe_fde *fde) > +{ > + s32 ip_off, func_off_low = S32_MIN, func_off_high = S32_MAX; > + struct sframe_fde __user *first, *low, *high, *found = NULL; > + int ret; > + > + ip_off = ip - sec->sframe_start; > + > + first = (void __user *)sec->fdes_start; > + low = first; > + high = first + sec->num_fdes - 1; > + > + while (low <= high) { > + struct sframe_fde __user *mid; > + s32 func_off; > + > + mid = low + ((high - low) / 2); > + > + unsafe_get_user(func_off, (s32 __user *)mid, Efault); > + > + if (ip_off >= func_off) { > + if (func_off < func_off_low) > + return -EFAULT; > + > + func_off_low = func_off; > + > + found = mid; > + low = mid + 1; > + } else { > + if (func_off > func_off_high) > + return -EFAULT; > + > + func_off_high = func_off; > + > + high = mid - 1; > + } > + } > + > + if (!found) > + return -EINVAL; > + > + ret = __read_fde(sec, found - first, fde); > + if (ret) > + return ret; > + > + /* make sure it's not in a gap */ > + if (ip_off < fde->start_addr || ip_off >= fde->start_addr + fde->func_size) > + return -EINVAL; > + > + return 0; > + > +Efault: > + return -EFAULT; > +} > + > +#define __UNSAFE_GET_USER_INC(to, from, type, label) \ > +({ \ > + type __to; \ > + unsafe_get_user(__to, (type __user *)from, label); \ > + from += sizeof(__to); \ > + to = (typeof(to))__to; \ > +}) > + > +#define UNSAFE_GET_USER_INC(to, from, size, label) \ > +({ \ > + switch (size) { \ > + case 1: \ > + __UNSAFE_GET_USER_INC(to, from, u8, label); \ > + break; \ > + case 2: \ > + __UNSAFE_GET_USER_INC(to, from, u16, label); \ > + break; \ > + case 4: \ > + __UNSAFE_GET_USER_INC(to, from, u32, label); \ > + break; \ > + default: \ > + return -EFAULT; \ > + } \ > +}) > + > +static __always_inline int __read_fre(struct sframe_section *sec, > + struct sframe_fde *fde, > + unsigned long fre_addr, > + struct sframe_fre *fre) > +{ > + unsigned char fde_type = SFRAME_FUNC_FDE_TYPE(fde->info); > + unsigned char fre_type = SFRAME_FUNC_FRE_TYPE(fde->info); > + unsigned char offset_count, offset_size; > + s32 ip_off, cfa_off, ra_off, fp_off; > + unsigned long cur = fre_addr; > + unsigned char addr_size; > + u8 info; > + > + addr_size = fre_type_to_size(fre_type); > + if (!addr_size) > + return -EFAULT; > + > + if (fre_addr + addr_size + 1 > sec->fres_end) > + return -EFAULT; > + > + UNSAFE_GET_USER_INC(ip_off, cur, addr_size, Efault); > + if (fde_type == SFRAME_FDE_TYPE_PCINC && ip_off > fde->func_size) > + return -EFAULT; > + > + UNSAFE_GET_USER_INC(info, cur, 1, Efault); > + offset_count = SFRAME_FRE_OFFSET_COUNT(info); > + offset_size = offset_size_enum_to_size(SFRAME_FRE_OFFSET_SIZE(info)); > + if (!offset_count || !offset_size) > + return -EFAULT; > + > + if (cur + (offset_count * offset_size) > sec->fres_end) > + return -EFAULT; > + > + fre->size = addr_size + 1 + (offset_count * offset_size); > + > + UNSAFE_GET_USER_INC(cfa_off, cur, offset_size, Efault); > + offset_count--; > + > + ra_off = sec->ra_off; > + if (!ra_off) { > + if (!offset_count--) > + return -EFAULT; > + > + UNSAFE_GET_USER_INC(ra_off, cur, offset_size, Efault); > + } > + > + fp_off = sec->fp_off; > + if (!fp_off && offset_count) { > + offset_count--; > + UNSAFE_GET_USER_INC(fp_off, cur, offset_size, Efault); > + } > + > + if (offset_count) > + return -EFAULT; > + > + fre->ip_off = ip_off; > + fre->cfa_off = cfa_off; > + fre->ra_off = ra_off; > + fre->fp_off = fp_off; > + fre->info = info; > + > + return 0; > + > +Efault: > + return -EFAULT; > +} > + > +static __always_inline int __find_fre(struct sframe_section *sec, > + struct sframe_fde *fde, unsigned long ip, > + struct unwind_user_frame *frame) > +{ > + unsigned char fde_type = SFRAME_FUNC_FDE_TYPE(fde->info); > + struct sframe_fre *fre, *prev_fre = NULL; > + struct sframe_fre fres[2]; > + unsigned long fre_addr; > + bool which = false; > + unsigned int i; > + s32 ip_off; > + > + ip_off = (s32)(ip - sec->sframe_start) - fde->start_addr; > + > + if (fde_type == SFRAME_FDE_TYPE_PCMASK) > + ip_off %= fde->rep_size; > + > + fre_addr = sec->fres_start + fde->fres_off; > + > + for (i = 0; i < fde->fres_num; i++) { > + int ret; > + > + /* > + * Alternate between the two fre_addr[] entries for 'fre' and > + * 'prev_fre'. > + */ > + fre = which ? fres : fres + 1; > + which = !which; > + > + ret = __read_fre(sec, fde, fre_addr, fre); > + if (ret) > + return ret; > + It should be possible to only read the ip_off and info from FRE and defer the reading of offsets (as done in __read_fre) until later when you do need the offsets. See below. We can find the relevant FRE with the following pieces of information: - ip_off - fre_size (this will mean we need to read the uin8_t info in the FRE) > + fre_addr += fre->size; > + > + if (prev_fre && fre->ip_off <= prev_fre->ip_off) > + return -EFAULT; > + > + if (fre->ip_off > ip_off) > + break; > + > + prev_fre = fre; > + } > + > + if (!prev_fre) > + return -EINVAL; > + fre = prev_fre; > + (Invoke the __read_fre here as we know now that this FRE is what we are looking for.) > + frame->cfa_off = fre->cfa_off; > + frame->ra_off = fre->ra_off; > + frame->fp_off = fre->fp_off; > + frame->use_fp = SFRAME_FRE_CFA_BASE_REG_ID(fre->info) == SFRAME_BASE_REG_FP; > + > + return 0; > +}
On 22.01.2025 03:31, Josh Poimboeuf wrote: > In preparation for using sframe to unwind user space stacks, add an > sframe_find() interface for finding the sframe information associated > with a given text address. > > For performance, use user_read_access_begin() and the corresponding > unsafe_*() accessors. Note that use of pr_debug() in uaccess-enabled > regions would break noinstr validation, so there aren't any debug > messages yet. That will be added in a subsequent commit. > > Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org> > diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c > +struct sframe_fre { > + unsigned int size; > + s32 ip_off; The IP offset (from function start) in the SFrame V2 FDE is unsigned: u32 ip_off; > + s32 cfa_off; > + s32 ra_off; > + s32 fp_off; > + u8 info; > +}; ... > +#define __UNSAFE_GET_USER_INC(to, from, type, label) \ > +({ \ > + type __to; \ > + unsafe_get_user(__to, (type __user *)from, label); \ > + from += sizeof(__to); \ > + to = (typeof(to))__to; \ > +}) > + > +#define UNSAFE_GET_USER_INC(to, from, size, label) \ > +({ \ > + switch (size) { \ > + case 1: \ > + __UNSAFE_GET_USER_INC(to, from, u8, label); \ > + break; \ > + case 2: \ > + __UNSAFE_GET_USER_INC(to, from, u16, label); \ > + break; \ > + case 4: \ > + __UNSAFE_GET_USER_INC(to, from, u32, label); \ > + break; \ > + default: \ > + return -EFAULT; \ > + } \ > +}) This does not work for the signed SFrame fields, such as the FRE CFA, RA, and FP offsets, as it does not perform the required sign extension. One option would be to rename to UNSAFE_GET_USER_UNSIGNED_INC() and re-introduce UNSAFE_GET_USER_SIGNED_INC() using s8, s16, and s32. > +static __always_inline int __read_fre(struct sframe_section *sec, > + struct sframe_fde *fde, > + unsigned long fre_addr, > + struct sframe_fre *fre) > +{ > + unsigned char fde_type = SFRAME_FUNC_FDE_TYPE(fde->info); > + unsigned char fre_type = SFRAME_FUNC_FRE_TYPE(fde->info); > + unsigned char offset_count, offset_size; > + s32 ip_off, cfa_off, ra_off, fp_off; The IP offset (from function start) in the SFrame V2 FRE is unsigned: u32 ip_off; s32 cfa_off, ra_off, fp_off; > + unsigned long cur = fre_addr; > + unsigned char addr_size; > + u8 info; > + > + addr_size = fre_type_to_size(fre_type); > + if (!addr_size) > + return -EFAULT; > + > + if (fre_addr + addr_size + 1 > sec->fres_end) > + return -EFAULT; > + > + UNSAFE_GET_USER_INC(ip_off, cur, addr_size, Efault); Ok: The SFrame V2 FRE IP offset is unsigned u8, u16, or u32. > + if (fde_type == SFRAME_FDE_TYPE_PCINC && ip_off > fde->func_size) > + return -EFAULT; > + > + UNSAFE_GET_USER_INC(info, cur, 1, Efault); Ok: The SFrame V2 FRE info word is one byte of data. > + offset_count = SFRAME_FRE_OFFSET_COUNT(info); > + offset_size = offset_size_enum_to_size(SFRAME_FRE_OFFSET_SIZE(info)); > + if (!offset_count || !offset_size) > + return -EFAULT; > + > + if (cur + (offset_count * offset_size) > sec->fres_end) > + return -EFAULT; > + > + fre->size = addr_size + 1 + (offset_count * offset_size); > + > + UNSAFE_GET_USER_INC(cfa_off, cur, offset_size, Efault); Issue: The SFrame V2 FRE CFA offset is signed s8, s16, or s32. Sign extension required when storing in s32. > + offset_count--; > + > + ra_off = sec->ra_off; > + if (!ra_off) { > + if (!offset_count--) > + return -EFAULT; > + > + UNSAFE_GET_USER_INC(ra_off, cur, offset_size, Efault); Issue: The SFrame V2 FRE RA offset is signed s8, s16, or s32. Sign extension required when storing in s32. > + } > + > + fp_off = sec->fp_off; > + if (!fp_off && offset_count) { > + offset_count--; > + UNSAFE_GET_USER_INC(fp_off, cur, offset_size, Efault); Issue: The SFrame V2 FRE FP offset is signed s8, s16, or s32. Sign extension required when storing in s32. > + } > + > + if (offset_count) > + return -EFAULT; > + > + fre->ip_off = ip_off; > + fre->cfa_off = cfa_off; > + fre->ra_off = ra_off; > + fre->fp_off = fp_off; > + fre->info = info; > + > + return 0; > + > +Efault: > + return -EFAULT; > +} > + > +static __always_inline int __find_fre(struct sframe_section *sec, > + struct sframe_fde *fde, unsigned long ip, > + struct unwind_user_frame *frame) > +{ > + unsigned char fde_type = SFRAME_FUNC_FDE_TYPE(fde->info); > + struct sframe_fre *fre, *prev_fre = NULL; > + struct sframe_fre fres[2]; > + unsigned long fre_addr; > + bool which = false; > + unsigned int i; > + s32 ip_off; > + > + ip_off = (s32)(ip - sec->sframe_start) - fde->start_addr; The IP offset (from function start) in the SFrame V2 FRE is unsigned. The function start address offset (from .sframe section begin) is signed. Therefore: u32 ip_off; ip_off = ip - (sec->sframe_start + fde->start_addr); > + > + if (fde_type == SFRAME_FDE_TYPE_PCMASK) > + ip_off %= fde->rep_size; Following is a patch with the suggested changes, that were required to make unwinding of user space using SFrame work on s390: diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c index bba14c5fe0f5..ea2d491ea68f 100644 --- a/kernel/unwind/sframe.c +++ b/kernel/unwind/sframe.c @@ -19,7 +19,7 @@ struct sframe_fre { unsigned int size; - s32 ip_off; + u32 ip_off; s32 cfa_off; s32 ra_off; s32 fp_off; @@ -136,7 +136,26 @@ static __always_inline int __find_fde(struct sframe_section *sec, to = (typeof(to))__to; \ }) -#define UNSAFE_GET_USER_INC(to, from, size, label) \ +#define UNSAFE_GET_USER_SIGNED_INC(to, from, size, label) \ +({ \ + switch (size) { \ + case 1: \ + __UNSAFE_GET_USER_INC(to, from, s8, label); \ + break; \ + case 2: \ + __UNSAFE_GET_USER_INC(to, from, s16, label); \ + break; \ + case 4: \ + __UNSAFE_GET_USER_INC(to, from, s32, label); \ + break; \ + default: \ + dbg_sec_uaccess("%d: bad UNSAFE_GET_USER_SIGNED__INC size %u\n",\ + __LINE__, size); \ + return -EFAULT; \ + } \ +}) + +#define UNSAFE_GET_USER_UNSIGNED_INC(to, from, size, label) \ ({ \ switch (size) { \ case 1: \ @@ -149,7 +168,7 @@ static __always_inline int __find_fde(struct sframe_section *sec, __UNSAFE_GET_USER_INC(to, from, u32, label); \ break; \ default: \ - dbg_sec_uaccess("%d: bad UNSAFE_GET_USER_INC size %u\n",\ + dbg_sec_uaccess("%d: bad UNSAFE_GET_USER_UNSIGNED_INC size %u\n",\ __LINE__, size); \ return -EFAULT; \ } \ @@ -163,7 +182,8 @@ static __always_inline int __read_fre(struct sframe_section *sec, unsigned char fde_type = SFRAME_FUNC_FDE_TYPE(fde->info); unsigned char fre_type = SFRAME_FUNC_FRE_TYPE(fde->info); unsigned char offset_count, offset_size; - s32 ip_off, cfa_off, ra_off, fp_off; + u32 ip_off; + s32 cfa_off, ra_off, fp_off; unsigned long cur = fre_addr; unsigned char addr_size; u8 info; @@ -179,14 +199,14 @@ static __always_inline int __read_fre(struct sframe_section *sec, return -EFAULT; } - UNSAFE_GET_USER_INC(ip_off, cur, addr_size, Efault); + UNSAFE_GET_USER_UNSIGNED_INC(ip_off, cur, addr_size, Efault); if (fde_type == SFRAME_FDE_TYPE_PCINC && ip_off > fde->func_size) { dbg_sec_uaccess("fre starts past end of function: ip_off=0x%x, func_size=0x%x\n", ip_off, fde->func_size); return -EFAULT; } - UNSAFE_GET_USER_INC(info, cur, 1, Efault); + UNSAFE_GET_USER_UNSIGNED_INC(info, cur, 1, Efault); offset_count = SFRAME_FRE_OFFSET_COUNT(info); offset_size = offset_size_enum_to_size(SFRAME_FRE_OFFSET_SIZE(info)); if (!offset_count || !offset_size) { @@ -200,7 +220,7 @@ static __always_inline int __read_fre(struct sframe_section *sec, fre->size = addr_size + 1 + (offset_count * offset_size); - UNSAFE_GET_USER_INC(cfa_off, cur, offset_size, Efault); + UNSAFE_GET_USER_SIGNED_INC(cfa_off, cur, offset_size, Efault); offset_count--; ra_off = sec->ra_off; @@ -210,13 +230,13 @@ static __always_inline int __read_fre(struct sframe_section *sec, return -EFAULT; } - UNSAFE_GET_USER_INC(ra_off, cur, offset_size, Efault); + UNSAFE_GET_USER_SIGNED_INC(ra_off, cur, offset_size, Efault); } fp_off = sec->fp_off; if (!fp_off && offset_count) { offset_count--; - UNSAFE_GET_USER_INC(fp_off, cur, offset_size, Efault); + UNSAFE_GET_USER_SIGNED_INC(fp_off, cur, offset_size, Efault); } if (offset_count) { @@ -247,9 +267,9 @@ static __always_inline int __find_fre(struct sframe_section *sec, unsigned long fre_addr; bool which = false; unsigned int i; - s32 ip_off; + u32 ip_off; - ip_off = (s32)(ip - sec->sframe_start) - fde->start_addr; + ip_off = ip - (sec->sframe_start + fde->start_addr); if (fde_type == SFRAME_FDE_TYPE_PCMASK) ip_off %= fde->rep_size; Regards, Jens
On Fri, Jan 24, 2025 at 1:41 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote: > On Fri, Jan 24, 2025 at 10:02:46AM -0800, Andrii Nakryiko wrote: > > On Tue, Jan 21, 2025 at 6:32 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote: > > > +static __always_inline int __find_fre(struct sframe_section *sec, > > > + struct sframe_fde *fde, unsigned long ip, > > > + struct unwind_user_frame *frame) > > > +{ > > > + unsigned char fde_type = SFRAME_FUNC_FDE_TYPE(fde->info); > > > + struct sframe_fre *fre, *prev_fre = NULL; > > > + struct sframe_fre fres[2]; > > > > you only need prev_fre->ip_off, so why all this `which` and `fres[2]` > > business if all you need is prev_fre_off and a bool whether you have > > prev_fre at all? > > So this cleverness probably needs a comment. prev_fre is actually > needed after the loop: > > > > + if (!prev_fre) > > > + return -EINVAL; > > > + fre = prev_fre; > > In the body of the loop, prev_fre is a tentative match, unless the next > fre also matches, in which case that one becomes the new tentative > match. > > I'll add a comment. Also it'll probably be less confusing if I rename > "prev_fre" to "fre", and "fre" to "next_fre". > Nit: swap() might be a simplify way to alternate pointers between two fre_addr[] entries. For example, static __always_inline int __find_fre(struct sframe_section *sec, struct sframe_fde *fde, unsigned long ip, struct unwind_user_frame *frame) { /* intialize fres[] with invalid value */ struct sframe_fre fres[2] = {0}; struct sframe_fre *fre = &fres[1], *prev_fre = fres; for (i = 0; i < fde->fres_num; i++) { swap(fre, next_fre); ret = __read_fre(sec, fde, fre_addr, fre); ... if (fre->ip_off > ip_off) break; } if (fre->size == 0) return -EINVAL; ... }
On Thu, Jan 30, 2025 at 07:07:32AM -0800, Indu Bhagat wrote: > On 1/21/25 6:31 PM, Josh Poimboeuf wrote: > > + for (i = 0; i < fde->fres_num; i++) { > > + int ret; > > + > > + /* > > + * Alternate between the two fre_addr[] entries for 'fre' and > > + * 'prev_fre'. > > + */ > > + fre = which ? fres : fres + 1; > > + which = !which; > > + > > + ret = __read_fre(sec, fde, fre_addr, fre); > > + if (ret) > > + return ret; > > + > > It should be possible to only read the ip_off and info from FRE and defer > the reading of offsets (as done in __read_fre) until later when you do need > the offsets. See below. > > We can find the relevant FRE with the following pieces of information: > - ip_off > - fre_size (this will mean we need to read the uin8_t info in the FRE) Indeed, I'll skip reading the offsets until after the loop.
On Thu, Jan 30, 2025 at 04:47:00PM +0100, Jens Remus wrote: > On 22.01.2025 03:31, Josh Poimboeuf wrote: > > +struct sframe_fre { > > + unsigned int size; > > + s32 ip_off; > > The IP offset (from function start) in the SFrame V2 FDE is unsigned: > > u32 ip_off; Indeed. > > +#define __UNSAFE_GET_USER_INC(to, from, type, label) \ > > +({ \ > > + type __to; \ > > + unsafe_get_user(__to, (type __user *)from, label); \ > > + from += sizeof(__to); \ > > + to = (typeof(to))__to; \ > > +}) > > + > > +#define UNSAFE_GET_USER_INC(to, from, size, label) \ > > +({ \ > > + switch (size) { \ > > + case 1: \ > > + __UNSAFE_GET_USER_INC(to, from, u8, label); \ > > + break; \ > > + case 2: \ > > + __UNSAFE_GET_USER_INC(to, from, u16, label); \ > > + break; \ > > + case 4: \ > > + __UNSAFE_GET_USER_INC(to, from, u32, label); \ > > + break; \ > > + default: \ > > + return -EFAULT; \ > > + } \ > > +}) > > This does not work for the signed SFrame fields, such as the FRE CFA, > RA, and FP offsets, as it does not perform the required sign extension. > One option would be to rename to UNSAFE_GET_USER_UNSIGNED_INC() and > re-introduce UNSAFE_GET_USER_SIGNED_INC() using s8, s16, and s32. See the following line in __UNSAFE_GET_USER_INC(): to = (typeof(to))__to; Does that not do the sign extension?
On Thu, Jan 30, 2025 at 07:51:15PM +0000, Weinan Liu wrote: > Nit: swap() might be a simplify way to alternate pointers between two > fre_addr[] entries. > > For example, > > static __always_inline int __find_fre(struct sframe_section *sec, > struct sframe_fde *fde, unsigned long ip, > struct unwind_user_frame *frame) > { > /* intialize fres[] with invalid value */ > struct sframe_fre fres[2] = {0}; > struct sframe_fre *fre = &fres[1], *prev_fre = fres; > > for (i = 0; i < fde->fres_num; i++) { > swap(fre, next_fre); > ret = __read_fre(sec, fde, fre_addr, fre); Problem is, if it breaks out early here on the first iteration: > if (fre->ip_off > ip_off) > break; > } > > if (fre->size == 0) > return -EINVAL; Then fre isn't valid even though it has a nonzero size.
On 04.02.2025 19:51, Josh Poimboeuf wrote: > On Thu, Jan 30, 2025 at 04:47:00PM +0100, Jens Remus wrote: >> On 22.01.2025 03:31, Josh Poimboeuf wrote: >>> +#define __UNSAFE_GET_USER_INC(to, from, type, label) \ >>> +({ \ >>> + type __to; \ >>> + unsafe_get_user(__to, (type __user *)from, label); \ >>> + from += sizeof(__to); \ >>> + to = (typeof(to))__to; \ >>> +}) >>> + >>> +#define UNSAFE_GET_USER_INC(to, from, size, label) \ >>> +({ \ >>> + switch (size) { \ >>> + case 1: \ >>> + __UNSAFE_GET_USER_INC(to, from, u8, label); \ >>> + break; \ >>> + case 2: \ >>> + __UNSAFE_GET_USER_INC(to, from, u16, label); \ >>> + break; \ >>> + case 4: \ >>> + __UNSAFE_GET_USER_INC(to, from, u32, label); \ >>> + break; \ >>> + default: \ >>> + return -EFAULT; \ >>> + } \ >>> +}) >> >> This does not work for the signed SFrame fields, such as the FRE CFA, >> RA, and FP offsets, as it does not perform the required sign extension. >> One option would be to rename to UNSAFE_GET_USER_UNSIGNED_INC() and >> re-introduce UNSAFE_GET_USER_SIGNED_INC() using s8, s16, and s32. > > See the following line in __UNSAFE_GET_USER_INC(): > > to = (typeof(to))__to; > > Does that not do the sign extension? No. In practice with my proposed changes reverted and the following debugging code added: @@ -293,6 +293,10 @@ static __always_inline int __find_fre(struct sframe_section *sec, return -EINVAL; fre = prev_fre; + dbg_sec_uaccess("fre: ip_off=%u, cfa_off=%d, ra_off=%d, fp_off=%d, use_fp=%s, sp_val_off=%d\n", + fre->ip_off, fre->cfa_off, fre->ra_off, fre->fp_off, + SFRAME_FRE_CFA_BASE_REG_ID(fre->info) == SFRAME_BASE_REG_FP ? "y" : "n", + sframe_sp_val_off()); Excerpt from dmesg: sframe: /usr/lib/ld64.so.1: fre: ip_off=16, cfa_off=440, ra_off=208, fp_off=184, use_fp=n, sp_val_off=-160 sframe: /usr/lib/ld64.so.1: fre: ip_off=2600, cfa_off=672, ra_off=208, fp_off=184, use_fp=y, sp_val_off=-160 sframe: /usr/lib/ld64.so.1: fre: ip_off=10, cfa_off=368, ra_off=0, fp_off=0, use_fp=n, sp_val_off=-160 sframe: /usr/lib/ld64.so.1: fre: ip_off=722, cfa_off=672, ra_off=208, fp_off=184, use_fp=y, sp_val_off=-160 On s390 the register save slots have negative offsets from CFA (due to the CFA to be defined as SP at call site + 160). The RA, if saved, would be saved at CFA-48 on the stack. I.e. ra_off=-48 instead of ra_off=208 would have been correct. 208 = 0xd0 (unsigned) = -48 (signed) Looking at the code: UNSAFE_GET_USER_INC(ra_off, cur, offset_size, Efault); With offset_size=1 expands into: __UNSAFE_GET_USER_INC(/*to=*/ra_off, /*from=*cur, /*type=*/u8, /*label=*/Efault); Expands into: { u8 __to; unsafe_get_user(__to, (u8 __user *)cur, Efault); cur += sizeof(__to); ra_off = (typeof(ra_off))__to; } The issue is that on the last line __to is u8 instead of s8 and thus u8 gets casted to s32, which is performed without sign extension. __to would need to be s8 or get casted to s8 for sign extension to take place. Regards, Jens
On Wed, Feb 05, 2025 at 10:47:58AM +0100, Jens Remus wrote: > UNSAFE_GET_USER_INC(ra_off, cur, offset_size, Efault); > > With offset_size=1 expands into: > > __UNSAFE_GET_USER_INC(/*to=*/ra_off, /*from=*cur, /*type=*/u8, /*label=*/Efault); > > Expands into: > > { > u8 __to; > unsafe_get_user(__to, (u8 __user *)cur, Efault); > cur += sizeof(__to); > ra_off = (typeof(ra_off))__to; > } > > The issue is that on the last line __to is u8 instead of s8 and thus > u8 gets casted to s32, which is performed without sign extension. __to > would need to be s8 or get casted to s8 for sign extension to take > place. Ah, I get it now, thanks for spelling that out for me. Here's what I have at the moment: #define __UNSAFE_GET_USER_INC(to, from, type, label) \ ({ \ type __to; \ unsafe_get_user(__to, (type __user *)from, label); \ from += sizeof(__to); \ to = __to; \ }) #define __UNSAFE_GET_USER_INC(to, from, size, label, u_or_s) \ ({ \ switch (size) { \ case 1: \ __UNSAFE_GET_USER_INC(to, from, u_or_s##8, label); \ break; \ case 2: \ __UNSAFE_GET_USER_INC(to, from, u_or_s##16, label); \ break; \ case 4: \ __UNSAFE_GET_USER_INC(to, from, u_or_s##32, label); \ break; \ default: \ dbg_sec_uaccess("%d: bad unsafe_get_user() size %u\n", \ __LINE__, size); \ return -EFAULT; \ } \ }) #define UNSAFE_GET_USER_UNSIGNED_INC(to, from, size, label) \ __UNSAFE_GET_USER_INC(to, from, size, label, u) #define UNSAFE_GET_USER_SIGNED_INC(to, from, size, label) \ __UNSAFE_GET_USER_INC(to, from, size, label, s) #define UNSAFE_GET_USER_INC(to, from, size, label) \ _Generic(to, \ u8: UNSAFE_GET_USER_UNSIGNED_INC(to, from, size, label), \ u16: UNSAFE_GET_USER_UNSIGNED_INC(to, from, size, label), \ u32: UNSAFE_GET_USER_UNSIGNED_INC(to, from, size, label), \ s8: UNSAFE_GET_USER_SIGNED_INC(to, from, size, label), \ s16: UNSAFE_GET_USER_SIGNED_INC(to, from, size, label), \ s32: UNSAFE_GET_USER_SIGNED_INC(to, from, size, label))
On 07.02.2025 22:06, Josh Poimboeuf wrote: > On Wed, Feb 05, 2025 at 10:47:58AM +0100, Jens Remus wrote: >> UNSAFE_GET_USER_INC(ra_off, cur, offset_size, Efault); >> >> With offset_size=1 expands into: >> >> __UNSAFE_GET_USER_INC(/*to=*/ra_off, /*from=*cur, /*type=*/u8, /*label=*/Efault); >> >> Expands into: >> >> { >> u8 __to; >> unsafe_get_user(__to, (u8 __user *)cur, Efault); >> cur += sizeof(__to); >> ra_off = (typeof(ra_off))__to; >> } >> >> The issue is that on the last line __to is u8 instead of s8 and thus >> u8 gets casted to s32, which is performed without sign extension. __to >> would need to be s8 or get casted to s8 for sign extension to take >> place. > > Ah, I get it now, thanks for spelling that out for me. > > Here's what I have at the moment: Thanks! Using your new UNSAFE_GET_USER_INC() in all places works great on s390 when resolving the duplicate macro names (see below). > > #define __UNSAFE_GET_USER_INC(to, from, type, label) \ > ({ \ > type __to; \ > unsafe_get_user(__to, (type __user *)from, label); \ > from += sizeof(__to); \ > to = __to; \ > }) > > #define __UNSAFE_GET_USER_INC(to, from, size, label, u_or_s) \ That does not compile. One of the macros needs to be renamed. CC kernel/unwind/sframe.o kernel/unwind/sframe.c:141:9: warning: "__UNSAFE_GET_USER_INC" redefined 141 | #define __UNSAFE_GET_USER_INC(to, from, size, label, u_or_s) \ | ^~~~~~~~~~~~~~~~~~~~~ kernel/unwind/sframe.c:133:9: note: this is the location of the previous definition 133 | #define __UNSAFE_GET_USER_INC(to, from, type, label) \ | ^~~~~~~~~~~~~~~~~~~~~ > ({ \ > switch (size) { \ > case 1: \ > __UNSAFE_GET_USER_INC(to, from, u_or_s##8, label); \ > break; \ > case 2: \ > __UNSAFE_GET_USER_INC(to, from, u_or_s##16, label); \ > break; \ > case 4: \ > __UNSAFE_GET_USER_INC(to, from, u_or_s##32, label); \ > break; \ > default: \ > dbg_sec_uaccess("%d: bad unsafe_get_user() size %u\n", \ > __LINE__, size); \ > return -EFAULT; \ > } \ > }) > > #define UNSAFE_GET_USER_UNSIGNED_INC(to, from, size, label) \ > __UNSAFE_GET_USER_INC(to, from, size, label, u) > > #define UNSAFE_GET_USER_SIGNED_INC(to, from, size, label) \ > __UNSAFE_GET_USER_INC(to, from, size, label, s) > > #define UNSAFE_GET_USER_INC(to, from, size, label) \ > _Generic(to, \ > u8: UNSAFE_GET_USER_UNSIGNED_INC(to, from, size, label), \ > u16: UNSAFE_GET_USER_UNSIGNED_INC(to, from, size, label), \ > u32: UNSAFE_GET_USER_UNSIGNED_INC(to, from, size, label), \ > s8: UNSAFE_GET_USER_SIGNED_INC(to, from, size, label), \ > s16: UNSAFE_GET_USER_SIGNED_INC(to, from, size, label), \ > s32: UNSAFE_GET_USER_SIGNED_INC(to, from, size, label)) Regards, Jens
diff --git a/include/linux/sframe.h b/include/linux/sframe.h index ff4b9d1dbd00..2e70085a1e89 100644 --- a/include/linux/sframe.h +++ b/include/linux/sframe.h @@ -3,11 +3,14 @@ #define _LINUX_SFRAME_H #include <linux/mm_types.h> +#include <linux/srcu.h> #include <linux/unwind_user_types.h> #ifdef CONFIG_HAVE_UNWIND_USER_SFRAME struct sframe_section { + struct rcu_head rcu; + unsigned long sframe_start; unsigned long sframe_end; unsigned long text_start; @@ -28,6 +31,7 @@ extern void sframe_free_mm(struct mm_struct *mm); extern int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end, unsigned long text_start, unsigned long text_end); extern int sframe_remove_section(unsigned long sframe_addr); +extern int sframe_find(unsigned long ip, struct unwind_user_frame *frame); static inline bool current_has_sframe(void) { @@ -42,6 +46,7 @@ static inline bool current_has_sframe(void) static inline void sframe_free_mm(struct mm_struct *mm) {} static inline int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end, unsigned long text_start, unsigned long text_end) { return -ENOSYS; } static inline int sframe_remove_section(unsigned long sframe_addr) { return -ENOSYS; } +static inline int sframe_find(unsigned long ip, struct unwind_user_frame *frame) { return -ENOSYS; } static inline bool current_has_sframe(void) { return false; } #endif /* CONFIG_HAVE_UNWIND_USER_SFRAME */ diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c index fa7d87ffd00a..1a35615a361e 100644 --- a/kernel/unwind/sframe.c +++ b/kernel/unwind/sframe.c @@ -15,9 +15,287 @@ #include <linux/unwind_user_types.h> #include "sframe.h" +#include "sframe_debug.h" -#define dbg(fmt, ...) \ - pr_debug("%s (%d): " fmt, current->comm, current->pid, ##__VA_ARGS__) +struct sframe_fre { + unsigned int size; + s32 ip_off; + s32 cfa_off; + s32 ra_off; + s32 fp_off; + u8 info; +}; + +DEFINE_STATIC_SRCU(sframe_srcu); + +static __always_inline unsigned char fre_type_to_size(unsigned char fre_type) +{ + if (fre_type > 2) + return 0; + return 1 << fre_type; +} + +static __always_inline unsigned char offset_size_enum_to_size(unsigned char off_size) +{ + if (off_size > 2) + return 0; + return 1 << off_size; +} + +static __always_inline int __read_fde(struct sframe_section *sec, + unsigned int fde_num, + struct sframe_fde *fde) +{ + unsigned long fde_addr, ip; + + fde_addr = sec->fdes_start + (fde_num * sizeof(struct sframe_fde)); + unsafe_copy_from_user(fde, (void __user *)fde_addr, + sizeof(struct sframe_fde), Efault); + + ip = sec->sframe_start + fde->start_addr; + if (ip < sec->text_start || ip > sec->text_end) + return -EINVAL; + + return 0; + +Efault: + return -EFAULT; +} + +static __always_inline int __find_fde(struct sframe_section *sec, + unsigned long ip, + struct sframe_fde *fde) +{ + s32 ip_off, func_off_low = S32_MIN, func_off_high = S32_MAX; + struct sframe_fde __user *first, *low, *high, *found = NULL; + int ret; + + ip_off = ip - sec->sframe_start; + + first = (void __user *)sec->fdes_start; + low = first; + high = first + sec->num_fdes - 1; + + while (low <= high) { + struct sframe_fde __user *mid; + s32 func_off; + + mid = low + ((high - low) / 2); + + unsafe_get_user(func_off, (s32 __user *)mid, Efault); + + if (ip_off >= func_off) { + if (func_off < func_off_low) + return -EFAULT; + + func_off_low = func_off; + + found = mid; + low = mid + 1; + } else { + if (func_off > func_off_high) + return -EFAULT; + + func_off_high = func_off; + + high = mid - 1; + } + } + + if (!found) + return -EINVAL; + + ret = __read_fde(sec, found - first, fde); + if (ret) + return ret; + + /* make sure it's not in a gap */ + if (ip_off < fde->start_addr || ip_off >= fde->start_addr + fde->func_size) + return -EINVAL; + + return 0; + +Efault: + return -EFAULT; +} + +#define __UNSAFE_GET_USER_INC(to, from, type, label) \ +({ \ + type __to; \ + unsafe_get_user(__to, (type __user *)from, label); \ + from += sizeof(__to); \ + to = (typeof(to))__to; \ +}) + +#define UNSAFE_GET_USER_INC(to, from, size, label) \ +({ \ + switch (size) { \ + case 1: \ + __UNSAFE_GET_USER_INC(to, from, u8, label); \ + break; \ + case 2: \ + __UNSAFE_GET_USER_INC(to, from, u16, label); \ + break; \ + case 4: \ + __UNSAFE_GET_USER_INC(to, from, u32, label); \ + break; \ + default: \ + return -EFAULT; \ + } \ +}) + +static __always_inline int __read_fre(struct sframe_section *sec, + struct sframe_fde *fde, + unsigned long fre_addr, + struct sframe_fre *fre) +{ + unsigned char fde_type = SFRAME_FUNC_FDE_TYPE(fde->info); + unsigned char fre_type = SFRAME_FUNC_FRE_TYPE(fde->info); + unsigned char offset_count, offset_size; + s32 ip_off, cfa_off, ra_off, fp_off; + unsigned long cur = fre_addr; + unsigned char addr_size; + u8 info; + + addr_size = fre_type_to_size(fre_type); + if (!addr_size) + return -EFAULT; + + if (fre_addr + addr_size + 1 > sec->fres_end) + return -EFAULT; + + UNSAFE_GET_USER_INC(ip_off, cur, addr_size, Efault); + if (fde_type == SFRAME_FDE_TYPE_PCINC && ip_off > fde->func_size) + return -EFAULT; + + UNSAFE_GET_USER_INC(info, cur, 1, Efault); + offset_count = SFRAME_FRE_OFFSET_COUNT(info); + offset_size = offset_size_enum_to_size(SFRAME_FRE_OFFSET_SIZE(info)); + if (!offset_count || !offset_size) + return -EFAULT; + + if (cur + (offset_count * offset_size) > sec->fres_end) + return -EFAULT; + + fre->size = addr_size + 1 + (offset_count * offset_size); + + UNSAFE_GET_USER_INC(cfa_off, cur, offset_size, Efault); + offset_count--; + + ra_off = sec->ra_off; + if (!ra_off) { + if (!offset_count--) + return -EFAULT; + + UNSAFE_GET_USER_INC(ra_off, cur, offset_size, Efault); + } + + fp_off = sec->fp_off; + if (!fp_off && offset_count) { + offset_count--; + UNSAFE_GET_USER_INC(fp_off, cur, offset_size, Efault); + } + + if (offset_count) + return -EFAULT; + + fre->ip_off = ip_off; + fre->cfa_off = cfa_off; + fre->ra_off = ra_off; + fre->fp_off = fp_off; + fre->info = info; + + return 0; + +Efault: + return -EFAULT; +} + +static __always_inline int __find_fre(struct sframe_section *sec, + struct sframe_fde *fde, unsigned long ip, + struct unwind_user_frame *frame) +{ + unsigned char fde_type = SFRAME_FUNC_FDE_TYPE(fde->info); + struct sframe_fre *fre, *prev_fre = NULL; + struct sframe_fre fres[2]; + unsigned long fre_addr; + bool which = false; + unsigned int i; + s32 ip_off; + + ip_off = (s32)(ip - sec->sframe_start) - fde->start_addr; + + if (fde_type == SFRAME_FDE_TYPE_PCMASK) + ip_off %= fde->rep_size; + + fre_addr = sec->fres_start + fde->fres_off; + + for (i = 0; i < fde->fres_num; i++) { + int ret; + + /* + * Alternate between the two fre_addr[] entries for 'fre' and + * 'prev_fre'. + */ + fre = which ? fres : fres + 1; + which = !which; + + ret = __read_fre(sec, fde, fre_addr, fre); + if (ret) + return ret; + + fre_addr += fre->size; + + if (prev_fre && fre->ip_off <= prev_fre->ip_off) + return -EFAULT; + + if (fre->ip_off > ip_off) + break; + + prev_fre = fre; + } + + if (!prev_fre) + return -EINVAL; + fre = prev_fre; + + frame->cfa_off = fre->cfa_off; + frame->ra_off = fre->ra_off; + frame->fp_off = fre->fp_off; + frame->use_fp = SFRAME_FRE_CFA_BASE_REG_ID(fre->info) == SFRAME_BASE_REG_FP; + + return 0; +} + +int sframe_find(unsigned long ip, struct unwind_user_frame *frame) +{ + struct mm_struct *mm = current->mm; + struct sframe_section *sec; + struct sframe_fde fde; + int ret; + + if (!mm) + return -EINVAL; + + guard(srcu)(&sframe_srcu); + + sec = mtree_load(&mm->sframe_mt, ip); + if (!sec) + return -EINVAL; + + if (!user_read_access_begin((void __user *)sec->sframe_start, + sec->sframe_end - sec->sframe_start)) + return -EFAULT; + + ret = __find_fde(sec, ip, &fde); + if (ret) + goto end; + + ret = __find_fre(sec, &fde, ip, frame); +end: + user_read_access_end(); + return ret; +} static void free_section(struct sframe_section *sec) { @@ -119,8 +397,10 @@ int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end, sec->text_end = text_end; ret = sframe_read_header(sec); - if (ret) + if (ret) { + dbg_print_header(sec); goto err_free; + } ret = mtree_insert_range(sframe_mt, sec->text_start, sec->text_end, sec, GFP_KERNEL); if (ret) { @@ -136,6 +416,13 @@ int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end, return ret; } +static void sframe_free_srcu(struct rcu_head *rcu) +{ + struct sframe_section *sec = container_of(rcu, struct sframe_section, rcu); + + free_section(sec); +} + static int __sframe_remove_section(struct mm_struct *mm, struct sframe_section *sec) { @@ -144,7 +431,7 @@ static int __sframe_remove_section(struct mm_struct *mm, return -EINVAL; } - free_section(sec); + call_srcu(&sframe_srcu, &sec->rcu, sframe_free_srcu); return 0; } diff --git a/kernel/unwind/sframe_debug.h b/kernel/unwind/sframe_debug.h new file mode 100644 index 000000000000..055c8c8fae24 --- /dev/null +++ b/kernel/unwind/sframe_debug.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _SFRAME_DEBUG_H +#define _SFRAME_DEBUG_H + +#include <linux/sframe.h> +#include "sframe.h" + +#ifdef CONFIG_DYNAMIC_DEBUG + +#define dbg(fmt, ...) \ + pr_debug("%s (%d): " fmt, current->comm, current->pid, ##__VA_ARGS__) + +static __always_inline void dbg_print_header(struct sframe_section *sec) +{ + unsigned long fdes_end; + + fdes_end = sec->fdes_start + (sec->num_fdes * sizeof(struct sframe_fde)); + + dbg("SEC: sframe:0x%lx-0x%lx text:0x%lx-0x%lx " + "fdes:0x%lx-0x%lx fres:0x%lx-0x%lx " + "ra_off:%d fp_off:%d\n", + sec->sframe_start, sec->sframe_end, sec->text_start, sec->text_end, + sec->fdes_start, fdes_end, sec->fres_start, sec->fres_end, + sec->ra_off, sec->fp_off); +} + +#else /* !CONFIG_DYNAMIC_DEBUG */ + +#define dbg(args...) no_printk(args) + +static inline void dbg_print_header(struct sframe_section *sec) {} + +#endif /* !CONFIG_DYNAMIC_DEBUG */ + +#endif /* _SFRAME_DEBUG_H */
In preparation for using sframe to unwind user space stacks, add an sframe_find() interface for finding the sframe information associated with a given text address. For performance, use user_read_access_begin() and the corresponding unsafe_*() accessors. Note that use of pr_debug() in uaccess-enabled regions would break noinstr validation, so there aren't any debug messages yet. That will be added in a subsequent commit. Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org> --- include/linux/sframe.h | 5 + kernel/unwind/sframe.c | 295 ++++++++++++++++++++++++++++++++++- kernel/unwind/sframe_debug.h | 35 +++++ 3 files changed, 331 insertions(+), 4 deletions(-) create mode 100644 kernel/unwind/sframe_debug.h