Message ID | 20240801-arm64-gcs-v10-20-699e2bd2190b@kernel.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | arm64/gcs: Provide support for GCS in userspace | expand |
On Thu, Aug 01, 2024 at 01:06:47PM +0100, Mark Brown wrote: > diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c > index 5f00cb0da9c3..d6d3a96cf2e4 100644 > --- a/arch/arm64/kernel/process.c > +++ b/arch/arm64/kernel/process.c > @@ -285,9 +285,32 @@ static void flush_gcs(void) > write_sysreg_s(0, SYS_GCSPR_EL0); > } > > +static int copy_thread_gcs(struct task_struct *p, > + const struct kernel_clone_args *args) > +{ > + unsigned long gcs; > + > + gcs = gcs_alloc_thread_stack(p, args); > + if (IS_ERR_VALUE(gcs)) > + return PTR_ERR((void *)gcs); Is 0 an ok value here? I can see further down that gcs_alloc_thread_stack() may return 0. > + > + p->thread.gcs_el0_mode = current->thread.gcs_el0_mode; > + p->thread.gcs_el0_locked = current->thread.gcs_el0_locked; > + > + /* Ensure the current state of the GCS is seen by CoW */ > + gcsb_dsync(); I don't get this barrier. What does it have to do with CoW, which memory effects is it trying to order? > diff --git a/arch/arm64/mm/gcs.c b/arch/arm64/mm/gcs.c > index b0a67efc522b..b71f6b408513 100644 > --- a/arch/arm64/mm/gcs.c > +++ b/arch/arm64/mm/gcs.c > @@ -8,6 +8,138 @@ > #include <asm/cpufeature.h> > #include <asm/page.h> > > +static unsigned long alloc_gcs(unsigned long addr, unsigned long size) > +{ > + int flags = MAP_ANONYMOUS | MAP_PRIVATE; > + struct mm_struct *mm = current->mm; > + unsigned long mapped_addr, unused; > + > + if (addr) > + flags |= MAP_FIXED_NOREPLACE; > + > + mmap_write_lock(mm); > + mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags, > + VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL); > + mmap_write_unlock(mm); > + > + return mapped_addr; > +} > + > +static unsigned long gcs_size(unsigned long size) > +{ > + if (size) > + return PAGE_ALIGN(size); > + > + /* Allocate RLIMIT_STACK/2 with limits of PAGE_SIZE..2G */ > + size = PAGE_ALIGN(min_t(unsigned long long, > + rlimit(RLIMIT_STACK) / 2, SZ_2G)); > + return max(PAGE_SIZE, size); > +} So we still have RLIMIT_STACK/2. 
I thought we got rid of that and just went with RLIMIT_STACK (or I misremember). > + > +static bool gcs_consume_token(struct mm_struct *mm, unsigned long user_addr) > +{ > + u64 expected = GCS_CAP(user_addr); > + u64 val; > + int ret; > + > + /* This should really be an atomic cmpxchg. It is not. */ > + ret = access_remote_vm(mm, user_addr, &val, sizeof(val), > + FOLL_FORCE); > + if (ret != sizeof(val)) > + return false; > + > + if (val != expected) > + return false; > + > + val = 0; > + ret = access_remote_vm(mm, user_addr, &val, sizeof(val), > + FOLL_FORCE | FOLL_WRITE); > + if (ret != sizeof(val)) > + return false; > + > + return true; > +} As per the clone3() thread, I think we should try to use get_user_page_vma_remote() and do a cmpxchg() directly. How does the user write the initial token? Do we need any barriers before/after consuming the token?
On Mon, Aug 19, 2024 at 01:04:18PM +0100, Catalin Marinas wrote: > On Thu, Aug 01, 2024 at 01:06:47PM +0100, Mark Brown wrote: > > +static int copy_thread_gcs(struct task_struct *p, > > + const struct kernel_clone_args *args) > > +{ > > + unsigned long gcs; > > + > > + gcs = gcs_alloc_thread_stack(p, args); > > + if (IS_ERR_VALUE(gcs)) > > + return PTR_ERR((void *)gcs); > Is 0 an ok value here? I can see further down that > gcs_alloc_thread_stack() may return 0. Yes, it's fine for a thread not to have a GCS. > > + p->thread.gcs_el0_mode = current->thread.gcs_el0_mode; > > + p->thread.gcs_el0_locked = current->thread.gcs_el0_locked; > > + /* Ensure the current state of the GCS is seen by CoW */ > > + gcsb_dsync(); > I don't get this barrier. What does it have to do with CoW, which memory > effects is it trying to order? Yeah, I can't remember what that's supposed to be protecting. > > + /* Allocate RLIMIT_STACK/2 with limits of PAGE_SIZE..2G */ > > + size = PAGE_ALIGN(min_t(unsigned long long, > > + rlimit(RLIMIT_STACK) / 2, SZ_2G)); > > + return max(PAGE_SIZE, size); > > +} > So we still have RLIMIT_STACK/2. I thought we got rid of that and just > went with RLIMIT_STACK (or I misremember). I honestly can't remember either way, it's quite possible it's changed multiple times. I don't have super strong feelings on the particular value here. > > +static bool gcs_consume_token(struct mm_struct *mm, unsigned long user_addr) > > +{ > As per the clone3() thread, I think we should try to use > get_user_page_vma_remote() and do a cmpxchg() directly. I've left this as is for now, mainly because it keeps the code in line with x86 and I can't directly test the x86 code. IIRC we can't just do a standard userspace cmpxchg since that will access as though we were at EL0 but EL0 doesn't have standard write permission for the page. > How does the user write the initial token? Do we need any barriers > before/after consuming the token? 
The token is created by map_shadow_stack() or as part of a GCS pivot. A sync beforehand is probably safer; with the current code we'll have one when we switch to the task.
On Mon, Aug 19, 2024 at 04:57:08PM +0100, Mark Brown wrote: > On Mon, Aug 19, 2024 at 01:04:18PM +0100, Catalin Marinas wrote: > > On Thu, Aug 01, 2024 at 01:06:47PM +0100, Mark Brown wrote: > > > +static int copy_thread_gcs(struct task_struct *p, > > > + const struct kernel_clone_args *args) > > > +{ > > > + unsigned long gcs; > > > + > > > + gcs = gcs_alloc_thread_stack(p, args); > > > + if (IS_ERR_VALUE(gcs)) > > > + return PTR_ERR((void *)gcs); > > > Is 0 an ok value here? I can see further down that > > gcs_alloc_thread_stack() may return 0. > > Yes, it's fine for a thread not to have a GCS. OK, so we only get a 0 here if the gcs_{base,size} has not been initialised. Looks fine. > > > + p->thread.gcs_el0_mode = current->thread.gcs_el0_mode; > > > + p->thread.gcs_el0_locked = current->thread.gcs_el0_locked; > > > > + /* Ensure the current state of the GCS is seen by CoW */ > > > + gcsb_dsync(); > > > I don't get this barrier. What does it have to do with CoW, which memory > > effects is it trying to order? > > Yeah, I can't remember what that's supposed to be protecting. The GCS memory writes in the parent must indeed be visible in the child that could start on a different CPU. So, in principle, we need some form of ordering similar to the context switch. However, in the case of a classic fork(), the child won't be started until the PTEs have been made read-only and a TLBI issued. This would ensure the completion of any GCS memory accesses in the parent (at least that's my reading of the Arm ARM). If we have normal thread creation without CoW, is the parent writing anything to the stack that the new thread needs to observe? The map_shadow_stack() call will cause a GCSSTTR and this wouldn't be ordered with subsequent memory writes. But we already have a GCSB DSYNC in map_shadow_stack() after put_user_gcs(). My conclusion is that we don't need this barrier. 
> > > + /* Allocate RLIMIT_STACK/2 with limits of PAGE_SIZE..2G */ > > > + size = PAGE_ALIGN(min_t(unsigned long long, > > > + rlimit(RLIMIT_STACK) / 2, SZ_2G)); > > > + return max(PAGE_SIZE, size); > > > +} > > > So we still have RLIMIT_STACK/2. I thought we got rid of that and just > > went with RLIMIT_STACK (or I misremember). > > I honestly can't remember either way, it's quite possible it's changed > multiple times. I don't have super strong feelings on the particular > value here. The half size looks a lot more arbitrary to me than picking the same size as the stack. So I'd go with RLIMIT_STACK. > > > +static bool gcs_consume_token(struct mm_struct *mm, unsigned long user_addr) > > > +{ > > > As per the clone3() thread, I think we should try to use > > get_user_page_vma_remote() and do a cmpxchg() directly. > > I've left this as is for now, mainly because it keeps the code in line > with x86 and I can't directly test the x86 code. I thought for the clone3() x86 code we'll need the remote vma, so we have to use the get_user_page_vma_remote() API anyway. > IIRC we can't just do > a standard userspace cmpxchg since that will access as though we were at > EL0 but EL0 doesn't have standard write permission for the page. Correct but GUP goes through the kernel mapping, not the user one. So get_user_page_vma_remote() returns a page and you just do a classic cmpxchg() at page_address() (plus some offset).
diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h index 04594ef59dad..c1f274fdb9c0 100644 --- a/arch/arm64/include/asm/gcs.h +++ b/arch/arm64/include/asm/gcs.h @@ -8,6 +8,8 @@ #include <asm/types.h> #include <asm/uaccess.h> +struct kernel_clone_args; + static inline void gcsb_dsync(void) { asm volatile(".inst 0xd503227f" : : : "memory"); @@ -58,6 +60,8 @@ static inline bool task_gcs_el0_enabled(struct task_struct *task) void gcs_set_el0_mode(struct task_struct *task); void gcs_free(struct task_struct *task); void gcs_preserve_current_state(void); +unsigned long gcs_alloc_thread_stack(struct task_struct *tsk, + const struct kernel_clone_args *args); #else @@ -69,6 +73,11 @@ static inline bool task_gcs_el0_enabled(struct task_struct *task) static inline void gcs_set_el0_mode(struct task_struct *task) { } static inline void gcs_free(struct task_struct *task) { } static inline void gcs_preserve_current_state(void) { } +static inline unsigned long gcs_alloc_thread_stack(struct task_struct *tsk, + const struct kernel_clone_args *args) +{ + return -ENOTSUPP; +} #endif diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 5f00cb0da9c3..d6d3a96cf2e4 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -285,9 +285,32 @@ static void flush_gcs(void) write_sysreg_s(0, SYS_GCSPR_EL0); } +static int copy_thread_gcs(struct task_struct *p, + const struct kernel_clone_args *args) +{ + unsigned long gcs; + + gcs = gcs_alloc_thread_stack(p, args); + if (IS_ERR_VALUE(gcs)) + return PTR_ERR((void *)gcs); + + p->thread.gcs_el0_mode = current->thread.gcs_el0_mode; + p->thread.gcs_el0_locked = current->thread.gcs_el0_locked; + + /* Ensure the current state of the GCS is seen by CoW */ + gcsb_dsync(); + + return 0; +} + #else static void flush_gcs(void) { } +static int copy_thread_gcs(struct task_struct *p, + const struct kernel_clone_args *args) +{ + return 0; +} #endif @@ -303,6 +326,7 @@ void flush_thread(void) void 
arch_release_task_struct(struct task_struct *tsk) { fpsimd_release_task(tsk); + gcs_free(tsk); } int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) @@ -366,6 +390,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) unsigned long stack_start = args->stack; unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); + int ret; memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context)); @@ -407,6 +432,10 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) p->thread.uw.tp_value = tls; p->thread.tpidr2_el0 = 0; } + + ret = copy_thread_gcs(p, args); + if (ret != 0) + return ret; } else { /* * A kthread has no context to ERET to, so ensure any buggy diff --git a/arch/arm64/mm/gcs.c b/arch/arm64/mm/gcs.c index b0a67efc522b..b71f6b408513 100644 --- a/arch/arm64/mm/gcs.c +++ b/arch/arm64/mm/gcs.c @@ -8,6 +8,138 @@ #include <asm/cpufeature.h> #include <asm/page.h> +static unsigned long alloc_gcs(unsigned long addr, unsigned long size) +{ + int flags = MAP_ANONYMOUS | MAP_PRIVATE; + struct mm_struct *mm = current->mm; + unsigned long mapped_addr, unused; + + if (addr) + flags |= MAP_FIXED_NOREPLACE; + + mmap_write_lock(mm); + mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags, + VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL); + mmap_write_unlock(mm); + + return mapped_addr; +} + +static unsigned long gcs_size(unsigned long size) +{ + if (size) + return PAGE_ALIGN(size); + + /* Allocate RLIMIT_STACK/2 with limits of PAGE_SIZE..2G */ + size = PAGE_ALIGN(min_t(unsigned long long, + rlimit(RLIMIT_STACK) / 2, SZ_2G)); + return max(PAGE_SIZE, size); +} + +static bool gcs_consume_token(struct mm_struct *mm, unsigned long user_addr) +{ + u64 expected = GCS_CAP(user_addr); + u64 val; + int ret; + + /* This should really be an atomic cmpxchg. It is not. 
*/ + ret = access_remote_vm(mm, user_addr, &val, sizeof(val), + FOLL_FORCE); + if (ret != sizeof(val)) + return false; + + if (val != expected) + return false; + + val = 0; + ret = access_remote_vm(mm, user_addr, &val, sizeof(val), + FOLL_FORCE | FOLL_WRITE); + if (ret != sizeof(val)) + return false; + + return true; +} + +int arch_shstk_post_fork(struct task_struct *tsk, + struct kernel_clone_args *args) +{ + struct mm_struct *mm; + unsigned long addr, size, gcspr_el0; + int ret = 0; + + mm = get_task_mm(tsk); + if (!mm) + return -EFAULT; + + addr = args->shadow_stack; + size = args->shadow_stack_size; + + /* + * There should be a token, and there is likely to be an optional + * end of stack marker above it. + */ + gcspr_el0 = addr + size - (2 * sizeof(u64)); + if (!gcs_consume_token(mm, gcspr_el0)) { + gcspr_el0 += sizeof(u64); + if (!gcs_consume_token(mm, gcspr_el0)) { + ret = -EINVAL; + goto out; + } + } + + tsk->thread.gcspr_el0 = gcspr_el0 + sizeof(u64); + +out: + mmput(mm); + + return ret; +} + +unsigned long gcs_alloc_thread_stack(struct task_struct *tsk, + const struct kernel_clone_args *args) +{ + unsigned long addr, size; + + /* If the user specified a GCS use it. */ + if (args->shadow_stack_size) { + if (!system_supports_gcs()) + return (unsigned long)ERR_PTR(-EINVAL); + + /* GCSPR_EL0 will be set up when verifying token post fork */ + addr = args->shadow_stack; + } else { + + /* + * Otherwise fall back to legacy clone() support and + * implicitly allocate a GCS if we need a new one. 
+ */ + + if (!system_supports_gcs()) + return 0; + + if (!task_gcs_el0_enabled(tsk)) + return 0; + + if ((args->flags & (CLONE_VFORK | CLONE_VM)) != CLONE_VM) { + tsk->thread.gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); + return 0; + } + + size = args->stack_size; + + size = gcs_size(size); + addr = alloc_gcs(0, size); + if (IS_ERR_VALUE(addr)) + return addr; + + tsk->thread.gcs_base = addr; + tsk->thread.gcs_size = size; + tsk->thread.gcspr_el0 = addr + size - sizeof(u64); + } + + return addr; +} + /* * Apply the GCS mode configured for the specified task to the * hardware. @@ -30,6 +162,16 @@ void gcs_set_el0_mode(struct task_struct *task) void gcs_free(struct task_struct *task) { + + /* + * When fork() with CLONE_VM fails, the child (tsk) already + * has a GCS allocated, and exit_thread() calls this function + * to free it. In this case the parent (current) and the + * child share the same mm struct. + */ + if (!task->mm || task->mm != current->mm) + return; + if (task->thread.gcs_base) vm_munmap(task->thread.gcs_base, task->thread.gcs_size);