Message ID | 20210511144252.3779113-2-tabba@google.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Tidy up cache.S | expand |
Hi Fuad, On Tue, May 11, 2021 at 03:42:40PM +0100, Fuad Tabba wrote: > __flush_icache_range works on the kernel linear map, and doesn't > need uaccess. The existing code is a side-effect of its current > implementation with __flush_cache_user_range fallthrough. > > Instead of fallthrough to share the code, use a common macro for > the two where the caller can specify whether user-space access is > needed. FWIW, I agree that we should fix __flush_icache_range to not fiddle with uaccess, and that we should split these. > No functional change intended. There is a performance change here, since the existing `__flush_cache_user_range` takes IDC and DIC into account, whereas `invalidate_icache_by_line` does not. There's also an existing oversight where `__flush_cache_user_range` takes ARM64_WORKAROUND_CLEAN_CACHE into account, but `invalidate_icache_by_line` does not. I think that's a bug that we should fix first, so that we can backport something to stable. Arguably similar is true in `swsusp_arch_suspend_exit`, but for that we could add a comment and always use `DC CIVAC`. Thanks, Mark. > Reported-by: Catalin Marinas <catalin.marinas@arm.com> > Reported-by: Will Deacon <will@kernel.org> > Link: https://lore.kernel.org/linux-arch/20200511110014.lb9PEahJ4hVOYrbwIb_qUHXyNy9KQzNFdb_I3YlzY6A@z/ > Signed-off-by: Fuad Tabba <tabba@google.com> > --- > arch/arm64/include/asm/assembler.h | 13 ++++-- > arch/arm64/mm/cache.S | 64 +++++++++++++++++++++--------- > 2 files changed, 54 insertions(+), 23 deletions(-) > > diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h > index 8418c1bd8f04..6ff7a3a3b238 100644 > --- a/arch/arm64/include/asm/assembler.h > +++ b/arch/arm64/include/asm/assembler.h > @@ -426,16 +426,21 @@ alternative_endif > * Macro to perform an instruction cache maintenance for the interval > * [start, end) > * > - * start, end: virtual addresses describing the region > - * label: A label to branch to on user fault. 
> - * Corrupts: tmp1, tmp2 > + * start, end: virtual addresses describing the region > + * needs_uaccess: might access user space memory > + * label: label to branch to on user fault (if needs_uaccess) > + * Corrupts: tmp1, tmp2 > */ > - .macro invalidate_icache_by_line start, end, tmp1, tmp2, label > + .macro invalidate_icache_by_line start, end, tmp1, tmp2, needs_uaccess, label > icache_line_size \tmp1, \tmp2 > sub \tmp2, \tmp1, #1 > bic \tmp2, \start, \tmp2 > 9997: > + .if \needs_uaccess > USER(\label, ic ivau, \tmp2) // invalidate I line PoU > + .else > + ic ivau, \tmp2 > + .endif > add \tmp2, \tmp2, \tmp1 > cmp \tmp2, \end > b.lo 9997b > diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S > index 2d881f34dd9d..092f73acdf9a 100644 > --- a/arch/arm64/mm/cache.S > +++ b/arch/arm64/mm/cache.S > @@ -15,30 +15,20 @@ > #include <asm/asm-uaccess.h> > > /* > - * flush_icache_range(start,end) > + * __flush_cache_range(start,end) [needs_uaccess] > * > * Ensure that the I and D caches are coherent within specified region. > * This is typically used when code has been written to a memory region, > * and will be executed. > * > - * - start - virtual start address of region > - * - end - virtual end address of region > + * - start - virtual start address of region > + * - end - virtual end address of region > + * - needs_uaccess - (macro parameter) might access user space memory > */ > -SYM_FUNC_START(__flush_icache_range) > - /* FALLTHROUGH */ > - > -/* > - * __flush_cache_user_range(start,end) > - * > - * Ensure that the I and D caches are coherent within specified region. > - * This is typically used when code has been written to a memory region, > - * and will be executed. 
> - * > - * - start - virtual start address of region > - * - end - virtual end address of region > - */ > -SYM_FUNC_START(__flush_cache_user_range) > +.macro __flush_cache_range, needs_uaccess > + .if \needs_uaccess > uaccess_ttbr0_enable x2, x3, x4 > + .endif > alternative_if ARM64_HAS_CACHE_IDC > dsb ishst > b 7f > @@ -47,7 +37,11 @@ alternative_else_nop_endif > sub x3, x2, #1 > bic x4, x0, x3 > 1: > + .if \needs_uaccess > user_alt 9f, "dc cvau, x4", "dc civac, x4", ARM64_WORKAROUND_CLEAN_CACHE > + .else > +alternative_insn "dc cvau, x4", "dc civac, x4", ARM64_WORKAROUND_CLEAN_CACHE > + .endif > add x4, x4, x2 > cmp x4, x1 > b.lo 1b > @@ -58,15 +52,47 @@ alternative_if ARM64_HAS_CACHE_DIC > isb > b 8f > alternative_else_nop_endif > - invalidate_icache_by_line x0, x1, x2, x3, 9f > + invalidate_icache_by_line x0, x1, x2, x3, \needs_uaccess, 9f > 8: mov x0, #0 > 1: > + .if \needs_uaccess > uaccess_ttbr0_disable x1, x2 > + .endif > ret > + > + .if \needs_uaccess > 9: > mov x0, #-EFAULT > b 1b > + .endif > +.endm > + > +/* > + * flush_icache_range(start,end) > + * > + * Ensure that the I and D caches are coherent within specified region. > + * This is typically used when code has been written to a memory region, > + * and will be executed. > + * > + * - start - virtual start address of region > + * - end - virtual end address of region > + */ > +SYM_FUNC_START(__flush_icache_range) > + __flush_cache_range needs_uaccess=0 > SYM_FUNC_END(__flush_icache_range) > + > +/* > + * __flush_cache_user_range(start,end) > + * > + * Ensure that the I and D caches are coherent within specified region. > + * This is typically used when code has been written to a memory region, > + * and will be executed. 
> + * > + * - start - virtual start address of region > + * - end - virtual end address of region > + */ > +SYM_FUNC_START(__flush_cache_user_range) > + __flush_cache_range needs_uaccess=1 > SYM_FUNC_END(__flush_cache_user_range) > > /* > @@ -86,7 +112,7 @@ alternative_else_nop_endif > > uaccess_ttbr0_enable x2, x3, x4 > > - invalidate_icache_by_line x0, x1, x2, x3, 2f > + invalidate_icache_by_line x0, x1, x2, x3, 1, 2f > mov x0, xzr > 1: > uaccess_ttbr0_disable x1, x2 > -- > 2.31.1.607.g51e8a6a459-goog >
On 2021-05-11 15:42, Fuad Tabba wrote: > __flush_icache_range works on the kernel linear map, and doesn't > need uaccess. The existing code is a side-effect of its current > implementation with __flush_cache_user_range fallthrough. > > Instead of fallthrough to share the code, use a common macro for > the two where the caller can specify whether user-space access is > needed. > > No functional change intended. > > Reported-by: Catalin Marinas <catalin.marinas@arm.com> > Reported-by: Will Deacon <will@kernel.org> > Link: https://lore.kernel.org/linux-arch/20200511110014.lb9PEahJ4hVOYrbwIb_qUHXyNy9KQzNFdb_I3YlzY6A@z/ > Signed-off-by: Fuad Tabba <tabba@google.com> > --- > arch/arm64/include/asm/assembler.h | 13 ++++-- > arch/arm64/mm/cache.S | 64 +++++++++++++++++++++--------- > 2 files changed, 54 insertions(+), 23 deletions(-) > > diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h > index 8418c1bd8f04..6ff7a3a3b238 100644 > --- a/arch/arm64/include/asm/assembler.h > +++ b/arch/arm64/include/asm/assembler.h > @@ -426,16 +426,21 @@ alternative_endif > * Macro to perform an instruction cache maintenance for the interval > * [start, end) > * > - * start, end: virtual addresses describing the region > - * label: A label to branch to on user fault. 
> - * Corrupts: tmp1, tmp2 > + * start, end: virtual addresses describing the region > + * needs_uaccess: might access user space memory > + * label: label to branch to on user fault (if needs_uaccess) > + * Corrupts: tmp1, tmp2 > */ > - .macro invalidate_icache_by_line start, end, tmp1, tmp2, label > + .macro invalidate_icache_by_line start, end, tmp1, tmp2, needs_uaccess, label > icache_line_size \tmp1, \tmp2 > sub \tmp2, \tmp1, #1 > bic \tmp2, \start, \tmp2 > 9997: > + .if \needs_uaccess > USER(\label, ic ivau, \tmp2) // invalidate I line PoU > + .else > + ic ivau, \tmp2 > + .endif > add \tmp2, \tmp2, \tmp1 > cmp \tmp2, \end > b.lo 9997b > diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S > index 2d881f34dd9d..092f73acdf9a 100644 > --- a/arch/arm64/mm/cache.S > +++ b/arch/arm64/mm/cache.S > @@ -15,30 +15,20 @@ > #include <asm/asm-uaccess.h> > > /* > - * flush_icache_range(start,end) > + * __flush_cache_range(start,end) [needs_uaccess] > * > * Ensure that the I and D caches are coherent within specified region. > * This is typically used when code has been written to a memory region, > * and will be executed. > * > - * - start - virtual start address of region > - * - end - virtual end address of region > + * - start - virtual start address of region > + * - end - virtual end address of region > + * - needs_uaccess - (macro parameter) might access user space memory > */ > -SYM_FUNC_START(__flush_icache_range) > - /* FALLTHROUGH */ > - > -/* > - * __flush_cache_user_range(start,end) > - * > - * Ensure that the I and D caches are coherent within specified region. > - * This is typically used when code has been written to a memory region, > - * and will be executed. 
> - * > - * - start - virtual start address of region > - * - end - virtual end address of region > - */ > -SYM_FUNC_START(__flush_cache_user_range) > +.macro __flush_cache_range, needs_uaccess > + .if \needs_uaccess > uaccess_ttbr0_enable x2, x3, x4 > + .endif Nit: this feels like it belongs directly in __flush_cache_user_range() rather than being hidden in the macro, since it's not really an integral part of the cache maintenance operation itself. Robin. > alternative_if ARM64_HAS_CACHE_IDC > dsb ishst > b 7f > @@ -47,7 +37,11 @@ alternative_else_nop_endif > sub x3, x2, #1 > bic x4, x0, x3 > 1: > + .if \needs_uaccess > user_alt 9f, "dc cvau, x4", "dc civac, x4", ARM64_WORKAROUND_CLEAN_CACHE > + .else > +alternative_insn "dc cvau, x4", "dc civac, x4", ARM64_WORKAROUND_CLEAN_CACHE > + .endif > add x4, x4, x2 > cmp x4, x1 > b.lo 1b > @@ -58,15 +52,47 @@ alternative_if ARM64_HAS_CACHE_DIC > isb > b 8f > alternative_else_nop_endif > - invalidate_icache_by_line x0, x1, x2, x3, 9f > + invalidate_icache_by_line x0, x1, x2, x3, \needs_uaccess, 9f > 8: mov x0, #0 > 1: > + .if \needs_uaccess > uaccess_ttbr0_disable x1, x2 > + .endif > ret > + > + .if \needs_uaccess > 9: > mov x0, #-EFAULT > b 1b > + .endif > +.endm > + > +/* > + * flush_icache_range(start,end) > + * > + * Ensure that the I and D caches are coherent within specified region. > + * This is typically used when code has been written to a memory region, > + * and will be executed. > + * > + * - start - virtual start address of region > + * - end - virtual end address of region > + */ > +SYM_FUNC_START(__flush_icache_range) > + __flush_cache_range needs_uaccess=0 > SYM_FUNC_END(__flush_icache_range) > + > +/* > + * __flush_cache_user_range(start,end) > + * > + * Ensure that the I and D caches are coherent within specified region. > + * This is typically used when code has been written to a memory region, > + * and will be executed. 
> + * > + * - start - virtual start address of region > + * - end - virtual end address of region > + */ > +SYM_FUNC_START(__flush_cache_user_range) > + __flush_cache_range needs_uaccess=1 > SYM_FUNC_END(__flush_cache_user_range) > > /* > @@ -86,7 +112,7 @@ alternative_else_nop_endif > > uaccess_ttbr0_enable x2, x3, x4 > > - invalidate_icache_by_line x0, x1, x2, x3, 2f > + invalidate_icache_by_line x0, x1, x2, x3, 1, 2f > mov x0, xzr > 1: > uaccess_ttbr0_disable x1, x2 >
Hi Mark, > > No functional change intended. > > There is a performance change here, since the existing > `__flush_cache_user_range` takes IDC and DIC into account, whereas > `invalidate_icache_by_line` does not. You're right. There is a performance change in this patch and a couple of the others, which I will note in v2. However, I don't think that this patch changes the behavior when it comes to IDC and DIC, does it? > There's also an existing oversight where `__flush_cache_user_range` > takes ARM64_WORKAROUND_CLEAN_CACHE into account, but > `invalidate_icache_by_line` does not. I think that's a bug that we > should fix first, so that we can backport something to stable. I'd be happy to address that in v2, but let me make sure I understand the issue properly. Errata 819472 and friends (ARM64_WORKAROUND_CLEAN_CACHE) are related to cache maintenance operations on data caches happening concurrently with other accesses to the same address. The two places invalidate_icache_by_line is used in conjunction with data caches are __flush_icache_range and __flush_cache_user_range (which share the same code before and after my patch series). In both cases, invalidate_icache_by_line is called after the workaround is applied. The third and only other user of invalidate_icache_by_line is invalidate_icache_range, which only performs cache maintenance on the icache. The concern is that invalidate_icache_range might be performing a cache maintenance operation on an address concurrently with another processor performing a dc operation on the same address. Therefore, invalidate_icache_range should perform DC CIVAC on the line before invalidate_icache_by_line if ARM64_WORKAROUND_CLEAN_CACHE applies. Is that right? https://documentation-service.arm.com/static/5fa29fddb209f547eebd361d > Arguably > similar is true in `swsusp_arch_suspend_exit`, but for that we could add > a comment and always use `DC CIVAC`. I can do that in v2 as well. Thanks, /fuad > Thanks, > Mark. 
> > > Reported-by: Catalin Marinas <catalin.marinas@arm.com> > > Reported-by: Will Deacon <will@kernel.org> > > Link: https://lore.kernel.org/linux-arch/20200511110014.lb9PEahJ4hVOYrbwIb_qUHXyNy9KQzNFdb_I3YlzY6A@z/ > > Signed-off-by: Fuad Tabba <tabba@google.com> > > --- > > arch/arm64/include/asm/assembler.h | 13 ++++-- > > arch/arm64/mm/cache.S | 64 +++++++++++++++++++++--------- > > 2 files changed, 54 insertions(+), 23 deletions(-) > > > > diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h > > index 8418c1bd8f04..6ff7a3a3b238 100644 > > --- a/arch/arm64/include/asm/assembler.h > > +++ b/arch/arm64/include/asm/assembler.h > > @@ -426,16 +426,21 @@ alternative_endif > > * Macro to perform an instruction cache maintenance for the interval > > * [start, end) > > * > > - * start, end: virtual addresses describing the region > > - * label: A label to branch to on user fault. > > - * Corrupts: tmp1, tmp2 > > + * start, end: virtual addresses describing the region > > + * needs_uaccess: might access user space memory > > + * label: label to branch to on user fault (if needs_uaccess) > > + * Corrupts: tmp1, tmp2 > > */ > > - .macro invalidate_icache_by_line start, end, tmp1, tmp2, label > > + .macro invalidate_icache_by_line start, end, tmp1, tmp2, needs_uaccess, label > > icache_line_size \tmp1, \tmp2 > > sub \tmp2, \tmp1, #1 > > bic \tmp2, \start, \tmp2 > > 9997: > > + .if \needs_uaccess > > USER(\label, ic ivau, \tmp2) // invalidate I line PoU > > + .else > > + ic ivau, \tmp2 > > + .endif > > add \tmp2, \tmp2, \tmp1 > > cmp \tmp2, \end > > b.lo 9997b > > diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S > > index 2d881f34dd9d..092f73acdf9a 100644 > > --- a/arch/arm64/mm/cache.S > > +++ b/arch/arm64/mm/cache.S > > @@ -15,30 +15,20 @@ > > #include <asm/asm-uaccess.h> > > > > /* > > - * flush_icache_range(start,end) > > + * __flush_cache_range(start,end) [needs_uaccess] > > * > > * Ensure that the I and D caches are coherent 
within specified region. > > * This is typically used when code has been written to a memory region, > > * and will be executed. > > * > > - * - start - virtual start address of region > > - * - end - virtual end address of region > > + * - start - virtual start address of region > > + * - end - virtual end address of region > > + * - needs_uaccess - (macro parameter) might access user space memory > > */ > > -SYM_FUNC_START(__flush_icache_range) > > - /* FALLTHROUGH */ > > - > > -/* > > - * __flush_cache_user_range(start,end) > > - * > > - * Ensure that the I and D caches are coherent within specified region. > > - * This is typically used when code has been written to a memory region, > > - * and will be executed. > > - * > > - * - start - virtual start address of region > > - * - end - virtual end address of region > > - */ > > -SYM_FUNC_START(__flush_cache_user_range) > > +.macro __flush_cache_range, needs_uaccess > > + .if \needs_uaccess > > uaccess_ttbr0_enable x2, x3, x4 > > + .endif > > alternative_if ARM64_HAS_CACHE_IDC > > dsb ishst > > b 7f > > @@ -47,7 +37,11 @@ alternative_else_nop_endif > > sub x3, x2, #1 > > bic x4, x0, x3 > > 1: > > + .if \needs_uaccess > > user_alt 9f, "dc cvau, x4", "dc civac, x4", ARM64_WORKAROUND_CLEAN_CACHE > > + .else > > +alternative_insn "dc cvau, x4", "dc civac, x4", ARM64_WORKAROUND_CLEAN_CACHE > > + .endif > > add x4, x4, x2 > > cmp x4, x1 > > b.lo 1b > > @@ -58,15 +52,47 @@ alternative_if ARM64_HAS_CACHE_DIC > > isb > > b 8f > > alternative_else_nop_endif > > - invalidate_icache_by_line x0, x1, x2, x3, 9f > > + invalidate_icache_by_line x0, x1, x2, x3, \needs_uaccess, 9f > > 8: mov x0, #0 > > 1: > > + .if \needs_uaccess > > uaccess_ttbr0_disable x1, x2 > > + .endif > > ret > > + > > + .if \needs_uaccess > > 9: > > mov x0, #-EFAULT > > b 1b > > + .endif > > +.endm > > + > > +/* > > + * flush_icache_range(start,end) > > + * > > + * Ensure that the I and D caches are coherent within specified region. 
> > + * This is typically used when code has been written to a memory region, > > + * and will be executed. > > + * > > + * - start - virtual start address of region > > + * - end - virtual end address of region > > + */ > > +SYM_FUNC_START(__flush_icache_range) > > + __flush_cache_range needs_uaccess=0 > > SYM_FUNC_END(__flush_icache_range) > > + > > +/* > > + * __flush_cache_user_range(start,end) > > + * > > + * Ensure that the I and D caches are coherent within specified region. > > + * This is typically used when code has been written to a memory region, > > + * and will be executed. > > + * > > + * - start - virtual start address of region > > + * - end - virtual end address of region > > + */ > > +SYM_FUNC_START(__flush_cache_user_range) > > + __flush_cache_range needs_uaccess=1 > > SYM_FUNC_END(__flush_cache_user_range) > > > > /* > > @@ -86,7 +112,7 @@ alternative_else_nop_endif > > > > uaccess_ttbr0_enable x2, x3, x4 > > > > - invalidate_icache_by_line x0, x1, x2, x3, 2f > > + invalidate_icache_by_line x0, x1, x2, x3, 1, 2f > > mov x0, xzr > > 1: > > uaccess_ttbr0_disable x1, x2 > > -- > > 2.31.1.607.g51e8a6a459-goog > >
Hi Robin, > > -SYM_FUNC_START(__flush_cache_user_range) > > +.macro __flush_cache_range, needs_uaccess > > + .if \needs_uaccess > > uaccess_ttbr0_enable x2, x3, x4 > > + .endif > > Nit: this feels like it belongs directly in __flush_cache_user_range() > rather than being hidden in the macro, since it's not really an integral > part of the cache maintenance operation itself. I will fix this in v2. Thanks, /fuad > Robin. > > > alternative_if ARM64_HAS_CACHE_IDC > > dsb ishst > > b 7f > > @@ -47,7 +37,11 @@ alternative_else_nop_endif > > sub x3, x2, #1 > > bic x4, x0, x3 > > 1: > > + .if \needs_uaccess > > user_alt 9f, "dc cvau, x4", "dc civac, x4", ARM64_WORKAROUND_CLEAN_CACHE > > + .else > > +alternative_insn "dc cvau, x4", "dc civac, x4", ARM64_WORKAROUND_CLEAN_CACHE > > + .endif > > add x4, x4, x2 > > cmp x4, x1 > > b.lo 1b > > @@ -58,15 +52,47 @@ alternative_if ARM64_HAS_CACHE_DIC > > isb > > b 8f > > alternative_else_nop_endif > > - invalidate_icache_by_line x0, x1, x2, x3, 9f > > + invalidate_icache_by_line x0, x1, x2, x3, \needs_uaccess, 9f > > 8: mov x0, #0 > > 1: > > + .if \needs_uaccess > > uaccess_ttbr0_disable x1, x2 > > + .endif > > ret > > + > > + .if \needs_uaccess > > 9: > > mov x0, #-EFAULT > > b 1b > > + .endif > > +.endm > > + > > +/* > > + * flush_icache_range(start,end) > > + * > > + * Ensure that the I and D caches are coherent within specified region. > > + * This is typically used when code has been written to a memory region, > > + * and will be executed. > > + * > > + * - start - virtual start address of region > > + * - end - virtual end address of region > > + */ > > +SYM_FUNC_START(__flush_icache_range) > > + __flush_cache_range needs_uaccess=0 > > SYM_FUNC_END(__flush_icache_range) > > + > > +/* > > + * __flush_cache_user_range(start,end) > > + * > > + * Ensure that the I and D caches are coherent within specified region. > > + * This is typically used when code has been written to a memory region, > > + * and will be executed. 
> > + * > > + * - start - virtual start address of region > > + * - end - virtual end address of region > > + */ > > +SYM_FUNC_START(__flush_cache_user_range) > > + __flush_cache_range needs_uaccess=1 > > SYM_FUNC_END(__flush_cache_user_range) > > > > /* > > @@ -86,7 +112,7 @@ alternative_else_nop_endif > > > > uaccess_ttbr0_enable x2, x3, x4 > > > > - invalidate_icache_by_line x0, x1, x2, x3, 2f > > + invalidate_icache_by_line x0, x1, x2, x3, 1, 2f > > mov x0, xzr > > 1: > > uaccess_ttbr0_disable x1, x2 > >
On Wed, May 12, 2021 at 09:52:28AM +0100, Fuad Tabba wrote: > Hi Mark, > > > > No functional change intended. > > > > There is a performance change here, since the existing > > `__flush_cache_user_range` takes IDC and DIC into account, whereas > > `invalidate_icache_by_line` does not. > > You're right. There is a performance change in this patch and a couple > of the others, which I will note in v2. However, I don't think that > this patch changes the behavior when it comes to IDC and DIC, does it? It shouldn't be a functional problem, but it means that the new `__flush_icache_range` will always perform redundant I-cache maintenance rather than skipping this when the cpu has DIC=1. It would be nice if we could structure this to take DIC into account either in the new `__flush_icache_range`, or in the `invalidate_icache_by_line` helper. > > There's also an existing oversight where `__flush_cache_user_range` > > takes ARM64_WORKAROUND_CLEAN_CACHE into account, but > > `invalidate_icache_by_line` does not. Sorry about this. I was evidently confused, as this does not make any sense. This doesn't matter to `invalidate_icache_by_line`, and `invalidate_dcache_by_line` already does the right thing via `__dcache_op_workaround_clean_cache`. > I'd be happy to address that in v2, but let me make sure I understand > the issue properly. > > Errata 819472 and friends (ARM64_WORKAROUND_CLEAN_CACHE) are related > to cache maintenance operations on data caches happening concurrently > with other accesses to the same address. The two places > invalidate_icache_by_line is used in conjunction with data caches are > __flush_icache_range and __flush_cache_user_range (which share the > same code before and after my patch series). In both cases, > invalidate_icache_by_line is called after the workaround is applied. > The third and only other user of invalidate_icache_by_line is > invalidate_icache_range, which only performs cache maintenance on the > icache. 
> > The concern is that invalidate_icache_range might be performing a > cache maintenance operation on an address concurrently with another > processor performing a dc operation on the same address. Therefore, > invalidate_icache_range should perform DC CIVAC on the line before > invalidate_icache_by_line if ARM64_WORKAROUND_CLEAN_CACHE applies. Is > that right? > > https://documentation-service.arm.com/static/5fa29fddb209f547eebd361d Sorry, I had misread the code, and I don't think there's a bug to fix here after all. Regardless, thanks for digging into that and trying to make sense of my bogus suggestion. > > Arguably similar is true in `swsusp_arch_suspend_exit`, but for that > > we could add a comment and always use `DC CIVAC`. > > I can do that in v2 as well. A separate patch for `swsusp_arch_suspend_exit` would be great, since that is something we should backport to stable as a fix. Thanks, Mark. > > > Reported-by: Catalin Marinas <catalin.marinas@arm.com> > > > Reported-by: Will Deacon <will@kernel.org> > > > Link: https://lore.kernel.org/linux-arch/20200511110014.lb9PEahJ4hVOYrbwIb_qUHXyNy9KQzNFdb_I3YlzY6A@z/ > > > Signed-off-by: Fuad Tabba <tabba@google.com> > > > --- > > > arch/arm64/include/asm/assembler.h | 13 ++++-- > > > arch/arm64/mm/cache.S | 64 +++++++++++++++++++++--------- > > > 2 files changed, 54 insertions(+), 23 deletions(-) > > > > > > diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h > > > index 8418c1bd8f04..6ff7a3a3b238 100644 > > > --- a/arch/arm64/include/asm/assembler.h > > > +++ b/arch/arm64/include/asm/assembler.h > > > @@ -426,16 +426,21 @@ alternative_endif > > > * Macro to perform an instruction cache maintenance for the interval > > > * [start, end) > > > * > > > - * start, end: virtual addresses describing the region > > > - * label: A label to branch to on user fault. 
> > > - * Corrupts: tmp1, tmp2 > > > + * start, end: virtual addresses describing the region > > > + * needs_uaccess: might access user space memory > > > + * label: label to branch to on user fault (if needs_uaccess) > > > + * Corrupts: tmp1, tmp2 > > > */ > > > - .macro invalidate_icache_by_line start, end, tmp1, tmp2, label > > > + .macro invalidate_icache_by_line start, end, tmp1, tmp2, needs_uaccess, label > > > icache_line_size \tmp1, \tmp2 > > > sub \tmp2, \tmp1, #1 > > > bic \tmp2, \start, \tmp2 > > > 9997: > > > + .if \needs_uaccess > > > USER(\label, ic ivau, \tmp2) // invalidate I line PoU > > > + .else > > > + ic ivau, \tmp2 > > > + .endif > > > add \tmp2, \tmp2, \tmp1 > > > cmp \tmp2, \end > > > b.lo 9997b > > > diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S > > > index 2d881f34dd9d..092f73acdf9a 100644 > > > --- a/arch/arm64/mm/cache.S > > > +++ b/arch/arm64/mm/cache.S > > > @@ -15,30 +15,20 @@ > > > #include <asm/asm-uaccess.h> > > > > > > /* > > > - * flush_icache_range(start,end) > > > + * __flush_cache_range(start,end) [needs_uaccess] > > > * > > > * Ensure that the I and D caches are coherent within specified region. > > > * This is typically used when code has been written to a memory region, > > > * and will be executed. > > > * > > > - * - start - virtual start address of region > > > - * - end - virtual end address of region > > > + * - start - virtual start address of region > > > + * - end - virtual end address of region > > > + * - needs_uaccess - (macro parameter) might access user space memory > > > */ > > > -SYM_FUNC_START(__flush_icache_range) > > > - /* FALLTHROUGH */ > > > - > > > -/* > > > - * __flush_cache_user_range(start,end) > > > - * > > > - * Ensure that the I and D caches are coherent within specified region. > > > - * This is typically used when code has been written to a memory region, > > > - * and will be executed. 
> > > - * > > > - * - start - virtual start address of region > > > - * - end - virtual end address of region > > > - */ > > > -SYM_FUNC_START(__flush_cache_user_range) > > > +.macro __flush_cache_range, needs_uaccess > > > + .if \needs_uaccess > > > uaccess_ttbr0_enable x2, x3, x4 > > > + .endif > > > alternative_if ARM64_HAS_CACHE_IDC > > > dsb ishst > > > b 7f > > > @@ -47,7 +37,11 @@ alternative_else_nop_endif > > > sub x3, x2, #1 > > > bic x4, x0, x3 > > > 1: > > > + .if \needs_uaccess > > > user_alt 9f, "dc cvau, x4", "dc civac, x4", ARM64_WORKAROUND_CLEAN_CACHE > > > + .else > > > +alternative_insn "dc cvau, x4", "dc civac, x4", ARM64_WORKAROUND_CLEAN_CACHE > > > + .endif > > > add x4, x4, x2 > > > cmp x4, x1 > > > b.lo 1b > > > @@ -58,15 +52,47 @@ alternative_if ARM64_HAS_CACHE_DIC > > > isb > > > b 8f > > > alternative_else_nop_endif > > > - invalidate_icache_by_line x0, x1, x2, x3, 9f > > > + invalidate_icache_by_line x0, x1, x2, x3, \needs_uaccess, 9f > > > 8: mov x0, #0 > > > 1: > > > + .if \needs_uaccess > > > uaccess_ttbr0_disable x1, x2 > > > + .endif > > > ret > > > + > > > + .if \needs_uaccess > > > 9: > > > mov x0, #-EFAULT > > > b 1b > > > + .endif > > > +.endm > > > + > > > +/* > > > + * flush_icache_range(start,end) > > > + * > > > + * Ensure that the I and D caches are coherent within specified region. > > > + * This is typically used when code has been written to a memory region, > > > + * and will be executed. > > > + * > > > + * - start - virtual start address of region > > > + * - end - virtual end address of region > > > + */ > > > +SYM_FUNC_START(__flush_icache_range) > > > + __flush_cache_range needs_uaccess=0 > > > SYM_FUNC_END(__flush_icache_range) > > > + > > > +/* > > > + * __flush_cache_user_range(start,end) > > > + * > > > + * Ensure that the I and D caches are coherent within specified region. > > > + * This is typically used when code has been written to a memory region, > > > + * and will be executed. 
> > > + * > > > + * - start - virtual start address of region > > > + * - end - virtual end address of region > > > + */ > > > +SYM_FUNC_START(__flush_cache_user_range) > > > + __flush_cache_range needs_uaccess=1 > > > SYM_FUNC_END(__flush_cache_user_range) > > > > > > /* > > > @@ -86,7 +112,7 @@ alternative_else_nop_endif > > > > > > uaccess_ttbr0_enable x2, x3, x4 > > > > > > - invalidate_icache_by_line x0, x1, x2, x3, 2f > > > + invalidate_icache_by_line x0, x1, x2, x3, 1, 2f > > > mov x0, xzr > > > 1: > > > uaccess_ttbr0_disable x1, x2 > > > -- > > > 2.31.1.607.g51e8a6a459-goog > > >
Hi Mark, On Wed, May 12, 2021 at 10:59 AM Mark Rutland <mark.rutland@arm.com> wrote: > > On Wed, May 12, 2021 at 09:52:28AM +0100, Fuad Tabba wrote: > > Hi Mark, > > > > > > No functional change intended. > > > > > > There is a performance change here, since the existing > > > `__flush_cache_user_range` takes IDC and DIC into account, whereas > > > `invalidate_icache_by_line` does not. > > > > You're right. There is a performance change in this patch and a couple > > of the others, which I will note in v2. However, I don't think that > > this patch changes the behavior when it comes to IDC and DIC, does it? > > It shouldn't be a functional problem, but it means that the new > `__flush_icache_range` will always perform redundant I-cache maintenance > rather than skipping this when the cpu has DIC=1. Sorry, but I can't quite see how this patch is making a difference in that regard. The existing code has __flush_icache_range fallthrough to __flush_cache_user_range, where the alternative_if ARM64_HAS_CACHE_{IDC,DIC} are invoked. In this patch, __flush_icache_range and __flush_cache_user_range share the same code via the macro, where the alternative_ifs and branches over invalidate_icache_by_line are still there and behave the same: the macro jumps to 8 if ARM64_HAS_CACHE_DIC, avoiding any redundant cache maintenance. Am I missing something else? > It would be nice if we could structure this to take DIC into account > either in the new `__flush_icache_range`, or in the > `invalidate_icache_by_line` helper. > > > Arguably similar is true in `swsusp_arch_suspend_exit`, but for that > > > we could add a comment and always use `DC CIVAC`. > > > > I can do that in v2 as well. > > A separate patch for `swsusp_arch_suspend_exit` would be great, since > that is something we should backport to stable as a fix. Will do. Thanks, /fuad > Thanks, > Mark. 
> > > > > Reported-by: Catalin Marinas <catalin.marinas@arm.com> > > > > Reported-by: Will Deacon <will@kernel.org> > > > > Link: https://lore.kernel.org/linux-arch/20200511110014.lb9PEahJ4hVOYrbwIb_qUHXyNy9KQzNFdb_I3YlzY6A@z/ > > > > Signed-off-by: Fuad Tabba <tabba@google.com> > > > > --- > > > > arch/arm64/include/asm/assembler.h | 13 ++++-- > > > > arch/arm64/mm/cache.S | 64 +++++++++++++++++++++--------- > > > > 2 files changed, 54 insertions(+), 23 deletions(-) > > > > > > > > diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h > > > > index 8418c1bd8f04..6ff7a3a3b238 100644 > > > > --- a/arch/arm64/include/asm/assembler.h > > > > +++ b/arch/arm64/include/asm/assembler.h > > > > @@ -426,16 +426,21 @@ alternative_endif > > > > * Macro to perform an instruction cache maintenance for the interval > > > > * [start, end) > > > > * > > > > - * start, end: virtual addresses describing the region > > > > - * label: A label to branch to on user fault. > > > > - * Corrupts: tmp1, tmp2 > > > > + * start, end: virtual addresses describing the region > > > > + * needs_uaccess: might access user space memory > > > > + * label: label to branch to on user fault (if needs_uaccess) > > > > + * Corrupts: tmp1, tmp2 > > > > */ > > > > - .macro invalidate_icache_by_line start, end, tmp1, tmp2, label > > > > + .macro invalidate_icache_by_line start, end, tmp1, tmp2, needs_uaccess, label > > > > icache_line_size \tmp1, \tmp2 > > > > sub \tmp2, \tmp1, #1 > > > > bic \tmp2, \start, \tmp2 > > > > 9997: > > > > + .if \needs_uaccess > > > > USER(\label, ic ivau, \tmp2) // invalidate I line PoU > > > > + .else > > > > + ic ivau, \tmp2 > > > > + .endif > > > > add \tmp2, \tmp2, \tmp1 > > > > cmp \tmp2, \end > > > > b.lo 9997b > > > > diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S > > > > index 2d881f34dd9d..092f73acdf9a 100644 > > > > --- a/arch/arm64/mm/cache.S > > > > +++ b/arch/arm64/mm/cache.S > > > > @@ -15,30 +15,20 @@ > > > > #include 
<asm/asm-uaccess.h> > > > > > > > > /* > > > > - * flush_icache_range(start,end) > > > > + * __flush_cache_range(start,end) [needs_uaccess] > > > > * > > > > * Ensure that the I and D caches are coherent within specified region. > > > > * This is typically used when code has been written to a memory region, > > > > * and will be executed. > > > > * > > > > - * - start - virtual start address of region > > > > - * - end - virtual end address of region > > > > + * - start - virtual start address of region > > > > + * - end - virtual end address of region > > > > + * - needs_uaccess - (macro parameter) might access user space memory > > > > */ > > > > -SYM_FUNC_START(__flush_icache_range) > > > > - /* FALLTHROUGH */ > > > > - > > > > -/* > > > > - * __flush_cache_user_range(start,end) > > > > - * > > > > - * Ensure that the I and D caches are coherent within specified region. > > > > - * This is typically used when code has been written to a memory region, > > > > - * and will be executed. 
> > > > - * > > > > - * - start - virtual start address of region > > > > - * - end - virtual end address of region > > > > - */ > > > > -SYM_FUNC_START(__flush_cache_user_range) > > > > +.macro __flush_cache_range, needs_uaccess > > > > + .if \needs_uaccess > > > > uaccess_ttbr0_enable x2, x3, x4 > > > > + .endif > > > > alternative_if ARM64_HAS_CACHE_IDC > > > > dsb ishst > > > > b 7f > > > > @@ -47,7 +37,11 @@ alternative_else_nop_endif > > > > sub x3, x2, #1 > > > > bic x4, x0, x3 > > > > 1: > > > > + .if \needs_uaccess > > > > user_alt 9f, "dc cvau, x4", "dc civac, x4", ARM64_WORKAROUND_CLEAN_CACHE > > > > + .else > > > > +alternative_insn "dc cvau, x4", "dc civac, x4", ARM64_WORKAROUND_CLEAN_CACHE > > > > + .endif > > > > add x4, x4, x2 > > > > cmp x4, x1 > > > > b.lo 1b > > > > @@ -58,15 +52,47 @@ alternative_if ARM64_HAS_CACHE_DIC > > > > isb > > > > b 8f > > > > alternative_else_nop_endif > > > > - invalidate_icache_by_line x0, x1, x2, x3, 9f > > > > + invalidate_icache_by_line x0, x1, x2, x3, \needs_uaccess, 9f > > > > 8: mov x0, #0 > > > > 1: > > > > + .if \needs_uaccess > > > > uaccess_ttbr0_disable x1, x2 > > > > + .endif > > > > ret > > > > + > > > > + .if \needs_uaccess > > > > 9: > > > > mov x0, #-EFAULT > > > > b 1b > > > > + .endif > > > > +.endm > > > > + > > > > +/* > > > > + * flush_icache_range(start,end) > > > > + * > > > > + * Ensure that the I and D caches are coherent within specified region. > > > > + * This is typically used when code has been written to a memory region, > > > > + * and will be executed. > > > > + * > > > > + * - start - virtual start address of region > > > > + * - end - virtual end address of region > > > > + */ > > > > +SYM_FUNC_START(__flush_icache_range) > > > > + __flush_cache_range needs_uaccess=0 > > > > SYM_FUNC_END(__flush_icache_range) > > > > + > > > > +/* > > > > + * __flush_cache_user_range(start,end) > > > > + * > > > > + * Ensure that the I and D caches are coherent within specified region. 
> > > > + * This is typically used when code has been written to a memory region, > > > > + * and will be executed. > > > > + * > > > > + * - start - virtual start address of region > > > > + * - end - virtual end address of region > > > > + */ > > > > +SYM_FUNC_START(__flush_cache_user_range) > > > > + __flush_cache_range needs_uaccess=1 > > > > SYM_FUNC_END(__flush_cache_user_range) > > > > > > > > /* > > > > @@ -86,7 +112,7 @@ alternative_else_nop_endif > > > > > > > > uaccess_ttbr0_enable x2, x3, x4 > > > > > > > > - invalidate_icache_by_line x0, x1, x2, x3, 2f > > > > + invalidate_icache_by_line x0, x1, x2, x3, 1, 2f > > > > mov x0, xzr > > > > 1: > > > > uaccess_ttbr0_disable x1, x2 > > > > -- > > > > 2.31.1.607.g51e8a6a459-goog > > > >
On Wed, May 12, 2021 at 11:29:53AM +0100, Fuad Tabba wrote: > Hi Mark, > > On Wed, May 12, 2021 at 10:59 AM Mark Rutland <mark.rutland@arm.com> wrote: > > > > On Wed, May 12, 2021 at 09:52:28AM +0100, Fuad Tabba wrote: > > > Hi Mark, > > > > > > > > No functional change intended. > > > > > > > > There is a performance change here, since the existing > > > > `__flush_cache_user_range` takes IDC and DIC into account, whereas > > > > `invalidate_icache_by_line` does not. > > > > > > You're right. There is a performance change in this patch and a couple > > > of the others, which I will note in v2. However, I don't think that > > > this patch changes the behavior when it comes to IDC and DIC, does it? > > > > It shouldn't be a functional problem, but it means that the new > > `__flush_icache_range` will always perform redundant I-cache maintenance > > rather than skipping this when the cpu has DIC=1. > > Sorry, but I can't quite see how this patch is making a difference in > that regard. The existing code has __flush_icache_range fallthrough to > __flush_cache_user_range, where the alternative_if > ARM64_HAS_CACHE_{IDC,DIC} are invoked. > > In this patch, __flush_icache_range and __flush_cache_user_range share > the same code via the macro, where the alternative_ifs and branches > over invalidate_icache_by_line are still there and behave the same: > the macro jumps to 8 if ARM64_HAS_CACHE_DIC, avoiding any redundant > cache maintenance. > > Am I missing something else? No; you're absolutely right. I had misread the patch and thought the IDC/DIC parts didn't go into the common macro. That all looks fine. Sorry again for the noise. Thanks, Mark.
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 8418c1bd8f04..6ff7a3a3b238 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -426,16 +426,21 @@ alternative_endif * Macro to perform an instruction cache maintenance for the interval * [start, end) * - * start, end: virtual addresses describing the region - * label: A label to branch to on user fault. - * Corrupts: tmp1, tmp2 + * start, end: virtual addresses describing the region + * needs_uaccess: might access user space memory + * label: label to branch to on user fault (if needs_uaccess) + * Corrupts: tmp1, tmp2 */ - .macro invalidate_icache_by_line start, end, tmp1, tmp2, label + .macro invalidate_icache_by_line start, end, tmp1, tmp2, needs_uaccess, label icache_line_size \tmp1, \tmp2 sub \tmp2, \tmp1, #1 bic \tmp2, \start, \tmp2 9997: + .if \needs_uaccess USER(\label, ic ivau, \tmp2) // invalidate I line PoU + .else + ic ivau, \tmp2 + .endif add \tmp2, \tmp2, \tmp1 cmp \tmp2, \end b.lo 9997b diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index 2d881f34dd9d..092f73acdf9a 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -15,30 +15,20 @@ #include <asm/asm-uaccess.h> /* - * flush_icache_range(start,end) + * __flush_cache_range(start,end) [needs_uaccess] * * Ensure that the I and D caches are coherent within specified region. * This is typically used when code has been written to a memory region, * and will be executed. * - * - start - virtual start address of region - * - end - virtual end address of region + * - start - virtual start address of region + * - end - virtual end address of region + * - needs_uaccess - (macro parameter) might access user space memory */ -SYM_FUNC_START(__flush_icache_range) - /* FALLTHROUGH */ - -/* - * __flush_cache_user_range(start,end) - * - * Ensure that the I and D caches are coherent within specified region. 
- * This is typically used when code has been written to a memory region, - * and will be executed. - * - * - start - virtual start address of region - * - end - virtual end address of region - */ -SYM_FUNC_START(__flush_cache_user_range) +.macro __flush_cache_range, needs_uaccess + .if \needs_uaccess uaccess_ttbr0_enable x2, x3, x4 + .endif alternative_if ARM64_HAS_CACHE_IDC dsb ishst b 7f @@ -47,7 +37,11 @@ alternative_else_nop_endif sub x3, x2, #1 bic x4, x0, x3 1: + .if \needs_uaccess user_alt 9f, "dc cvau, x4", "dc civac, x4", ARM64_WORKAROUND_CLEAN_CACHE + .else +alternative_insn "dc cvau, x4", "dc civac, x4", ARM64_WORKAROUND_CLEAN_CACHE + .endif add x4, x4, x2 cmp x4, x1 b.lo 1b @@ -58,15 +52,47 @@ alternative_if ARM64_HAS_CACHE_DIC isb b 8f alternative_else_nop_endif - invalidate_icache_by_line x0, x1, x2, x3, 9f + invalidate_icache_by_line x0, x1, x2, x3, \needs_uaccess, 9f 8: mov x0, #0 1: + .if \needs_uaccess uaccess_ttbr0_disable x1, x2 + .endif ret + + .if \needs_uaccess 9: mov x0, #-EFAULT b 1b + .endif +.endm + +/* + * flush_icache_range(start,end) + * + * Ensure that the I and D caches are coherent within specified region. + * This is typically used when code has been written to a memory region, + * and will be executed. + * + * - start - virtual start address of region + * - end - virtual end address of region + */ +SYM_FUNC_START(__flush_icache_range) + __flush_cache_range needs_uaccess=0 SYM_FUNC_END(__flush_icache_range) + +/* + * __flush_cache_user_range(start,end) + * + * Ensure that the I and D caches are coherent within specified region. + * This is typically used when code has been written to a memory region, + * and will be executed. 
+ * + * - start - virtual start address of region + * - end - virtual end address of region + */ +SYM_FUNC_START(__flush_cache_user_range) + __flush_cache_range needs_uaccess=1 SYM_FUNC_END(__flush_cache_user_range) /* @@ -86,7 +112,7 @@ alternative_else_nop_endif uaccess_ttbr0_enable x2, x3, x4 - invalidate_icache_by_line x0, x1, x2, x3, 2f + invalidate_icache_by_line x0, x1, x2, x3, 1, 2f mov x0, xzr 1: uaccess_ttbr0_disable x1, x2
__flush_icache_range works on the kernel linear map, and doesn't need uaccess. The existing code is a side-effect of its current implementation with __flush_cache_user_range fallthrough. Instead of fallthrough to share the code, use a common macro for the two where the caller can specify whether user-space access is needed. No functional change intended. Reported-by: Catalin Marinas <catalin.marinas@arm.com> Reported-by: Will Deacon <will@kernel.org> Link: https://lore.kernel.org/linux-arch/20200511110014.lb9PEahJ4hVOYrbwIb_qUHXyNy9KQzNFdb_I3YlzY6A@z/ Signed-off-by: Fuad Tabba <tabba@google.com> --- arch/arm64/include/asm/assembler.h | 13 ++++-- arch/arm64/mm/cache.S | 64 +++++++++++++++++++++--------- 2 files changed, 54 insertions(+), 23 deletions(-)