Message ID | 20250128101306.1475491-1-julian@outer-limits.org (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | [v2] sh: Remove IO memcpy and memset from sh code | expand |
On Tue, Jan 28, 2025, at 11:13, Julian Vetter wrote: > Remove IO memcpy and memset from sh specific code and fall back to the > new implementations from lib/iomem_copy.c. They use word accesses if the > buffers are aligned and only fall back to byte accesses for potentially > unaligned parts of a buffer. > > Signed-off-by: Julian Vetter <julian@outer-limits.org> > --- > Changes for V2: > - Removed also SH4 specific memcpy_fromio code Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Hi Julian, On Tue, 2025-01-28 at 11:13 +0100, Julian Vetter wrote: > Remove IO memcpy and memset from sh specific code and fall back to the > new implementations from lib/iomem_copy.c. They use word accesses if the > buffers are aligned and only fall back to byte accesses for potentially > unaligned parts of a buffer. > > Signed-off-by: Julian Vetter <julian@outer-limits.org> > --- > Changes for V2: > - Removed also SH4 specific memcpy_fromio code > --- > arch/sh/include/asm/io.h | 8 --- > arch/sh/kernel/io.c | 111 --------------------------------------- > 2 files changed, 119 deletions(-) > delete mode 100644 arch/sh/kernel/io.c > > diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h > index cf5eab840d57..1e33a1c8b72d 100644 > --- a/arch/sh/include/asm/io.h > +++ b/arch/sh/include/asm/io.h > @@ -268,14 +268,6 @@ __BUILD_IOPORT_STRING(q, u64) > > #define IO_SPACE_LIMIT 0xffffffff > > -/* We really want to try and get these to memcpy etc */ > -#define memset_io memset_io > -#define memcpy_fromio memcpy_fromio > -#define memcpy_toio memcpy_toio > -void memcpy_fromio(void *, const volatile void __iomem *, unsigned long); > -void memcpy_toio(volatile void __iomem *, const void *, unsigned long); > -void memset_io(volatile void __iomem *, int, unsigned long); > - > /* Quad-word real-mode I/O, don't ask.. */ > unsigned long long peek_real_address_q(unsigned long long addr); > unsigned long long poke_real_address_q(unsigned long long addr, > diff --git a/arch/sh/kernel/io.c b/arch/sh/kernel/io.c > deleted file mode 100644 > index da22f3b32d30..000000000000 > --- a/arch/sh/kernel/io.c > +++ /dev/null > @@ -1,111 +0,0 @@ > -// SPDX-License-Identifier: GPL-2.0 > -/* > - * arch/sh/kernel/io.c - Machine independent I/O functions. > - * > - * Copyright (C) 2000 - 2009 Stuart Menefy > - * Copyright (C) 2005 Paul Mundt > - */ > -#include <linux/module.h> > -#include <linux/pci.h> > -#include <asm/machvec.h> > -#include <asm/io.h> > - > -/* > - * Copy data from IO memory space to "real" memory space. > - */ > -void memcpy_fromio(void *to, const volatile void __iomem *from, unsigned long count) > -{ > - /* > - * Would it be worthwhile doing byte and long transfers first > - * to try and get aligned? > - */ > -#ifdef CONFIG_CPU_SH4 > - if ((count >= 0x20) && > - (((u32)to & 0x1f) == 0) && (((u32)from & 0x3) == 0)) { > - int tmp2, tmp3, tmp4, tmp5, tmp6; > - > - __asm__ __volatile__( > - "1: \n\t" > - "mov.l @%7+, r0 \n\t" > - "mov.l @%7+, %2 \n\t" > - "movca.l r0, @%0 \n\t" > - "mov.l @%7+, %3 \n\t" > - "mov.l @%7+, %4 \n\t" > - "mov.l @%7+, %5 \n\t" > - "mov.l @%7+, %6 \n\t" > - "mov.l @%7+, r7 \n\t" > - "mov.l @%7+, r0 \n\t" > - "mov.l %2, @(0x04,%0) \n\t" > - "mov #0x20, %2 \n\t" > - "mov.l %3, @(0x08,%0) \n\t" > - "sub %2, %1 \n\t" > - "mov.l %4, @(0x0c,%0) \n\t" > - "cmp/hi %1, %2 ! T if 32 > count \n\t" > - "mov.l %5, @(0x10,%0) \n\t" > - "mov.l %6, @(0x14,%0) \n\t" > - "mov.l r7, @(0x18,%0) \n\t" > - "mov.l r0, @(0x1c,%0) \n\t" > - "bf.s 1b \n\t" > - " add #0x20, %0 \n\t" > - : "=&r" (to), "=&r" (count), > - "=&r" (tmp2), "=&r" (tmp3), "=&r" (tmp4), > - "=&r" (tmp5), "=&r" (tmp6), "=&r" (from) > - : "7"(from), "0" (to), "1" (count) > - : "r0", "r7", "t", "memory"); > - } > -#endif > - > - if ((((u32)to | (u32)from) & 0x3) == 0) { > - for (; count > 3; count -= 4) { > - *(u32 *)to = *(volatile u32 *)from; > - to += 4; > - from += 4; > - } > - } > - > - for (; count > 0; count--) { > - *(u8 *)to = *(volatile u8 *)from; > - to++; > - from++; > - } > - > - mb(); > -} > -EXPORT_SYMBOL(memcpy_fromio); > - > -/* > - * Copy data from "real" memory space to IO memory space. > - */ > -void memcpy_toio(volatile void __iomem *to, const void *from, unsigned long count) > -{ > - if ((((u32)to | (u32)from) & 0x3) == 0) { > - for ( ; count > 3; count -= 4) { > - *(volatile u32 *)to = *(u32 *)from; > - to += 4; > - from += 4; > - } > - } > - > - for (; count > 0; count--) { > - *(volatile u8 *)to = *(u8 *)from; > - to++; > - from++; > - } > - > - mb(); > -} > -EXPORT_SYMBOL(memcpy_toio); > - > -/* > - * "memset" on IO memory space. > - * This needs to be optimized. > - */ > -void memset_io(volatile void __iomem *dst, int c, unsigned long count) > -{ > - while (count) { > - count--; > - writeb(c, dst); > - dst++; > - } > -} > -EXPORT_SYMBOL(memset_io); I'm not sure that I understand the motivation to remove hand-optimized sh4 assembler code for memset and drop it in favor of potentially slower generic C code. What is the reasoning behind this? Also, it seems that this patch would make your other patch "sh: Remove memset_io from sh specific code" obsolete. Geert, do you have any opinions on this patch? Thanks, Adrian
Hi Adrian, On Thu, 30 Jan 2025 at 09:44, John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de> wrote: > On Tue, 2025-01-28 at 11:13 +0100, Julian Vetter wrote: > > Remove IO memcpy and memset from sh specific code and fall back to the > > new implementations from lib/iomem_copy.c. They use word accesses if the > > buffers are aligned and only fall back to byte accesses for potentially > > unaligned parts of a buffer. > > > > Signed-off-by: Julian Vetter <julian@outer-limits.org> > > --- > > Changes for V2: > > - Removed also SH4 specific memcpy_fromio code > I'm not sure that I understand the motivation to remove hand-optimized sh4 assembler > code for memset and drop it in favor of potentially slower generic C code. What is > the reasoning behind this? See Arnd's feedback on v1 https://lore.kernel.org/all/ffe019a1-11b4-4ad7-bbe2-8ef3e01ffeb0@app.fastmail.com > Also, it seems that this patch would make your other patch > > "sh: Remove memset_io from sh specific code" > > obsolete. Yeah, that should have been mentioned under the ---. Gr{oetje,eeting}s, Geert
Hi Geert, On Thu, 2025-01-30 at 10:13 +0100, Geert Uytterhoeven wrote: > Hi Adrian, > > On Thu, 30 Jan 2025 at 09:44, John Paul Adrian Glaubitz > <glaubitz@physik.fu-berlin.de> wrote: > > On Tue, 2025-01-28 at 11:13 +0100, Julian Vetter wrote: > > > Remove IO memcpy and memset from sh specific code and fall back to the > > > new implementations from lib/iomem_copy.c. They use word accesses if the > > > buffers are aligned and only fall back to byte accesses for potentially > > > unaligned parts of a buffer. > > > > > > Signed-off-by: Julian Vetter <julian@outer-limits.org> > > > --- > > > Changes for V2: > > > - Removed also SH4 specific memcpy_fromio code > > > I'm not sure that I understand the motivation to remove hand-optimized sh4 assembler > > code for memset and drop it in favor of potentially slower generic C code. What is > > the reasoning behind this? > > See Arnd's feedback on v1 > https://lore.kernel.org/all/ffe019a1-11b4-4ad7-bbe2-8ef3e01ffeb0@app.fastmail.com I read Arnd's feedback but I don't really know whether GCC produces better code than this hand-written assembly. Is there any compelling argument? I'm just worried we would slow down something as fundamental as memset(). > > Also, it seems that this patch would make your other patch > > > > "sh: Remove memset_io from sh specific code" > > > > obsolete. > > Yeah, that should have been mentioned under the ---. Thought so. Adrian
Hi Adrian, On Thu, 30 Jan 2025 at 10:31, John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de> wrote: > On Thu, 2025-01-30 at 10:13 +0100, Geert Uytterhoeven wrote: > > On Thu, 30 Jan 2025 at 09:44, John Paul Adrian Glaubitz > > <glaubitz@physik.fu-berlin.de> wrote: > > > On Tue, 2025-01-28 at 11:13 +0100, Julian Vetter wrote: > > > > Remove IO memcpy and memset from sh specific code and fall back to the > > > > new implementations from lib/iomem_copy.c. They use word accesses if the > > > > buffers are aligned and only fall back to byte accesses for potentially > > > > unaligned parts of a buffer. > > > > > > > > Signed-off-by: Julian Vetter <julian@outer-limits.org> > > > > --- > > > > Changes for V2: > > > > - Removed also SH4 specific memcpy_fromio code > > > > > I'm not sure that I understand the motivation to remove hand-optimized sh4 assembler > > > code for memset and drop it in favor of potentially slower generic C code. What is > > > the reasoning behind this? > > > > See Arnd's feedback on v1 > > https://lore.kernel.org/all/ffe019a1-11b4-4ad7-bbe2-8ef3e01ffeb0@app.fastmail.com > > I read Arnd's feedback but I don't really know whether GCC produces better code than > this hand-written assembly. Is there any compelling argument? > > I'm just worried we would slow down something as fundamental as memset(). it's not memset(), but memset_io(), i.e. clearing (slow) mapped I/O memory. Gr{oetje,eeting}s, Geert
Hi Geert, On Thu, 2025-01-30 at 10:35 +0100, Geert Uytterhoeven wrote: > Hi Adrian, > > On Thu, 30 Jan 2025 at 10:31, John Paul Adrian Glaubitz > <glaubitz@physik.fu-berlin.de> wrote: > > On Thu, 2025-01-30 at 10:13 +0100, Geert Uytterhoeven wrote: > > > On Thu, 30 Jan 2025 at 09:44, John Paul Adrian Glaubitz > > > <glaubitz@physik.fu-berlin.de> wrote: > > > > On Tue, 2025-01-28 at 11:13 +0100, Julian Vetter wrote: > > > > > Remove IO memcpy and memset from sh specific code and fall back to the > > > > > new implementations from lib/iomem_copy.c. They use word accesses if the > > > > > buffers are aligned and only fall back to byte accesses for potentially > > > > > unaligned parts of a buffer. > > > > > > > > > > Signed-off-by: Julian Vetter <julian@outer-limits.org> > > > > > --- > > > > > Changes for V2: > > > > > - Removed also SH4 specific memcpy_fromio code > > > > > > > I'm not sure that I understand the motivation to remove hand-optimized sh4 assembler > > > > code for memset and drop it in favor of potentially slower generic C code. What is > > > > the reasoning behind this? > > > > > > See Arnd's feedback on v1 > > > https://lore.kernel.org/all/ffe019a1-11b4-4ad7-bbe2-8ef3e01ffeb0@app.fastmail.com > > > > I read Arnd's feedback but I don't really know whether GCC produces better code than > > this hand-written assembly. Is there any compelling argument? > > > > I'm just worried we would slow down something as fundamental as memset(). > > it's not memset(), but memset_io(), i.e. clearing (slow) mapped I/O memory. Could you review the patch as well, so I have some peace of mind? ;-) Thanks, Adrian
Hi Julian, On Tue, 28 Jan 2025 at 16:22, Julian Vetter <julian@outer-limits.org> wrote: > Remove IO memcpy and memset from sh specific code and fall back to the > new implementations from lib/iomem_copy.c. They use word accesses if the > buffers are aligned and only fall back to byte accesses for potentially > unaligned parts of a buffer. > > Signed-off-by: Julian Vetter <julian@outer-limits.org> > --- > Changes for V2: > - Removed also SH4 specific memcpy_fromio code Thanks for the update! > --- a/arch/sh/kernel/io.c > +++ /dev/null > @@ -1,111 +0,0 @@ > -// SPDX-License-Identifier: GPL-2.0 > -/* > - * arch/sh/kernel/io.c - Machine independent I/O functions. > - * > - * Copyright (C) 2000 - 2009 Stuart Menefy > - * Copyright (C) 2005 Paul Mundt > - */ > -#include <linux/module.h> > -#include <linux/pci.h> > -#include <asm/machvec.h> > -#include <asm/io.h> > - > -/* > - * Copy data from IO memory space to "real" memory space. > - */ > -void memcpy_fromio(void *to, const volatile void __iomem *from, unsigned long count) > -{ [...] > - mb(); > -} > -EXPORT_SYMBOL(memcpy_fromio); > - > -/* > - * Copy data from "real" memory space to IO memory space. > - */ > -void memcpy_toio(volatile void __iomem *to, const void *from, unsigned long count) > -{ [...] > - > - mb(); > -} > -EXPORT_SYMBOL(memcpy_toio); LGTM. My only worry is the removal of the mb() calls, cfr. the scary warning at https://elixir.bootlin.com/linux/v6.13/source/arch/sh/include/asm/barrier.h#L13 Gr{oetje,eeting}s, Geert
diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h index cf5eab840d57..1e33a1c8b72d 100644 --- a/arch/sh/include/asm/io.h +++ b/arch/sh/include/asm/io.h @@ -268,14 +268,6 @@ __BUILD_IOPORT_STRING(q, u64) #define IO_SPACE_LIMIT 0xffffffff -/* We really want to try and get these to memcpy etc */ -#define memset_io memset_io -#define memcpy_fromio memcpy_fromio -#define memcpy_toio memcpy_toio -void memcpy_fromio(void *, const volatile void __iomem *, unsigned long); -void memcpy_toio(volatile void __iomem *, const void *, unsigned long); -void memset_io(volatile void __iomem *, int, unsigned long); - /* Quad-word real-mode I/O, don't ask.. */ unsigned long long peek_real_address_q(unsigned long long addr); unsigned long long poke_real_address_q(unsigned long long addr, diff --git a/arch/sh/kernel/io.c b/arch/sh/kernel/io.c deleted file mode 100644 index da22f3b32d30..000000000000 --- a/arch/sh/kernel/io.c +++ /dev/null @@ -1,111 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * arch/sh/kernel/io.c - Machine independent I/O functions. - * - * Copyright (C) 2000 - 2009 Stuart Menefy - * Copyright (C) 2005 Paul Mundt - */ -#include <linux/module.h> -#include <linux/pci.h> -#include <asm/machvec.h> -#include <asm/io.h> - -/* - * Copy data from IO memory space to "real" memory space. - */ -void memcpy_fromio(void *to, const volatile void __iomem *from, unsigned long count) -{ - /* - * Would it be worthwhile doing byte and long transfers first - * to try and get aligned? - */ -#ifdef CONFIG_CPU_SH4 - if ((count >= 0x20) && - (((u32)to & 0x1f) == 0) && (((u32)from & 0x3) == 0)) { - int tmp2, tmp3, tmp4, tmp5, tmp6; - - __asm__ __volatile__( - "1: \n\t" - "mov.l @%7+, r0 \n\t" - "mov.l @%7+, %2 \n\t" - "movca.l r0, @%0 \n\t" - "mov.l @%7+, %3 \n\t" - "mov.l @%7+, %4 \n\t" - "mov.l @%7+, %5 \n\t" - "mov.l @%7+, %6 \n\t" - "mov.l @%7+, r7 \n\t" - "mov.l @%7+, r0 \n\t" - "mov.l %2, @(0x04,%0) \n\t" - "mov #0x20, %2 \n\t" - "mov.l %3, @(0x08,%0) \n\t" - "sub %2, %1 \n\t" - "mov.l %4, @(0x0c,%0) \n\t" - "cmp/hi %1, %2 ! T if 32 > count \n\t" - "mov.l %5, @(0x10,%0) \n\t" - "mov.l %6, @(0x14,%0) \n\t" - "mov.l r7, @(0x18,%0) \n\t" - "mov.l r0, @(0x1c,%0) \n\t" - "bf.s 1b \n\t" - " add #0x20, %0 \n\t" - : "=&r" (to), "=&r" (count), - "=&r" (tmp2), "=&r" (tmp3), "=&r" (tmp4), - "=&r" (tmp5), "=&r" (tmp6), "=&r" (from) - : "7"(from), "0" (to), "1" (count) - : "r0", "r7", "t", "memory"); - } -#endif - - if ((((u32)to | (u32)from) & 0x3) == 0) { - for (; count > 3; count -= 4) { - *(u32 *)to = *(volatile u32 *)from; - to += 4; - from += 4; - } - } - - for (; count > 0; count--) { - *(u8 *)to = *(volatile u8 *)from; - to++; - from++; - } - - mb(); -} -EXPORT_SYMBOL(memcpy_fromio); - -/* - * Copy data from "real" memory space to IO memory space. - */ -void memcpy_toio(volatile void __iomem *to, const void *from, unsigned long count) -{ - if ((((u32)to | (u32)from) & 0x3) == 0) { - for ( ; count > 3; count -= 4) { - *(volatile u32 *)to = *(u32 *)from; - to += 4; - from += 4; - } - } - - for (; count > 0; count--) { - *(volatile u8 *)to = *(u8 *)from; - to++; - from++; - } - - mb(); -} -EXPORT_SYMBOL(memcpy_toio); - -/* - * "memset" on IO memory space. - * This needs to be optimized. - */ -void memset_io(volatile void __iomem *dst, int c, unsigned long count) -{ - while (count) { - count--; - writeb(c, dst); - dst++; - } -} -EXPORT_SYMBOL(memset_io);
Remove IO memcpy and memset from sh specific code and fall back to the new implementations from lib/iomem_copy.c. They use word accesses if the buffers are aligned and only fall back to byte accesses for potentially unaligned parts of a buffer. Signed-off-by: Julian Vetter <julian@outer-limits.org> --- Changes for V2: - Removed also SH4 specific memcpy_fromio code --- arch/sh/include/asm/io.h | 8 --- arch/sh/kernel/io.c | 111 --------------------------------------- 2 files changed, 119 deletions(-) delete mode 100644 arch/sh/kernel/io.c