
[v2] sh: Remove IO memcpy and memset from sh code

Message ID 20250128101306.1475491-1-julian@outer-limits.org (mailing list archive)
State New
Series [v2] sh: Remove IO memcpy and memset from sh code

Commit Message

Julian Vetter Jan. 28, 2025, 10:13 a.m. UTC
Remove IO memcpy and memset from sh specific code and fall back to the
new implementations from lib/iomem_copy.c. They use word accesses if the
buffers are aligned and only fall back to byte accesses for potentially
unaligned parts of a buffer.

Signed-off-by: Julian Vetter <julian@outer-limits.org>
---
Changes for V2:
- Removed also SH4 specific memcpy_fromio code
---
 arch/sh/include/asm/io.h |   8 ---
 arch/sh/kernel/io.c      | 111 ---------------------------------------
 2 files changed, 119 deletions(-)
 delete mode 100644 arch/sh/kernel/io.c
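
For readers who have not looked at the generic helpers, the strategy described in the commit message can be sketched roughly as below. This is a simplified illustration only, assuming standard kernel helpers (IS_ALIGNED, __raw_readb, __raw_readl); the actual code in lib/iomem_copy.c may differ in detail (for example it may operate on the native word size).

#include <linux/align.h>
#include <linux/io.h>
#include <linux/types.h>

/* Simplified sketch of the aligned-word / byte-fallback copy strategy. */
static void sketch_memcpy_fromio(void *to, const volatile void __iomem *from,
				 size_t count)
{
	/* Byte accesses until both pointers are 32-bit aligned (or count runs out). */
	while (count && (!IS_ALIGNED((unsigned long)to, 4) ||
			 !IS_ALIGNED((unsigned long)from, 4))) {
		*(u8 *)to = __raw_readb(from);
		to++;
		from++;
		count--;
	}

	/* Word accesses for the aligned middle of the buffer. */
	while (count >= 4) {
		*(u32 *)to = __raw_readl(from);
		to += 4;
		from += 4;
		count -= 4;
	}

	/* Byte accesses for the unaligned tail. */
	while (count--) {
		*(u8 *)to = __raw_readb(from);
		to++;
		from++;
	}
}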

Comments

Arnd Bergmann Jan. 28, 2025, 10:21 a.m. UTC | #1
On Tue, Jan 28, 2025, at 11:13, Julian Vetter wrote:
> Remove IO memcpy and memset from sh specific code and fall back to the
> new implementations from lib/iomem_copy.c. They use word accesses if the
> buffers are aligned and only fall back to byte accesses for potentially
> unaligned parts of a buffer.
>
> Signed-off-by: Julian Vetter <julian@outer-limits.org>
> ---
> Changes for V2:
> - Removed also SH4 specific memcpy_fromio code

Reviewed-by: Arnd Bergmann <arnd@arndb.de>
John Paul Adrian Glaubitz Jan. 30, 2025, 8:44 a.m. UTC | #2
Hi Julian,

On Tue, 2025-01-28 at 11:13 +0100, Julian Vetter wrote:
> Remove IO memcpy and memset from sh specific code and fall back to the
> new implementations from lib/iomem_copy.c. They use word accesses if the
> buffers are aligned and only fall back to byte accesses for potentially
> unaligned parts of a buffer.
> 
> Signed-off-by: Julian Vetter <julian@outer-limits.org>
> ---
> Changes for V2:
> - Removed also SH4 specific memcpy_fromio code
> ---
>  arch/sh/include/asm/io.h |   8 ---
>  arch/sh/kernel/io.c      | 111 ---------------------------------------
>  2 files changed, 119 deletions(-)
>  delete mode 100644 arch/sh/kernel/io.c
> 
> diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h
> index cf5eab840d57..1e33a1c8b72d 100644
> --- a/arch/sh/include/asm/io.h
> +++ b/arch/sh/include/asm/io.h
> @@ -268,14 +268,6 @@ __BUILD_IOPORT_STRING(q, u64)
>  
>  #define IO_SPACE_LIMIT 0xffffffff
>  
> -/* We really want to try and get these to memcpy etc */
> -#define memset_io memset_io
> -#define memcpy_fromio memcpy_fromio
> -#define memcpy_toio memcpy_toio
> -void memcpy_fromio(void *, const volatile void __iomem *, unsigned long);
> -void memcpy_toio(volatile void __iomem *, const void *, unsigned long);
> -void memset_io(volatile void __iomem *, int, unsigned long);
> -
>  /* Quad-word real-mode I/O, don't ask.. */
>  unsigned long long peek_real_address_q(unsigned long long addr);
>  unsigned long long poke_real_address_q(unsigned long long addr,
> diff --git a/arch/sh/kernel/io.c b/arch/sh/kernel/io.c
> deleted file mode 100644
> index da22f3b32d30..000000000000
> --- a/arch/sh/kernel/io.c
> +++ /dev/null
> @@ -1,111 +0,0 @@
> -// SPDX-License-Identifier: GPL-2.0
> -/*
> - * arch/sh/kernel/io.c - Machine independent I/O functions.
> - *
> - * Copyright (C) 2000 - 2009  Stuart Menefy
> - * Copyright (C) 2005  Paul Mundt
> - */
> -#include <linux/module.h>
> -#include <linux/pci.h>
> -#include <asm/machvec.h>
> -#include <asm/io.h>
> -
> -/*
> - * Copy data from IO memory space to "real" memory space.
> - */
> -void memcpy_fromio(void *to, const volatile void __iomem *from, unsigned long count)
> -{
> -	/*
> -	 * Would it be worthwhile doing byte and long transfers first
> -	 * to try and get aligned?
> -	 */
> -#ifdef CONFIG_CPU_SH4
> -	if ((count >= 0x20) &&
> -	     (((u32)to & 0x1f) == 0) && (((u32)from & 0x3) == 0)) {
> -		int tmp2, tmp3, tmp4, tmp5, tmp6;
> -
> -		__asm__ __volatile__(
> -			"1:			\n\t"
> -			"mov.l	@%7+, r0	\n\t"
> -			"mov.l	@%7+, %2	\n\t"
> -			"movca.l r0, @%0	\n\t"
> -			"mov.l	@%7+, %3	\n\t"
> -			"mov.l	@%7+, %4	\n\t"
> -			"mov.l	@%7+, %5	\n\t"
> -			"mov.l	@%7+, %6	\n\t"
> -			"mov.l	@%7+, r7	\n\t"
> -			"mov.l	@%7+, r0	\n\t"
> -			"mov.l	%2, @(0x04,%0)	\n\t"
> -			"mov	#0x20, %2	\n\t"
> -			"mov.l	%3, @(0x08,%0)	\n\t"
> -			"sub	%2, %1		\n\t"
> -			"mov.l	%4, @(0x0c,%0)	\n\t"
> -			"cmp/hi	%1, %2		! T if 32 > count	\n\t"
> -			"mov.l	%5, @(0x10,%0)	\n\t"
> -			"mov.l	%6, @(0x14,%0)	\n\t"
> -			"mov.l	r7, @(0x18,%0)	\n\t"
> -			"mov.l	r0, @(0x1c,%0)	\n\t"
> -			"bf.s	1b		\n\t"
> -			" add	#0x20, %0	\n\t"
> -			: "=&r" (to), "=&r" (count),
> -			  "=&r" (tmp2), "=&r" (tmp3), "=&r" (tmp4),
> -			  "=&r" (tmp5), "=&r" (tmp6), "=&r" (from)
> -			: "7"(from), "0" (to), "1" (count)
> -			: "r0", "r7", "t", "memory");
> -	}
> -#endif
> -
> -	if ((((u32)to | (u32)from) & 0x3) == 0) {
> -		for (; count > 3; count -= 4) {
> -			*(u32 *)to = *(volatile u32 *)from;
> -			to += 4;
> -			from += 4;
> -		}
> -	}
> -
> -	for (; count > 0; count--) {
> -		*(u8 *)to = *(volatile u8 *)from;
> -		to++;
> -		from++;
> -	}
> -
> -	mb();
> -}
> -EXPORT_SYMBOL(memcpy_fromio);
> -
> -/*
> - * Copy data from "real" memory space to IO memory space.
> - */
> -void memcpy_toio(volatile void __iomem *to, const void *from, unsigned long count)
> -{
> -	if ((((u32)to | (u32)from) & 0x3) == 0) {
> -		for ( ; count > 3; count -= 4) {
> -			*(volatile u32 *)to = *(u32 *)from;
> -			to += 4;
> -			from += 4;
> -		}
> -	}
> -
> -	for (; count > 0; count--) {
> -		*(volatile u8 *)to = *(u8 *)from;
> -		to++;
> -		from++;
> -	}
> -
> -	mb();
> -}
> -EXPORT_SYMBOL(memcpy_toio);
> -
> -/*
> - * "memset" on IO memory space.
> - * This needs to be optimized.
> - */
> -void memset_io(volatile void __iomem *dst, int c, unsigned long count)
> -{
> -        while (count) {
> -                count--;
> -                writeb(c, dst);
> -                dst++;
> -        }
> -}
> -EXPORT_SYMBOL(memset_io);

I'm not sure that I understand the motivation to remove hand-optimized sh4 assembler
code for memset and drop it in favor of potentially slower generic C code. What is
the reasoning behind this?

Also, it seems that this patch would make your other patch

	"sh: Remove memset_io from sh specific code"

obsolete.

Geert, do you have any opinions on this patch?

Thanks,
Adrian
Geert Uytterhoeven Jan. 30, 2025, 9:13 a.m. UTC | #3
Hi Adrian,

On Thu, 30 Jan 2025 at 09:44, John Paul Adrian Glaubitz
<glaubitz@physik.fu-berlin.de> wrote:
> On Tue, 2025-01-28 at 11:13 +0100, Julian Vetter wrote:
> > Remove IO memcpy and memset from sh specific code and fall back to the
> > new implementations from lib/iomem_copy.c. They use word accesses if the
> > buffers are aligned and only fall back to byte accesses for potentially
> > unaligned parts of a buffer.
> >
> > Signed-off-by: Julian Vetter <julian@outer-limits.org>
> > ---
> > Changes for V2:
> > - Removed also SH4 specific memcpy_fromio code

> I'm not sure that I understand the motivation to remove hand-optimized sh4 assembler
> code for memset and drop it in favor of potentially slower generic C code. What is
> the reasoning behind this?

See Arnd's feedback on v1
https://lore.kernel.org/all/ffe019a1-11b4-4ad7-bbe2-8ef3e01ffeb0@app.fastmail.com

> Also, it seems that this patch would make your other patch
>
>         "sh: Remove memset_io from sh specific code"
>
> obsolete.

Yeah, that should have been mentioned under the ---.

Gr{oetje,eeting}s,

                        Geert
John Paul Adrian Glaubitz Jan. 30, 2025, 9:31 a.m. UTC | #4
Hi Geert,

On Thu, 2025-01-30 at 10:13 +0100, Geert Uytterhoeven wrote:
> Hi Adrian,
> 
> On Thu, 30 Jan 2025 at 09:44, John Paul Adrian Glaubitz
> <glaubitz@physik.fu-berlin.de> wrote:
> > On Tue, 2025-01-28 at 11:13 +0100, Julian Vetter wrote:
> > > Remove IO memcpy and memset from sh specific code and fall back to the
> > > new implementations from lib/iomem_copy.c. They use word accesses if the
> > > buffers are aligned and only fall back to byte accesses for potentially
> > > unaligned parts of a buffer.
> > > 
> > > Signed-off-by: Julian Vetter <julian@outer-limits.org>
> > > ---
> > > Changes for V2:
> > > - Removed also SH4 specific memcpy_fromio code
> 
> > I'm not sure that I understand the motivation to remove hand-optimized sh4 assembler
> > code for memset and drop it in favor of potentially slower generic C code. What is
> > the reasoning behind this?
> 
> See Arnd's feedback on v1
> https://lore.kernel.org/all/ffe019a1-11b4-4ad7-bbe2-8ef3e01ffeb0@app.fastmail.com

I read Arnd's feedback but I don't really know whether GCC produces better code than
this hand-written assembly. Is there any compelling argument?

I'm just worried we would slow down something as fundamental as memset().

> > Also, it seems that this patch would make your other patch
> > 
> >         "sh: Remove memset_io from sh specific code"
> > 
> > obsolete.
> 
> Yeah, that should have been mentioned under the ---.

Thought so.

Adrian
Geert Uytterhoeven Jan. 30, 2025, 9:35 a.m. UTC | #5
Hi Adrian,

On Thu, 30 Jan 2025 at 10:31, John Paul Adrian Glaubitz
<glaubitz@physik.fu-berlin.de> wrote:
> On Thu, 2025-01-30 at 10:13 +0100, Geert Uytterhoeven wrote:
> > On Thu, 30 Jan 2025 at 09:44, John Paul Adrian Glaubitz
> > <glaubitz@physik.fu-berlin.de> wrote:
> > > On Tue, 2025-01-28 at 11:13 +0100, Julian Vetter wrote:
> > > > Remove IO memcpy and memset from sh specific code and fall back to the
> > > > new implementations from lib/iomem_copy.c. They use word accesses if the
> > > > buffers are aligned and only fall back to byte accesses for potentially
> > > > unaligned parts of a buffer.
> > > >
> > > > Signed-off-by: Julian Vetter <julian@outer-limits.org>
> > > > ---
> > > > Changes for V2:
> > > > - Removed also SH4 specific memcpy_fromio code
> >
> > > I'm not sure that I understand the motivation to remove hand-optimized sh4 assembler
> > > code for memset and drop it in favor of potentially slower generic C code. What is
> > > the reasoning behind this?
> >
> > See Arnd's feedback on v1
> > https://lore.kernel.org/all/ffe019a1-11b4-4ad7-bbe2-8ef3e01ffeb0@app.fastmail.com
>
> I read Arnd's feedback but I don't really know whether GCC produces better code than
> this hand-written assembly. Is there any compelling argument?
>
> I'm just worried we would slow down something as fundamental as memset().

It's not memset(), but memset_io(), i.e. clearing (slow) mapped I/O memory.
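
To make the distinction concrete, a hypothetical illustration (the function, names and layout are invented, not taken from the patch):

#include <linux/io.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/types.h>

/* Hypothetical example only; nothing here comes from the patch. */
static void demo_clear(phys_addr_t mmio_base, size_t len)
{
	void __iomem *regs = ioremap(mmio_base, len);	/* mapped I/O memory */
	char *buf = kmalloc(len, GFP_KERNEL);		/* ordinary cached RAM */

	if (regs) {
		memset_io(regs, 0, len);	/* affected by this patch: MMIO stores */
		iounmap(regs);
	}

	if (buf) {
		memset(buf, 0, len);		/* the hot, cache-backed memset() is untouched */
		kfree(buf);
	}
}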

Gr{oetje,eeting}s,

                        Geert
John Paul Adrian Glaubitz Jan. 30, 2025, 9:52 a.m. UTC | #6
Hi Geert,

On Thu, 2025-01-30 at 10:35 +0100, Geert Uytterhoeven wrote:
> Hi Adrian,
> 
> On Thu, 30 Jan 2025 at 10:31, John Paul Adrian Glaubitz
> <glaubitz@physik.fu-berlin.de> wrote:
> > On Thu, 2025-01-30 at 10:13 +0100, Geert Uytterhoeven wrote:
> > > On Thu, 30 Jan 2025 at 09:44, John Paul Adrian Glaubitz
> > > <glaubitz@physik.fu-berlin.de> wrote:
> > > > On Tue, 2025-01-28 at 11:13 +0100, Julian Vetter wrote:
> > > > > Remove IO memcpy and memset from sh specific code and fall back to the
> > > > > new implementations from lib/iomem_copy.c. They use word accesses if the
> > > > > buffers are aligned and only fall back to byte accesses for potentially
> > > > > unaligned parts of a buffer.
> > > > > 
> > > > > Signed-off-by: Julian Vetter <julian@outer-limits.org>
> > > > > ---
> > > > > Changes for V2:
> > > > > - Removed also SH4 specific memcpy_fromio code
> > > 
> > > > I'm not sure that I understand the motivation to remove hand-optimized sh4 assembler
> > > > code for memset and drop it in favor of potentially slower generic C code. What is
> > > > the reasoning behind this?
> > > 
> > > See Arnd's feedback on v1
> > > https://lore.kernel.org/all/ffe019a1-11b4-4ad7-bbe2-8ef3e01ffeb0@app.fastmail.com
> > 
> > I read Arnd's feedback but I don't really know whether GCC produces better code than
> > this hand-written assembly. Is there any compelling argument?
> > 
> > I'm just worried we would slow down something as fundamental as memset().
> 
> it's not memset(), but memset_io(), i.e. clearing (slow) mapped I/O memory.

Could you review the patch as well, so I have some peace of mind? ;-)

Thanks,
Adrian
Geert Uytterhoeven Jan. 30, 2025, 10:18 a.m. UTC | #7
Hi Julian,

On Tue, 28 Jan 2025 at 16:22, Julian Vetter <julian@outer-limits.org> wrote:
> Remove IO memcpy and memset from sh specific code and fall back to the
> new implementations from lib/iomem_copy.c. They use word accesses if the
> buffers are aligned and only fall back to byte accesses for potentially
> unaligned parts of a buffer.
>
> Signed-off-by: Julian Vetter <julian@outer-limits.org>
> ---
> Changes for V2:
> - Removed also SH4 specific memcpy_fromio code

Thanks for the update!

> --- a/arch/sh/kernel/io.c
> +++ /dev/null
> @@ -1,111 +0,0 @@
> -// SPDX-License-Identifier: GPL-2.0
> -/*
> - * arch/sh/kernel/io.c - Machine independent I/O functions.
> - *
> - * Copyright (C) 2000 - 2009  Stuart Menefy
> - * Copyright (C) 2005  Paul Mundt
> - */
> -#include <linux/module.h>
> -#include <linux/pci.h>
> -#include <asm/machvec.h>
> -#include <asm/io.h>
> -
> -/*
> - * Copy data from IO memory space to "real" memory space.
> - */
> -void memcpy_fromio(void *to, const volatile void __iomem *from, unsigned long count)
> -{

[...]

> -       mb();
> -}
> -EXPORT_SYMBOL(memcpy_fromio);
> -
> -/*
> - * Copy data from "real" memory space to IO memory space.
> - */
> -void memcpy_toio(volatile void __iomem *to, const void *from, unsigned long count)
> -{

[...]

> -
> -       mb();
> -}
> -EXPORT_SYMBOL(memcpy_toio);

LGTM. My only worry is the removal of the mb() calls, cfr. the
scary warning at
https://elixir.bootlin.com/linux/v6.13/source/arch/sh/include/asm/barrier.h#L13
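
A driver that relied on the implicit mb() at the end of the old sh memcpy_toio() could make the ordering explicit at the call site instead. A hypothetical fragment (the struct, register names and values are invented for illustration):

#include <linux/bits.h>
#include <linux/io.h>
#include <linux/types.h>

/* Hypothetical device context; everything below is invented for illustration. */
struct demo_chan {
	void __iomem *fifo;
	void __iomem *regs;
};

#define DEMO_TX_CTRL	0x10
#define DEMO_TX_START	BIT(0)

static void demo_send(struct demo_chan *chan, const void *buf, size_t len)
{
	memcpy_toio(chan->fifo, buf, len);

	/*
	 * The removed sh implementation ended with mb(); a caller that
	 * depended on that ordering before kicking the device can keep
	 * an explicit barrier here.
	 */
	mb();

	writel(DEMO_TX_START, chan->regs + DEMO_TX_CTRL);
}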

Gr{oetje,eeting}s,

                        Geert

Patch

diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h
index cf5eab840d57..1e33a1c8b72d 100644
--- a/arch/sh/include/asm/io.h
+++ b/arch/sh/include/asm/io.h
@@ -268,14 +268,6 @@  __BUILD_IOPORT_STRING(q, u64)
 
 #define IO_SPACE_LIMIT 0xffffffff
 
-/* We really want to try and get these to memcpy etc */
-#define memset_io memset_io
-#define memcpy_fromio memcpy_fromio
-#define memcpy_toio memcpy_toio
-void memcpy_fromio(void *, const volatile void __iomem *, unsigned long);
-void memcpy_toio(volatile void __iomem *, const void *, unsigned long);
-void memset_io(volatile void __iomem *, int, unsigned long);
-
 /* Quad-word real-mode I/O, don't ask.. */
 unsigned long long peek_real_address_q(unsigned long long addr);
 unsigned long long poke_real_address_q(unsigned long long addr,
diff --git a/arch/sh/kernel/io.c b/arch/sh/kernel/io.c
deleted file mode 100644
index da22f3b32d30..000000000000
--- a/arch/sh/kernel/io.c
+++ /dev/null
@@ -1,111 +0,0 @@ 
-// SPDX-License-Identifier: GPL-2.0
-/*
- * arch/sh/kernel/io.c - Machine independent I/O functions.
- *
- * Copyright (C) 2000 - 2009  Stuart Menefy
- * Copyright (C) 2005  Paul Mundt
- */
-#include <linux/module.h>
-#include <linux/pci.h>
-#include <asm/machvec.h>
-#include <asm/io.h>
-
-/*
- * Copy data from IO memory space to "real" memory space.
- */
-void memcpy_fromio(void *to, const volatile void __iomem *from, unsigned long count)
-{
-	/*
-	 * Would it be worthwhile doing byte and long transfers first
-	 * to try and get aligned?
-	 */
-#ifdef CONFIG_CPU_SH4
-	if ((count >= 0x20) &&
-	     (((u32)to & 0x1f) == 0) && (((u32)from & 0x3) == 0)) {
-		int tmp2, tmp3, tmp4, tmp5, tmp6;
-
-		__asm__ __volatile__(
-			"1:			\n\t"
-			"mov.l	@%7+, r0	\n\t"
-			"mov.l	@%7+, %2	\n\t"
-			"movca.l r0, @%0	\n\t"
-			"mov.l	@%7+, %3	\n\t"
-			"mov.l	@%7+, %4	\n\t"
-			"mov.l	@%7+, %5	\n\t"
-			"mov.l	@%7+, %6	\n\t"
-			"mov.l	@%7+, r7	\n\t"
-			"mov.l	@%7+, r0	\n\t"
-			"mov.l	%2, @(0x04,%0)	\n\t"
-			"mov	#0x20, %2	\n\t"
-			"mov.l	%3, @(0x08,%0)	\n\t"
-			"sub	%2, %1		\n\t"
-			"mov.l	%4, @(0x0c,%0)	\n\t"
-			"cmp/hi	%1, %2		! T if 32 > count	\n\t"
-			"mov.l	%5, @(0x10,%0)	\n\t"
-			"mov.l	%6, @(0x14,%0)	\n\t"
-			"mov.l	r7, @(0x18,%0)	\n\t"
-			"mov.l	r0, @(0x1c,%0)	\n\t"
-			"bf.s	1b		\n\t"
-			" add	#0x20, %0	\n\t"
-			: "=&r" (to), "=&r" (count),
-			  "=&r" (tmp2), "=&r" (tmp3), "=&r" (tmp4),
-			  "=&r" (tmp5), "=&r" (tmp6), "=&r" (from)
-			: "7"(from), "0" (to), "1" (count)
-			: "r0", "r7", "t", "memory");
-	}
-#endif
-
-	if ((((u32)to | (u32)from) & 0x3) == 0) {
-		for (; count > 3; count -= 4) {
-			*(u32 *)to = *(volatile u32 *)from;
-			to += 4;
-			from += 4;
-		}
-	}
-
-	for (; count > 0; count--) {
-		*(u8 *)to = *(volatile u8 *)from;
-		to++;
-		from++;
-	}
-
-	mb();
-}
-EXPORT_SYMBOL(memcpy_fromio);
-
-/*
- * Copy data from "real" memory space to IO memory space.
- */
-void memcpy_toio(volatile void __iomem *to, const void *from, unsigned long count)
-{
-	if ((((u32)to | (u32)from) & 0x3) == 0) {
-		for ( ; count > 3; count -= 4) {
-			*(volatile u32 *)to = *(u32 *)from;
-			to += 4;
-			from += 4;
-		}
-	}
-
-	for (; count > 0; count--) {
-		*(volatile u8 *)to = *(u8 *)from;
-		to++;
-		from++;
-	}
-
-	mb();
-}
-EXPORT_SYMBOL(memcpy_toio);
-
-/*
- * "memset" on IO memory space.
- * This needs to be optimized.
- */
-void memset_io(volatile void __iomem *dst, int c, unsigned long count)
-{
-        while (count) {
-                count--;
-                writeb(c, dst);
-                dst++;
-        }
-}
-EXPORT_SYMBOL(memset_io);