diff mbox series

[1/7] drm: Relax alignment constraint for destination address

Message ID 20220222145206.76118-2-balasubramani.vivekanandan@intel.com (mailing list archive)
State New, archived
Headers show
Series drm/i915: Use the memcpy_from_wc function from drm | expand

Commit Message

Vivekanandan, Balasubramani Feb. 22, 2022, 2:52 p.m. UTC
The destination address does not need to be aligned to a 16-byte
boundary in order to use the non-temporal instructions while copying.
Non-temporal instructions are used only for loading from the source
address, and it is only the source address that has an alignment
constraint.
For the destination, we only need to take care to use the right store
instruction, based on whether the destination address is aligned or not.

__memcpy_ntdqu is copied from i915/i915_memcpy.c

Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Maxime Ripard <mripard@kernel.org>
Cc: Thomas Zimmermann <tzimmermann@suse.de>
Cc: David Airlie <airlied@linux.ie>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Chris Wilson <chris.p.wilson@intel.com>

Signed-off-by: Balasubramani Vivekanandan <balasubramani.vivekanandan@intel.com>
---
 drivers/gpu/drm/drm_cache.c | 44 ++++++++++++++++++++++++++++++++-----
 1 file changed, 38 insertions(+), 6 deletions(-)

Comments

Lucas De Marchi March 1, 2022, 7:28 a.m. UTC | #1
On Tue, Feb 22, 2022 at 08:22:00PM +0530, Balasubramani Vivekanandan wrote:
>There is no need for the destination address to be aligned to 16 byte
>boundary to be able to use the non-temporal instructions while copying.
>Non-temporal instructions are used only for loading from the source
>address which has alignment constraints.
>We only need to take care of using the right instructions, based on
>whether destination address is aligned or not, while storing the data to
>the destination address.
>
>__memcpy_ntdqu is copied from i915/i915_memcpy.c
>
>Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
>Cc: Maxime Ripard <mripard@kernel.org>
>Cc: Thomas Zimmermann <tzimmermann@suse.de>
>Cc: David Airlie <airlied@linux.ie>
>Cc: Daniel Vetter <daniel@ffwll.ch>
>Cc: Chris Wilson <chris.p.wilson@intel.com>
>
>Signed-off-by: Balasubramani Vivekanandan <balasubramani.vivekanandan@intel.com>
>---
> drivers/gpu/drm/drm_cache.c | 44 ++++++++++++++++++++++++++++++++-----
> 1 file changed, 38 insertions(+), 6 deletions(-)
>
>diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
>index c3e6e615bf09..a21c1350eb09 100644
>--- a/drivers/gpu/drm/drm_cache.c
>+++ b/drivers/gpu/drm/drm_cache.c
>@@ -278,18 +278,50 @@ static void __memcpy_ntdqa(void *dst, const void *src, unsigned long len)
> 	kernel_fpu_end();
> }
>
>+static void __memcpy_ntdqu(void *dst, const void *src, unsigned long len)
>+{
>+	kernel_fpu_begin();
>+
>+	while (len >= 4) {
>+		asm("movntdqa   (%0), %%xmm0\n"
>+		    "movntdqa 16(%0), %%xmm1\n"
>+		    "movntdqa 32(%0), %%xmm2\n"
>+		    "movntdqa 48(%0), %%xmm3\n"
>+		    "movups %%xmm0,   (%1)\n"
>+		    "movups %%xmm1, 16(%1)\n"
>+		    "movups %%xmm2, 32(%1)\n"
>+		    "movups %%xmm3, 48(%1)\n"
>+		    :: "r" (src), "r" (dst) : "memory");
>+		src += 64;
>+		dst += 64;
>+		len -= 4;
>+	}
>+	while (len--) {
>+		asm("movntdqa (%0), %%xmm0\n"
>+		    "movups %%xmm0, (%1)\n"
>+		    :: "r" (src), "r" (dst) : "memory");
>+		src += 16;
>+		dst += 16;

OK, this second loop takes care of the tail (the remaining 16-byte
chunks once fewer than four are left).

>+	}
>+
>+	kernel_fpu_end();
>+}
>+
> /*
>  * __drm_memcpy_from_wc copies @len bytes from @src to @dst using
>- * non-temporal instructions where available. Note that all arguments
>- * (@src, @dst) must be aligned to 16 bytes and @len must be a multiple
>- * of 16.
>+ * non-temporal instructions where available. Note that @src must be aligned to
>+ * 16 bytes and @len must be a multiple of 16.
>  */
> static void __drm_memcpy_from_wc(void *dst, const void *src, unsigned long len)
> {
>-	if (unlikely(((unsigned long)dst | (unsigned long)src | len) & 15))
>+	if (unlikely(((unsigned long)src | len) & 15)) {
> 		memcpy(dst, src, len);
>-	else if (likely(len))
>-		__memcpy_ntdqa(dst, src, len >> 4);
>+	} else if (likely(len)) {
>+		if (IS_ALIGNED((unsigned long)dst, 16))

We may want to just extend this function to deal with dst not being
aligned, but that can be done on top of this patch.


Reviewed-by: Lucas De Marchi <lucas.demarchi@intel.com>


Lucas De Marchi

>+			__memcpy_ntdqa(dst, src, len >> 4);
>+		else
>+			__memcpy_ntdqu(dst, src, len >> 4);
>+	}
> }
>
> /**
>-- 
>2.25.1
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
index c3e6e615bf09..a21c1350eb09 100644
--- a/drivers/gpu/drm/drm_cache.c
+++ b/drivers/gpu/drm/drm_cache.c
@@ -278,18 +278,50 @@  static void __memcpy_ntdqa(void *dst, const void *src, unsigned long len)
 	kernel_fpu_end();
 }
 
+static void __memcpy_ntdqu(void *dst, const void *src, unsigned long len)
+{
+	kernel_fpu_begin();
+
+	while (len >= 4) {
+		asm("movntdqa   (%0), %%xmm0\n"
+		    "movntdqa 16(%0), %%xmm1\n"
+		    "movntdqa 32(%0), %%xmm2\n"
+		    "movntdqa 48(%0), %%xmm3\n"
+		    "movups %%xmm0,   (%1)\n"
+		    "movups %%xmm1, 16(%1)\n"
+		    "movups %%xmm2, 32(%1)\n"
+		    "movups %%xmm3, 48(%1)\n"
+		    :: "r" (src), "r" (dst) : "memory");
+		src += 64;
+		dst += 64;
+		len -= 4;
+	}
+	while (len--) {
+		asm("movntdqa (%0), %%xmm0\n"
+		    "movups %%xmm0, (%1)\n"
+		    :: "r" (src), "r" (dst) : "memory");
+		src += 16;
+		dst += 16;
+	}
+
+	kernel_fpu_end();
+}
+
 /*
  * __drm_memcpy_from_wc copies @len bytes from @src to @dst using
- * non-temporal instructions where available. Note that all arguments
- * (@src, @dst) must be aligned to 16 bytes and @len must be a multiple
- * of 16.
+ * non-temporal instructions where available. Note that @src must be aligned to
+ * 16 bytes and @len must be a multiple of 16.
  */
 static void __drm_memcpy_from_wc(void *dst, const void *src, unsigned long len)
 {
-	if (unlikely(((unsigned long)dst | (unsigned long)src | len) & 15))
+	if (unlikely(((unsigned long)src | len) & 15)) {
 		memcpy(dst, src, len);
-	else if (likely(len))
-		__memcpy_ntdqa(dst, src, len >> 4);
+	} else if (likely(len)) {
+		if (IS_ALIGNED((unsigned long)dst, 16))
+			__memcpy_ntdqa(dst, src, len >> 4);
+		else
+			__memcpy_ntdqu(dst, src, len >> 4);
+	}
 }
 
 /**