Message ID | 1470983123-22127-19-git-send-email-akash.goel@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 12/08/16 07:25, akash.goel@intel.com wrote: > From: Chris Wilson <chris@chris-wilson.co.uk> > > This patch provides the infrastructure for performing a 16-byte aligned > read from WC memory using non-temporal instructions introduced with sse4.1. > Using movntdqa we can bypass the CPU caches and read directly from memory > while ignoring the page attributes set on the CPU PTE i.e. negating the > impact of an otherwise UC access. Copying using movntdqa from WC is almost > as fast as reading from WB memory, modulo the possibility of both hitting > the CPU cache or leaving the data in the CPU cache for the next consumer. > (The CPU cache itself may be flushed for the region of the movntdqa and on > later access the movntdqa reads from a separate internal buffer for the > cacheline.) The write back to the memory is however cached. > > This will be used in later patches to accelerate accessing WC memory. > > v2: Report whether the accelerated copy is successful/possible. > v3: Function alignment override was only necessary when using the > function target("sse4.1") - which is not necessary for emitting movntdqa > from __asm__. > v4: Improve notes on CPU cache behaviour vs non-temporal stores. > v5: Fix byte offsets for unrolled moves. > v6: Find all remaining typos of movntqda, use kernel_fpu_begin. 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > Cc: Akash Goel <akash.goel@intel.com> > Cc: Damien Lespiau <damien.lespiau@intel.com> > Cc: Mika Kuoppala <mika.kuoppala@intel.com> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > --- > drivers/gpu/drm/i915/Makefile | 3 ++ > drivers/gpu/drm/i915/i915_drv.c | 2 + > drivers/gpu/drm/i915/i915_drv.h | 3 ++ > drivers/gpu/drm/i915/i915_memcpy.c | 101 +++++++++++++++++++++++++++++++++++++ > 4 files changed, 109 insertions(+) > create mode 100644 drivers/gpu/drm/i915/i915_memcpy.c > > diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile > index dda724f..3412413 100644 > --- a/drivers/gpu/drm/i915/Makefile > +++ b/drivers/gpu/drm/i915/Makefile > @@ -3,12 +3,15 @@ > # Direct Rendering Infrastructure (DRI) in XFree86 4.1.0 and higher. > > subdir-ccflags-$(CONFIG_DRM_I915_WERROR) := -Werror > +subdir-ccflags-y += \ > + $(call as-instr,movntdqa (%eax)$(comma)%xmm0,-DCONFIG_AS_MOVNTDQA) > > # Please keep these build lists sorted! 
> > # core driver code > i915-y := i915_drv.o \ > i915_irq.o \ > + i915_memcpy.o \ > i915_params.o \ > i915_pci.o \ > i915_suspend.o \ > diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c > index cb8c943..4bbf0af 100644 > --- a/drivers/gpu/drm/i915/i915_drv.c > +++ b/drivers/gpu/drm/i915/i915_drv.c > @@ -841,6 +841,8 @@ static int i915_driver_init_early(struct drm_i915_private *dev_priv, > mutex_init(&dev_priv->wm.wm_mutex); > mutex_init(&dev_priv->pps_mutex); > > + i915_memcpy_init_early(dev_priv); > + > ret = i915_workqueues_init(dev_priv); > if (ret < 0) > return ret; > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index 6603812..fca09ea 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -3909,4 +3909,7 @@ static inline bool __i915_request_irq_complete(struct drm_i915_gem_request *req) > return false; > } > > +void i915_memcpy_init_early(struct drm_i915_private *dev_priv); > +bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len); > + > #endif > diff --git a/drivers/gpu/drm/i915/i915_memcpy.c b/drivers/gpu/drm/i915/i915_memcpy.c > new file mode 100644 > index 0000000..50fc579 > --- /dev/null > +++ b/drivers/gpu/drm/i915/i915_memcpy.c > @@ -0,0 +1,101 @@ > +/* > + * Copyright © 2016 Intel Corporation > + * > + * Permission is hereby granted, free of charge, to any person obtaining a > + * copy of this software and associated documentation files (the "Software"), > + * to deal in the Software without restriction, including without limitation > + * the rights to use, copy, modify, merge, publish, distribute, sublicense, > + * and/or sell copies of the Software, and to permit persons to whom the > + * Software is furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice (including the next > + * paragraph) shall be included in all copies or substantial portions of the > + * Software. 
> + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING > + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS > + * IN THE SOFTWARE. > + * > + */ > + > +#include <linux/kernel.h> > +#include <asm/fpu/api.h> > + > +#include "i915_drv.h" > + > +DEFINE_STATIC_KEY_FALSE(has_movntdqa); > + > +#ifdef CONFIG_AS_MOVNTDQA > +static void __memcpy_ntdqa(void *dst, const void *src, unsigned long len) > +{ > + kernel_fpu_begin(); > + > + len >>= 4; > + while (len >= 4) { > + asm("movntdqa (%0), %%xmm0\n" > + "movntdqa 16(%0), %%xmm1\n" > + "movntdqa 32(%0), %%xmm2\n" > + "movntdqa 48(%0), %%xmm3\n" > + "movaps %%xmm0, (%1)\n" > + "movaps %%xmm1, 16(%1)\n" > + "movaps %%xmm2, 32(%1)\n" > + "movaps %%xmm3, 48(%1)\n" > + :: "r" (src), "r" (dst) : "memory"); > + src += 64; > + dst += 64; > + len -= 4; > + } > + while (len--) { > + asm("movntdqa (%0), %%xmm0\n" > + "movaps %%xmm0, (%1)\n" > + :: "r" (src), "r" (dst) : "memory"); > + src += 16; > + dst += 16; > + } > + > + kernel_fpu_end(); > +} > +#endif > + > +/** > + * i915_memcpy_from_wc: perform an accelerated *aligned* read from WC > + * @dst: destination pointer > + * @src: source pointer > + * @len: how many bytes to copy > + * > + * i915_memcpy_from_wc copies @len bytes from @src to @dst using > + * non-temporal instructions where available. Note that all arguments > + * (@src, @dst) must be aligned to 16 bytes and @len must be a multiple > + * of 16. 
> + * > + * To test whether accelerated reads from WC are supported, use > + * i915_memcpy_from_wc(NULL, NULL, 0); > + * > + * Returns true if the copy was successful, false if the preconditions > + * are not met. > + */ > +bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len) > +{ > + if (unlikely(((unsigned long)dst | (unsigned long)src | len) & 15)) > + return false; > + > +#ifdef CONFIG_AS_MOVNTDQA > + if (static_branch_likely(&has_movntdqa)) { > + if (len) Potentially could annotate this with another likely. > + __memcpy_ntdqa(dst, src, len); > + return true; > + } > +#endif > + > + return false; > +} > + > +void i915_memcpy_init_early(struct drm_i915_private *dev_priv) > +{ > + if (static_cpu_has(X86_FEATURE_XMM4_1)) > + static_branch_enable(&has_movntdqa); > +} > Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Regards, Tvrtko
On Fri, Aug 12, 2016 at 11:54:04AM +0100, Tvrtko Ursulin wrote: > On 12/08/16 07:25, akash.goel@intel.com wrote: > >From: Chris Wilson <chris@chris-wilson.co.uk> > > > >This patch provides the infrastructure for performing a 16-byte aligned > >read from WC memory using non-temporal instructions introduced with sse4.1. > >Using movntdqa we can bypass the CPU caches and read directly from memory > >while ignoring the page attributes set on the CPU PTE i.e. negating the > >impact of an otherwise UC access. Copying using movntdqa from WC is almost > >as fast as reading from WB memory, modulo the possibility of both hitting > >the CPU cache or leaving the data in the CPU cache for the next consumer. > >(The CPU cache itself may be flushed for the region of the movntdqa and on > >later access the movntdqa reads from a separate internal buffer for the > >cacheline.) The write back to the memory is however cached. > > > >This will be used in later patches to accelerate accessing WC memory. > > > >v2: Report whether the accelerated copy is successful/possible. > >v3: Function alignment override was only necessary when using the > >function target("sse4.1") - which is not necessary for emitting movntdqa > >from __asm__. > >v4: Improve notes on CPU cache behaviour vs non-temporal stores. > >v5: Fix byte offsets for unrolled moves. > >v6: Find all remaining typos of movntqda, use kernel_fpu_begin. > > > >Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > >Cc: Akash Goel <akash.goel@intel.com> > >Cc: Damien Lespiau <damien.lespiau@intel.com> > >Cc: Mika Kuoppala <mika.kuoppala@intel.com> > >Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Picked up the 2 WC prep patches. Thanks for the review, testing and improvements, -Chris
diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile index dda724f..3412413 100644 --- a/drivers/gpu/drm/i915/Makefile +++ b/drivers/gpu/drm/i915/Makefile @@ -3,12 +3,15 @@ # Direct Rendering Infrastructure (DRI) in XFree86 4.1.0 and higher. subdir-ccflags-$(CONFIG_DRM_I915_WERROR) := -Werror +subdir-ccflags-y += \ + $(call as-instr,movntdqa (%eax)$(comma)%xmm0,-DCONFIG_AS_MOVNTDQA) # Please keep these build lists sorted! # core driver code i915-y := i915_drv.o \ i915_irq.o \ + i915_memcpy.o \ i915_params.o \ i915_pci.o \ i915_suspend.o \ diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index cb8c943..4bbf0af 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -841,6 +841,8 @@ static int i915_driver_init_early(struct drm_i915_private *dev_priv, mutex_init(&dev_priv->wm.wm_mutex); mutex_init(&dev_priv->pps_mutex); + i915_memcpy_init_early(dev_priv); + ret = i915_workqueues_init(dev_priv); if (ret < 0) return ret; diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 6603812..fca09ea 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -3909,4 +3909,7 @@ static inline bool __i915_request_irq_complete(struct drm_i915_gem_request *req) return false; } +void i915_memcpy_init_early(struct drm_i915_private *dev_priv); +bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len); + #endif diff --git a/drivers/gpu/drm/i915/i915_memcpy.c b/drivers/gpu/drm/i915/i915_memcpy.c new file mode 100644 index 0000000..50fc579 --- /dev/null +++ b/drivers/gpu/drm/i915/i915_memcpy.c @@ -0,0 +1,101 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, 
distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include <linux/kernel.h> +#include <asm/fpu/api.h> + +#include "i915_drv.h" + +DEFINE_STATIC_KEY_FALSE(has_movntdqa); + +#ifdef CONFIG_AS_MOVNTDQA +static void __memcpy_ntdqa(void *dst, const void *src, unsigned long len) +{ + kernel_fpu_begin(); + + len >>= 4; + while (len >= 4) { + asm("movntdqa (%0), %%xmm0\n" + "movntdqa 16(%0), %%xmm1\n" + "movntdqa 32(%0), %%xmm2\n" + "movntdqa 48(%0), %%xmm3\n" + "movaps %%xmm0, (%1)\n" + "movaps %%xmm1, 16(%1)\n" + "movaps %%xmm2, 32(%1)\n" + "movaps %%xmm3, 48(%1)\n" + :: "r" (src), "r" (dst) : "memory"); + src += 64; + dst += 64; + len -= 4; + } + while (len--) { + asm("movntdqa (%0), %%xmm0\n" + "movaps %%xmm0, (%1)\n" + :: "r" (src), "r" (dst) : "memory"); + src += 16; + dst += 16; + } + + kernel_fpu_end(); +} +#endif + +/** + * i915_memcpy_from_wc: perform an accelerated *aligned* read from WC + * @dst: destination pointer + * @src: source pointer + * @len: how many bytes to copy + * + * i915_memcpy_from_wc copies @len bytes from @src to @dst using + * non-temporal instructions where available. 
Note that all arguments + * (@src, @dst) must be aligned to 16 bytes and @len must be a multiple + * of 16. + * + * To test whether accelerated reads from WC are supported, use + * i915_memcpy_from_wc(NULL, NULL, 0); + * + * Returns true if the copy was successful, false if the preconditions + * are not met. + */ +bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len) +{ + if (unlikely(((unsigned long)dst | (unsigned long)src | len) & 15)) + return false; + +#ifdef CONFIG_AS_MOVNTDQA + if (static_branch_likely(&has_movntdqa)) { + if (len) + __memcpy_ntdqa(dst, src, len); + return true; + } +#endif + + return false; +} + +void i915_memcpy_init_early(struct drm_i915_private *dev_priv) +{ + if (static_cpu_has(X86_FEATURE_XMM4_1)) + static_branch_enable(&has_movntdqa); +}