Message ID | 1448016961-25331-4-git-send-email-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Fri, Nov 20, 2015 at 10:55:58AM +0000, Chris Wilson wrote: > Since we blow the TLB caches by using kmap/kunmap, we may as well go the > whole hog and see if declaring our destination page as WC is faster than > keeping it as WB and using clflush. It should be! Is this description for another patch? I can't see any WC stuff in there. > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > --- > drivers/gpu/drm/i915/i915_cmd_parser.c | 19 +++++++++++++++---- > 1 file changed, 15 insertions(+), 4 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c > index c6f6d9f2b2ce..4a3e90b042c5 100644 > --- a/drivers/gpu/drm/i915/i915_cmd_parser.c > +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c > @@ -992,9 +992,10 @@ int i915_parse_cmds(struct intel_engine_cs *ring, > const struct drm_i915_cmd_descriptor *desc = &default_desc; > u32 last_cmd_header = 0; > unsigned dst_iter, src_iter; > - int needs_clflush = 0; > struct get_page rewind; > void *src, *dst, *tmp; > + int src_needs_clflush = 0; > + bool dst_needs_clflush; > u32 partial, length = 1; > unsigned in, out; > bool oacontrol_set = false; /* OACONTROL tracking. 
See check_cmd() */ > @@ -1007,13 +1008,19 @@ int i915_parse_cmds(struct intel_engine_cs *ring, > if (WARN_ON(shadow_batch_obj->pages_pin_count == 0)) > return -ENODEV; > > - ret = i915_gem_obj_prepare_shmem_read(batch_obj, &needs_clflush); > + ret = i915_gem_obj_prepare_shmem_read(batch_obj, &src_needs_clflush); > if (ret) { > DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n"); > return ret; > } > > - ret = i915_gem_object_set_to_cpu_domain(shadow_batch_obj, true); > + dst_needs_clflush = > + shadow_batch_obj->base.write_domain != I915_GEM_DOMAIN_CPU && > + !INTEL_INFO(shadow_batch_obj->base.dev)->has_llc; > + if (dst_needs_clflush) > + ret = i915_gem_object_set_to_gtt_domain(shadow_batch_obj, true); > + else > + ret = i915_gem_object_set_to_cpu_domain(shadow_batch_obj, true); > if (ret) { > DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n"); > goto unpin; > @@ -1048,7 +1055,7 @@ int i915_parse_cmds(struct intel_engine_cs *ring, > this = PAGE_SIZE - in; > > src = kmap_atomic(i915_gem_object_get_page(batch_obj, src_iter)); > - if (needs_clflush) > + if (src_needs_clflush) > drm_clflush_virt_range(src + in, this); > > if (this == PAGE_SIZE && partial == 0) > @@ -1151,6 +1158,8 @@ int i915_parse_cmds(struct intel_engine_cs *ring, > int len; > > if (out == PAGE_SIZE) { > + if (dst_needs_clflush) > + drm_clflush_virt_range(dst, PAGE_SIZE); > kunmap_atomic(dst); > dst = kmap_atomic(i915_gem_object_get_page(shadow_batch_obj, ++dst_iter)); > out = 0; > @@ -1179,6 +1188,8 @@ int i915_parse_cmds(struct intel_engine_cs *ring, > kunmap_atomic(src); > in = 0; > } > + if (dst_needs_clflush) > + drm_clflush_virt_range(dst, out); > unmap: > kunmap_atomic(src); > kunmap_atomic(dst); > -- > 2.6.2 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
On Fri, Nov 20, 2015 at 05:05:05PM +0200, Ville Syrjälä wrote: > On Fri, Nov 20, 2015 at 10:55:58AM +0000, Chris Wilson wrote: > > Since we blow the TLB caches by using kmap/kunmap, we may as well go the > > whole hog and see if declaring our destination page as WC is faster than > > keeping it as WB and using clflush. It should be! > > Is this description for another patch? I can't see any WC stuff in > there. No, just badly written. (Admittedly at one point I did experiment with remapping the pages as WC and discovered that did stop_machine()...) drm/i915: Perform inline clflushes from the command parser On incoherent architectures, we can avoid having to clflush the destination shadow batch as a separate pass by inlining the call to clflush whilst we already have the kmap_atomic() around for the page. -Chris
On Fri, Nov 20, 2015 at 03:22:03PM +0000, Chris Wilson wrote: > On Fri, Nov 20, 2015 at 05:05:05PM +0200, Ville Syrjälä wrote: > > On Fri, Nov 20, 2015 at 10:55:58AM +0000, Chris Wilson wrote: > > > Since we blow the TLB caches by using kmap/kunmap, we may as well go the > > > whole hog and see if declaring our destination page as WC is faster than > > > keeping it as WB and using clflush. It should be! > > > > Is this description for another patch? I can't see any WC stuff in > > there. > > No, just badly written. (Admittedly at one point I did experiment with > remapping the pages as WC and discovered that did stop_machine()...) > > drm/i915: Perform inline clflushes from the command parser > > On incoherent architectures, we can avoid having to clflush the > destination shadow batch as a separate pass by inlining the call to > clflush whilst we already have the kmap_atomic() around for the page. With the new description: Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index c6f6d9f2b2ce..4a3e90b042c5 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -992,9 +992,10 @@ int i915_parse_cmds(struct intel_engine_cs *ring, const struct drm_i915_cmd_descriptor *desc = &default_desc; u32 last_cmd_header = 0; unsigned dst_iter, src_iter; - int needs_clflush = 0; struct get_page rewind; void *src, *dst, *tmp; + int src_needs_clflush = 0; + bool dst_needs_clflush; u32 partial, length = 1; unsigned in, out; bool oacontrol_set = false; /* OACONTROL tracking. See check_cmd() */ @@ -1007,13 +1008,19 @@ int i915_parse_cmds(struct intel_engine_cs *ring, if (WARN_ON(shadow_batch_obj->pages_pin_count == 0)) return -ENODEV; - ret = i915_gem_obj_prepare_shmem_read(batch_obj, &needs_clflush); + ret = i915_gem_obj_prepare_shmem_read(batch_obj, &src_needs_clflush); if (ret) { DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n"); return ret; } - ret = i915_gem_object_set_to_cpu_domain(shadow_batch_obj, true); + dst_needs_clflush = + shadow_batch_obj->base.write_domain != I915_GEM_DOMAIN_CPU && + !INTEL_INFO(shadow_batch_obj->base.dev)->has_llc; + if (dst_needs_clflush) + ret = i915_gem_object_set_to_gtt_domain(shadow_batch_obj, true); + else + ret = i915_gem_object_set_to_cpu_domain(shadow_batch_obj, true); if (ret) { DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n"); goto unpin; @@ -1048,7 +1055,7 @@ int i915_parse_cmds(struct intel_engine_cs *ring, this = PAGE_SIZE - in; src = kmap_atomic(i915_gem_object_get_page(batch_obj, src_iter)); - if (needs_clflush) + if (src_needs_clflush) drm_clflush_virt_range(src + in, this); if (this == PAGE_SIZE && partial == 0) @@ -1151,6 +1158,8 @@ int i915_parse_cmds(struct intel_engine_cs *ring, int len; if (out == PAGE_SIZE) { + if (dst_needs_clflush) + drm_clflush_virt_range(dst, PAGE_SIZE); kunmap_atomic(dst); dst = 
kmap_atomic(i915_gem_object_get_page(shadow_batch_obj, ++dst_iter)); out = 0; @@ -1179,6 +1188,8 @@ int i915_parse_cmds(struct intel_engine_cs *ring, kunmap_atomic(src); in = 0; } + if (dst_needs_clflush) + drm_clflush_virt_range(dst, out); unmap: kunmap_atomic(src); kunmap_atomic(dst);
Since we blow the TLB caches by using kmap/kunmap, we may as well go the whole hog and see if declaring our destination page as WC is faster than keeping it as WB and using clflush. It should be! Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> --- drivers/gpu/drm/i915/i915_cmd_parser.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-)