diff mbox

[v2,3/6] drm/i915: Use WC copies on !llc platforms for the command parser

Message ID 1448016961-25331-4-git-send-email-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson Nov. 20, 2015, 10:55 a.m. UTC
Since we blow the TLB caches by using kmap/kunmap, we may as well go the
whole hog and see if declaring our destination page as WC is faster than
keeping it as WB and using clflush. It should be!

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_cmd_parser.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

Comments

Ville Syrjälä Nov. 20, 2015, 3:05 p.m. UTC | #1
On Fri, Nov 20, 2015 at 10:55:58AM +0000, Chris Wilson wrote:
> Since we blow the TLB caches by using kmap/kunmap, we may as well go the
> whole hog and see if declaring our destination page as WC is faster than
> keeping it as WB and using clflush. It should be!

Is this description for another patch? I can't see any WC stuff in
there.

> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_cmd_parser.c | 19 +++++++++++++++----
>  1 file changed, 15 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
> index c6f6d9f2b2ce..4a3e90b042c5 100644
> --- a/drivers/gpu/drm/i915/i915_cmd_parser.c
> +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
> @@ -992,9 +992,10 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
>  	const struct drm_i915_cmd_descriptor *desc = &default_desc;
>  	u32 last_cmd_header = 0;
>  	unsigned dst_iter, src_iter;
> -	int needs_clflush = 0;
>  	struct get_page rewind;
>  	void *src, *dst, *tmp;
> +	int src_needs_clflush = 0;
> +	bool dst_needs_clflush;
>  	u32 partial, length = 1;
>  	unsigned in, out;
>  	bool oacontrol_set = false; /* OACONTROL tracking. See check_cmd() */
> @@ -1007,13 +1008,19 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
>  	if (WARN_ON(shadow_batch_obj->pages_pin_count == 0))
>  		return -ENODEV;
>  
> -	ret = i915_gem_obj_prepare_shmem_read(batch_obj, &needs_clflush);
> +	ret = i915_gem_obj_prepare_shmem_read(batch_obj, &src_needs_clflush);
>  	if (ret) {
>  		DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n");
>  		return ret;
>  	}
>  
> -	ret = i915_gem_object_set_to_cpu_domain(shadow_batch_obj, true);
> +	dst_needs_clflush =
> +		shadow_batch_obj->base.write_domain != I915_GEM_DOMAIN_CPU &&
> +		!INTEL_INFO(shadow_batch_obj->base.dev)->has_llc;
> +	if (dst_needs_clflush)
> +		ret = i915_gem_object_set_to_gtt_domain(shadow_batch_obj, true);
> +	else
> +		ret = i915_gem_object_set_to_cpu_domain(shadow_batch_obj, true);
>  	if (ret) {
>  		DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n");
>  		goto unpin;
> @@ -1048,7 +1055,7 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
>  			this = PAGE_SIZE - in;
>  
>  		src = kmap_atomic(i915_gem_object_get_page(batch_obj, src_iter));
> -		if (needs_clflush)
> +		if (src_needs_clflush)
>  			drm_clflush_virt_range(src + in, this);
>  
>  		if (this == PAGE_SIZE && partial == 0)
> @@ -1151,6 +1158,8 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
>  				int len;
>  
>  				if (out == PAGE_SIZE) {
> +					if (dst_needs_clflush)
> +						drm_clflush_virt_range(dst, PAGE_SIZE);
>  					kunmap_atomic(dst);
>  					dst = kmap_atomic(i915_gem_object_get_page(shadow_batch_obj, ++dst_iter));
>  					out = 0;
> @@ -1179,6 +1188,8 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
>  		kunmap_atomic(src);
>  		in = 0;
>  	}
> +	if (dst_needs_clflush)
> +		drm_clflush_virt_range(dst, out);
>  unmap:
>  	kunmap_atomic(src);
>  	kunmap_atomic(dst);
> -- 
> 2.6.2
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Chris Wilson Nov. 20, 2015, 3:22 p.m. UTC | #2
On Fri, Nov 20, 2015 at 05:05:05PM +0200, Ville Syrjälä wrote:
> On Fri, Nov 20, 2015 at 10:55:58AM +0000, Chris Wilson wrote:
> > Since we blow the TLB caches by using kmap/kunmap, we may as well go the
> > whole hog and see if declaring our destination page as WC is faster than
> > keeping it as WB and using clflush. It should be!
> 
> Is this description for another patch? I can't see any WC stuff in
> there.

No, just badly written. (Admittedly at one point I did experiment with
remapping the pages as WC and discovered that did stop_machine()...)

drm/i915: Perform inline clflushes from the command parser

On incoherent architectures, we can avoid having to clflush the
destination shadow batch as a separate pass by inlining the call to
clflush whilst we already have the kmap_atomic() around for the page.
-Chris
Ville Syrjälä Dec. 1, 2015, 5:32 p.m. UTC | #3
On Fri, Nov 20, 2015 at 03:22:03PM +0000, Chris Wilson wrote:
> On Fri, Nov 20, 2015 at 05:05:05PM +0200, Ville Syrjälä wrote:
> > On Fri, Nov 20, 2015 at 10:55:58AM +0000, Chris Wilson wrote:
> > > Since we blow the TLB caches by using kmap/kunmap, we may as well go the
> > > whole hog and see if declaring our destination page as WC is faster than
> > > keeping it as WB and using clflush. It should be!
> > 
> > Is this description for another patch? I can't see any WC stuff in
> > there.
> 
> No, just badly written. (Admittedly at one point I did experiment with
> > remapping the pages as WC and discovered that did stop_machine()...)
> 
> drm/i915: Perform inline clflushes from the command parser
> 
> On incoherent architectures, we can avoid having to clflush the
> destination shadow batch as a separate pass by inlining the call to
> clflush whilst we already have the kmap_atomic() around for the page.

With the new description:
Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index c6f6d9f2b2ce..4a3e90b042c5 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -992,9 +992,10 @@  int i915_parse_cmds(struct intel_engine_cs *ring,
 	const struct drm_i915_cmd_descriptor *desc = &default_desc;
 	u32 last_cmd_header = 0;
 	unsigned dst_iter, src_iter;
-	int needs_clflush = 0;
 	struct get_page rewind;
 	void *src, *dst, *tmp;
+	int src_needs_clflush = 0;
+	bool dst_needs_clflush;
 	u32 partial, length = 1;
 	unsigned in, out;
 	bool oacontrol_set = false; /* OACONTROL tracking. See check_cmd() */
@@ -1007,13 +1008,19 @@  int i915_parse_cmds(struct intel_engine_cs *ring,
 	if (WARN_ON(shadow_batch_obj->pages_pin_count == 0))
 		return -ENODEV;
 
-	ret = i915_gem_obj_prepare_shmem_read(batch_obj, &needs_clflush);
+	ret = i915_gem_obj_prepare_shmem_read(batch_obj, &src_needs_clflush);
 	if (ret) {
 		DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n");
 		return ret;
 	}
 
-	ret = i915_gem_object_set_to_cpu_domain(shadow_batch_obj, true);
+	dst_needs_clflush =
+		shadow_batch_obj->base.write_domain != I915_GEM_DOMAIN_CPU &&
+		!INTEL_INFO(shadow_batch_obj->base.dev)->has_llc;
+	if (dst_needs_clflush)
+		ret = i915_gem_object_set_to_gtt_domain(shadow_batch_obj, true);
+	else
+		ret = i915_gem_object_set_to_cpu_domain(shadow_batch_obj, true);
 	if (ret) {
 		DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n");
 		goto unpin;
@@ -1048,7 +1055,7 @@  int i915_parse_cmds(struct intel_engine_cs *ring,
 			this = PAGE_SIZE - in;
 
 		src = kmap_atomic(i915_gem_object_get_page(batch_obj, src_iter));
-		if (needs_clflush)
+		if (src_needs_clflush)
 			drm_clflush_virt_range(src + in, this);
 
 		if (this == PAGE_SIZE && partial == 0)
@@ -1151,6 +1158,8 @@  int i915_parse_cmds(struct intel_engine_cs *ring,
 				int len;
 
 				if (out == PAGE_SIZE) {
+					if (dst_needs_clflush)
+						drm_clflush_virt_range(dst, PAGE_SIZE);
 					kunmap_atomic(dst);
 					dst = kmap_atomic(i915_gem_object_get_page(shadow_batch_obj, ++dst_iter));
 					out = 0;
@@ -1179,6 +1188,8 @@  int i915_parse_cmds(struct intel_engine_cs *ring,
 		kunmap_atomic(src);
 		in = 0;
 	}
+	if (dst_needs_clflush)
+		drm_clflush_virt_range(dst, out);
 unmap:
 	kunmap_atomic(src);
 	kunmap_atomic(dst);