[02/11] drm/i915: Avoid forcing relocations through the mappable GTT or CPU

Message ID: 1357642399-7678-3-git-send-email-chris@chris-wilson.co.uk (mailing list archive)
State: New, archived

Commit Message

Chris Wilson Jan. 8, 2013, 10:53 a.m. UTC
If the object lies outside of the mappable GTT aperture, do not force it
through the CPU domain for relocations, but simply flush the writes as
we perform them and then queue a chipset flush.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |   87 ++++++++++++++++------------
 1 file changed, 51 insertions(+), 36 deletions(-)

Comments

Imre Deak Jan. 14, 2013, 7:21 p.m. UTC | #1
On Tue, 2013-01-08 at 10:53 +0000, Chris Wilson wrote:
> If the object lies outside of the mappable GTT aperture, do not force it
> through the CPU domain for relocations, but simply flush the writes as
> we perform them and then queue a chipset flush.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c |   87 ++++++++++++++++------------
>  1 file changed, 51 insertions(+), 36 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 163bb52..daa5409 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -33,6 +33,9 @@
>  #include "intel_drv.h"
>  #include <linux/dma_remapping.h>
>  
> +#define __EXEC_OBJECT_HAS_PIN (1<<31)
> +#define __EXEC_OBJECT_HAS_FENCE (1<<30)
> +
>  struct eb_objects {
>  	int and;
>  	struct hlist_head buckets[0];
> @@ -95,10 +98,16 @@ eb_destroy(struct eb_objects *eb)
>  static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
>  {
>  	return (obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
> -		!obj->map_and_fenceable ||
>  		obj->cache_level != I915_CACHE_NONE);
>  }
>  
> +static inline struct page *
> +gtt_offset_to_page(struct drm_i915_gem_object *obj, u32 offset)
> +{
> +	offset -= obj->gtt_space->start;
> +	return i915_gem_object_get_page(obj, offset >> PAGE_SHIFT);
> +}
> +
>  static int
>  i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
>  				   struct eb_objects *eb,
> @@ -182,22 +191,20 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
>  		return -EFAULT;
>  
>  	reloc->delta += target_offset;
> +	reloc->offset += obj->gtt_offset;
>  	if (use_cpu_reloc(obj)) {
> -		uint32_t page_offset = reloc->offset & ~PAGE_MASK;
>  		char *vaddr;
>  
> -		ret = i915_gem_object_set_to_cpu_domain(obj, 1);
> +		ret = i915_gem_object_set_to_cpu_domain(obj, true);
>  		if (ret)
>  			return ret;
>  
> -		vaddr = kmap_atomic(i915_gem_object_get_page(obj,
> -							     reloc->offset >> PAGE_SHIFT));
> -		*(uint32_t *)(vaddr + page_offset) = reloc->delta;
> +		vaddr = kmap_atomic(gtt_offset_to_page(obj, reloc->offset));
> +		*(uint32_t *)(vaddr + offset_in_page(reloc->offset)) = reloc->delta;
>  		kunmap_atomic(vaddr);
>  	} else {
>  		struct drm_i915_private *dev_priv = dev->dev_private;
> -		uint32_t __iomem *reloc_entry;
> -		void __iomem *reloc_page;
> +		unsigned page_offset;
>  
>  		ret = i915_gem_object_set_to_gtt_domain(obj, true);
>  		if (ret)
> @@ -208,13 +215,28 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
>  			return ret;
>  
>  		/* Map the page containing the relocation we're going to perform.  */
> -		reloc->offset += obj->gtt_offset;
> -		reloc_page = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping,
> -						      reloc->offset & PAGE_MASK);
> -		reloc_entry = (uint32_t __iomem *)
> -			(reloc_page + (reloc->offset & ~PAGE_MASK));
> -		iowrite32(reloc->delta, reloc_entry);
> -		io_mapping_unmap_atomic(reloc_page);
> +		page_offset = offset_in_page(reloc->offset);
> +
> +		if (reloc->offset < dev_priv->mm.gtt_mappable_end) {
> +			void __iomem *reloc_page;
> +
> +			reloc_page = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping,
> +							      reloc->offset & PAGE_MASK);
> +			iowrite32(reloc->delta, reloc_page + page_offset);
> +			io_mapping_unmap_atomic(reloc_page);
> +		} else {
> +			char *vaddr;
> +
> +			vaddr = kmap_atomic(gtt_offset_to_page(obj, reloc->offset));
> +
> +			drm_clflush_virt_range(vaddr + page_offset, 4);
> +			*(uint32_t *)(vaddr + page_offset) = reloc->delta;
> +			drm_clflush_virt_range(vaddr + page_offset, 4);

Discussed this already to some degree with Chris, but I still think the
first cache flush is redundant.

> +
> +			kunmap_atomic(vaddr);
> +
> +			obj->base.pending_write_domain |= I915_GEM_DOMAIN_CPU;
> +		}
>  	}
>  
>  	/* and update the user's relocation entry */
> @@ -312,16 +334,6 @@ i915_gem_execbuffer_relocate(struct drm_device *dev,
>  	return ret;
>  }
>  
> -#define  __EXEC_OBJECT_HAS_PIN (1<<31)
> -#define  __EXEC_OBJECT_HAS_FENCE (1<<30)
> -
> -static int
> -need_reloc_mappable(struct drm_i915_gem_object *obj)
> -{
> -	struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
> -	return entry->relocation_count && !use_cpu_reloc(obj);
> -}
> -
>  static int
>  i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
>  				   struct intel_ring_buffer *ring)
> @@ -329,16 +341,15 @@ i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
>  	struct drm_i915_private *dev_priv = obj->base.dev->dev_private;
>  	struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
>  	bool has_fenced_gpu_access = INTEL_INFO(ring->dev)->gen < 4;
> -	bool need_fence, need_mappable;
> +	bool need_fence;
>  	int ret;
>  
>  	need_fence =
>  		has_fenced_gpu_access &&
>  		entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
>  		obj->tiling_mode != I915_TILING_NONE;
> -	need_mappable = need_fence || need_reloc_mappable(obj);
>  
> -	ret = i915_gem_object_pin(obj, entry->alignment, need_mappable, false);
> +	ret = i915_gem_object_pin(obj, entry->alignment, need_fence, false);
>  	if (ret)
>  		return ret;
>  
> @@ -401,7 +412,7 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
>  	INIT_LIST_HEAD(&ordered_objects);
>  	while (!list_empty(objects)) {
>  		struct drm_i915_gem_exec_object2 *entry;
> -		bool need_fence, need_mappable;
> +		bool need_fence;
>  
>  		obj = list_first_entry(objects,
>  				       struct drm_i915_gem_object,
> @@ -412,9 +423,8 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
>  			has_fenced_gpu_access &&
>  			entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
>  			obj->tiling_mode != I915_TILING_NONE;
> -		need_mappable = need_fence || need_reloc_mappable(obj);
>  
> -		if (need_mappable)
> +		if (need_fence)
>  			list_move(&obj->exec_list, &ordered_objects);
>  		else
>  			list_move_tail(&obj->exec_list, &ordered_objects);
> @@ -444,7 +454,7 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
>  		/* Unbind any ill-fitting objects or pin. */
>  		list_for_each_entry(obj, objects, exec_list) {
>  			struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
> -			bool need_fence, need_mappable;
> +			bool need_fence;
>  
>  			if (!obj->gtt_space)
>  				continue;
> @@ -453,10 +463,9 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
>  				has_fenced_gpu_access &&
>  				entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
>  				obj->tiling_mode != I915_TILING_NONE;
> -			need_mappable = need_fence || need_reloc_mappable(obj);
>  
>  			if ((entry->alignment && obj->gtt_offset & (entry->alignment - 1)) ||
> -			    (need_mappable && !obj->map_and_fenceable))
> +			    (need_fence && !obj->map_and_fenceable))
>  				ret = i915_gem_object_unbind(obj);
>  			else
>  				ret = i915_gem_execbuffer_reserve_object(obj, ring);
> @@ -603,10 +612,16 @@ i915_gem_execbuffer_move_to_gpu(struct intel_ring_buffer *ring,
>  		if (ret)
>  			return ret;
>  
> +		flush_domains |= obj->base.write_domain;
> +
>  		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
>  			i915_gem_clflush_object(obj);
>  
> -		flush_domains |= obj->base.write_domain;

Looks like an unnecessary move.

> +		/* Used as an internal marker during relocation processing */
> +		if (obj->base.pending_write_domain & ~I915_GEM_GPU_DOMAINS) {
> +			flush_domains |= obj->base.pending_write_domain & ~I915_GEM_GPU_DOMAINS;
> +			obj->base.pending_write_domain &= I915_GEM_GPU_DOMAINS;
> +		}
>  	}
>  
>  	if (flush_domains & I915_GEM_DOMAIN_CPU)
> @@ -923,7 +938,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>  	}
>  
>  	/* Set the pending read domains for the batch buffer to COMMAND */
> -	if (batch_obj->base.pending_write_domain) {
> +	if (batch_obj->base.pending_write_domain & I915_GEM_GPU_DOMAINS) {
>  		DRM_DEBUG("Attempting to use self-modifying batch buffer\n");
>  		ret = -EINVAL;
>  		goto err;
Daniel Vetter Jan. 14, 2013, 8:04 p.m. UTC | #2
On Mon, Jan 14, 2013 at 8:21 PM, Imre Deak <imre.deak@intel.com> wrote:
>> +                     drm_clflush_virt_range(vaddr + page_offset, 4);
>> +                     *(uint32_t *)(vaddr + page_offset) = reloc->delta;
>> +                     drm_clflush_virt_range(vaddr + page_offset, 4);
>
> Discussed this already to some degree with Chris, but I still think the
> first cache flush is redundant.

Nope, since it's a partial cacheline write, we need to first flush out
any stale data, then write the dword (which loads the latest data from
memory into a cacheline). Then we need to flush the updated cacheline
out into main memory again. Iirc I've mentioned this somewhere in the
part 4 of my gem intro.
-Daniel
Chris Wilson Jan. 14, 2013, 8:50 p.m. UTC | #3
On Mon, 14 Jan 2013 21:04:06 +0100, Daniel Vetter <daniel@ffwll.ch> wrote:
> On Mon, Jan 14, 2013 at 8:21 PM, Imre Deak <imre.deak@intel.com> wrote:
> >> +                     drm_clflush_virt_range(vaddr + page_offset, 4);
> >> +                     *(uint32_t *)(vaddr + page_offset) = reloc->delta;
> >> +                     drm_clflush_virt_range(vaddr + page_offset, 4);
> >
> > Discussed this already to some degree with Chris, but I still think the
> > first cache flush is redundant.
> 
> Nope, since it's a partial cacheline write, we need to first flush out
> any stale data, then write the dword (which loads the latest data from
> memory into a cacheline). Then we need to flush the updated cacheline
> out into main memory again. Iirc I've mentioned this somewhere in the
> part 4 of my gem intro.

The question was whether there could be any stale data in that aliased
cacheline, and whether or not the code was a continuation of a cargo
cult. My answer was that it was our working knowledge that that flush
is crucial to unconfuse the CPU. In the absence of a definitive
reference for the mysteries of clflush, we should write one.
-Chris
Daniel Vetter Jan. 14, 2013, 8:53 p.m. UTC | #4
On Mon, Jan 14, 2013 at 9:50 PM, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> On Mon, 14 Jan 2013 21:04:06 +0100, Daniel Vetter <daniel@ffwll.ch> wrote:
>> On Mon, Jan 14, 2013 at 8:21 PM, Imre Deak <imre.deak@intel.com> wrote:
>> >> +                     drm_clflush_virt_range(vaddr + page_offset, 4);
>> >> +                     *(uint32_t *)(vaddr + page_offset) = reloc->delta;
>> >> +                     drm_clflush_virt_range(vaddr + page_offset, 4);
>> >
>> > Discussed this already to some degree with Chris, but I still think the
>> > first cache flush is redundant.
>>
>> Nope, since it's a partial cacheline write, we need to first flush out
>> any stale data, then write the dword (which loads the latest data from
>> memory into a cacheline). Then we need to flush the updated cacheline
>> out into main memory again. Iirc I've mentioned this somewhere in the
>> part 4 of my gem intro.
>
> The question was whether there could be any stale data in that aliased
> cacheline, and whether or not the code was a continuation of a cargo
> cult. My answer was that it was our working knowledge that that flush
> is crucial to unconfuse the CPU. In the absence of a definitive
> reference for the mysteries of clflush, we should write one.

Iirc for pwrite, the clflush _before_ a partial cacheline write is
crucial for correctness. At least I remember writing tests for it,
which successfully blow up if you drop that flush ;-)
-Daniel
Imre Deak Jan. 14, 2013, 9:06 p.m. UTC | #5
On Mon, 2013-01-14 at 21:04 +0100, Daniel Vetter wrote:
> On Mon, Jan 14, 2013 at 8:21 PM, Imre Deak <imre.deak@intel.com> wrote:
> >> +                     drm_clflush_virt_range(vaddr + page_offset, 4);
> >> +                     *(uint32_t *)(vaddr + page_offset) = reloc->delta;
> >> +                     drm_clflush_virt_range(vaddr + page_offset, 4);
> >
> > Discussed this already to some degree with Chris, but I still think the
> > first cache flush is redundant.
> 
> Nope, since it's a partial cacheline write, we need to first flush out
> any stale data, then write the dword (which loads the latest data from
> memory into a cacheline). Then we need to flush the updated cacheline
> out into main memory again. 

Well, the first flush would first write any valid data in the cache line
to memory and only then invalidate it. But this would have the same
result as just doing away with the first flush: the original cache line
updated by the dword.

And if the cache line was invalid the first flush is a no-op and the
write would reload the cache line from memory as you pointed out.

--Imre
Daniel Vetter Jan. 14, 2013, 9:51 p.m. UTC | #6
On Mon, Jan 14, 2013 at 10:06 PM, Imre Deak <imre.deak@intel.com> wrote:
> Well, the first flush would first write any valid data in the cache line
> to memory and only then invalidate it. But this would have the same
> result as just doing away with the first flush: the original cache line
> updated by the dword.
>
> And if the cache line was invalid the first flush is a no-op and the
> write would reload the cache line from memory as you pointed out.

The thing is that coherency doesn't work that way if the other side
doesn't send out snoop notices:
1. cpu reads the cacheline into its cache. Note that prefetch is good
enough for this.
2. gpu writes new values to the same location in memory, which updates
the main memory, but doesn't change anything in the cpu cache state.
3. cpu writes that dword, and updates its cacheline.
4. clflush writes that cacheline out to main memory.

Note that the gpu write in 2 is now overwritten.
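
Concretely, this is the unmappable-GTT write from the patch again, annotated
with the steps above (a sketch, not the exact hunk):

	vaddr = kmap_atomic(gtt_offset_to_page(obj, reloc->offset));

	/* Drop the (possibly stale) cacheline first, so the write below
	 * starts from whatever is currently in main memory -- including
	 * the gpu write from step 2 -- rather than from the stale copy
	 * pulled in at step 1. */
	drm_clflush_virt_range(vaddr + page_offset, 4);

	/* Step 3: write the relocation value, dirtying the cacheline. */
	*(uint32_t *)(vaddr + page_offset) = reloc->delta;

	/* Step 4: push the updated cacheline back out to main memory so
	 * the (non-snooping) gpu sees the new dword. */
	drm_clflush_virt_range(vaddr + page_offset, 4);

	kunmap_atomic(vaddr);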

I admit that it's really hard to come up with a real-world scenario
involving relocations (since usually it's the cpu which has last
written the batch, with the gpu only reading it in between). But for
pwrite it's mandatory for correctness, so I don't want to take any
chances.
-Daniel
Imre Deak Jan. 14, 2013, 11:02 p.m. UTC | #7
On Mon, 2013-01-14 at 22:51 +0100, Daniel Vetter wrote:
> On Mon, Jan 14, 2013 at 10:06 PM, Imre Deak <imre.deak@intel.com> wrote:
> > Well, the first flush would first write any valid data in the cache line
> > to memory and only then invalidate it. But this would have the same
> > result as just doing away with the first flush: the original cache line
> > updated by the dword.
> >
> > And if the cache line was invalid the first flush is a no-op and the
> > write would reload the cache line from memory as you pointed out.
> 
> The thing is that coherency doesn't work that way if the other side
> doesn't send out snoop notices:
> 1. cpu reads the cacheline into its cache. Note that prefetch is good
> enough for this.
> 2. gpu writes new values to the same location in memory, which updates
> the main memory, but doesn't change anything in the cpu cache state.
> 3. cpu writes that dword, and updates its cacheline.
> 4. clflush writes that cacheline out to main memory.
> 
> Note that the gpu write in 2 is now overwritten.
> 
> I admit that it's really hard to come up with a real-world scenario
> involving relocations (since usually it's the cpu which has last
> written the batch, with the gpu only reading it in between). But for
> pwrite it's mandatory for correctness, so I don't want to take any
> chances.

Ok, I didn't think about GPU side writes for these buffers, but I guess
it's possible. Thanks for the explanation.

--Imre
Daniel Vetter Jan. 16, 2013, 4:07 p.m. UTC | #8
On Tue, Jan 08, 2013 at 10:53:10AM +0000, Chris Wilson wrote:
> If the object lies outside of the mappable GTT aperture, do not force it
> through the CPU domain for relocations, but simply flush the writes as
> we perform them and then queue a chipset flush.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

A few bikesheds and comments on this one here. First the general ones:
- How much does this hurt when we're unlucky and a batch ends up in
  unmappable? Since we do unconditional clflushes and don't batch them up,
  I expect copywin to go down the toilet a bit ...
- Do we need mitigation ala pwrite, where we try to move objects into
  mappable if it's possible without stalls?
- Performance data is missing ;-)

> ---
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c |   87 ++++++++++++++++------------
>  1 file changed, 51 insertions(+), 36 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 163bb52..daa5409 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -33,6 +33,9 @@
>  #include "intel_drv.h"
>  #include <linux/dma_remapping.h>
>  
> +#define __EXEC_OBJECT_HAS_PIN (1<<31)
> +#define __EXEC_OBJECT_HAS_FENCE (1<<30)
> +
>  struct eb_objects {
>  	int and;
>  	struct hlist_head buckets[0];
> @@ -95,10 +98,16 @@ eb_destroy(struct eb_objects *eb)
>  static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
>  {
>  	return (obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
> -		!obj->map_and_fenceable ||
>  		obj->cache_level != I915_CACHE_NONE);
>  }
>  
> +static inline struct page *
> +gtt_offset_to_page(struct drm_i915_gem_object *obj, u32 offset)
> +{
> +	offset -= obj->gtt_space->start;
> +	return i915_gem_object_get_page(obj, offset >> PAGE_SHIFT);
> +}
> +
>  static int
>  i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
>  				   struct eb_objects *eb,
> @@ -182,22 +191,20 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
>  		return -EFAULT;
>  
>  	reloc->delta += target_offset;
> +	reloc->offset += obj->gtt_offset;

Imo this adjustment here leads to messier code - we only really want the
gtt offset for the gtt relocs. With this we then have to subtract the gtt
offset again in gtt_offset_to_page, which will be even more fun if we ever
get real ppgtt. I think a gtt_reloc_offset temp var instead and leaving
reloc->offset as-is would look better ...
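
For illustration only, something along these lines (gtt_reloc_offset is a
hypothetical name, not code from the patch):

	/* Leave reloc->offset as the object-relative offset from userspace;
	 * only the GTT write path needs the global GTT address. */
	u32 gtt_reloc_offset = obj->gtt_offset + reloc->offset;

	if (use_cpu_reloc(obj)) {
		vaddr = kmap_atomic(i915_gem_object_get_page(obj,
							     reloc->offset >> PAGE_SHIFT));
		*(uint32_t *)(vaddr + offset_in_page(reloc->offset)) = reloc->delta;
		kunmap_atomic(vaddr);
	} else if (gtt_reloc_offset < dev_priv->mm.gtt_mappable_end) {
		reloc_page = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping,
						      gtt_reloc_offset & PAGE_MASK);
		iowrite32(reloc->delta, reloc_page + offset_in_page(gtt_reloc_offset));
		io_mapping_unmap_atomic(reloc_page);
	}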

>  	if (use_cpu_reloc(obj)) {
> -		uint32_t page_offset = reloc->offset & ~PAGE_MASK;
>  		char *vaddr;
>  
> -		ret = i915_gem_object_set_to_cpu_domain(obj, 1);
> +		ret = i915_gem_object_set_to_cpu_domain(obj, true);
>  		if (ret)
>  			return ret;
>  
> -		vaddr = kmap_atomic(i915_gem_object_get_page(obj,
> -							     reloc->offset >> PAGE_SHIFT));
> -		*(uint32_t *)(vaddr + page_offset) = reloc->delta;
> +		vaddr = kmap_atomic(gtt_offset_to_page(obj, reloc->offset));
> +		*(uint32_t *)(vaddr + offset_in_page(reloc->offset)) = reloc->delta;
>  		kunmap_atomic(vaddr);
>  	} else {
>  		struct drm_i915_private *dev_priv = dev->dev_private;
> -		uint32_t __iomem *reloc_entry;
> -		void __iomem *reloc_page;
> +		unsigned page_offset;
>  
>  		ret = i915_gem_object_set_to_gtt_domain(obj, true);
>  		if (ret)
> @@ -208,13 +215,28 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
>  			return ret;
>  
>  		/* Map the page containing the relocation we're going to perform.  */
> -		reloc->offset += obj->gtt_offset;
> -		reloc_page = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping,
> -						      reloc->offset & PAGE_MASK);
> -		reloc_entry = (uint32_t __iomem *)
> -			(reloc_page + (reloc->offset & ~PAGE_MASK));
> -		iowrite32(reloc->delta, reloc_entry);
> -		io_mapping_unmap_atomic(reloc_page);
> +		page_offset = offset_in_page(reloc->offset);
> +
> +		if (reloc->offset < dev_priv->mm.gtt_mappable_end) {
> +			void __iomem *reloc_page;
> +
> +			reloc_page = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping,
> +							      reloc->offset & PAGE_MASK);
> +			iowrite32(reloc->delta, reloc_page + page_offset);
> +			io_mapping_unmap_atomic(reloc_page);

WARN_ON(!obj->has_global_gtt_mapping); for paranoia, see the reasoning for
it below ...

> +		} else {
> +			char *vaddr;
> +
> +			vaddr = kmap_atomic(gtt_offset_to_page(obj, reloc->offset));
> +
> +			drm_clflush_virt_range(vaddr + page_offset, 4);
> +			*(uint32_t *)(vaddr + page_offset) = reloc->delta;
> +			drm_clflush_virt_range(vaddr + page_offset, 4);
> +
> +			kunmap_atomic(vaddr);
> +
> +			obj->base.pending_write_domain |= I915_GEM_DOMAIN_CPU;
> +		}
>  	}
>  
>  	/* and update the user's relocation entry */
> @@ -312,16 +334,6 @@ i915_gem_execbuffer_relocate(struct drm_device *dev,
>  	return ret;
>  }
>  
> -#define  __EXEC_OBJECT_HAS_PIN (1<<31)
> -#define  __EXEC_OBJECT_HAS_FENCE (1<<30)
> -
> -static int
> -need_reloc_mappable(struct drm_i915_gem_object *obj)
> -{
> -	struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
> -	return entry->relocation_count && !use_cpu_reloc(obj);

We take the && !use_cpu_reloc to make sure that when we need to do gtt
relocs, the global gtt mapping is in place. Luckily we don't have any
platforms currently where this matters, but if e.g. vlv enables ppgtt, we
have one. So we need to handle this somehow (or risk the wrath of Jesse
when he tries to figure out why ppgtt is broken exactly on vlv).

> -}
> -
>  static int
>  i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
>  				   struct intel_ring_buffer *ring)
> @@ -329,16 +341,15 @@ i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
>  	struct drm_i915_private *dev_priv = obj->base.dev->dev_private;
>  	struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
>  	bool has_fenced_gpu_access = INTEL_INFO(ring->dev)->gen < 4;
> -	bool need_fence, need_mappable;
> +	bool need_fence;
>  	int ret;
>  
>  	need_fence =
>  		has_fenced_gpu_access &&
>  		entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
>  		obj->tiling_mode != I915_TILING_NONE;
> -	need_mappable = need_fence || need_reloc_mappable(obj);
>  
> -	ret = i915_gem_object_pin(obj, entry->alignment, need_mappable, false);
> +	ret = i915_gem_object_pin(obj, entry->alignment, need_fence, false);
>  	if (ret)
>  		return ret;
>  
> @@ -401,7 +412,7 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
>  	INIT_LIST_HEAD(&ordered_objects);
>  	while (!list_empty(objects)) {
>  		struct drm_i915_gem_exec_object2 *entry;
> -		bool need_fence, need_mappable;
> +		bool need_fence;
>  
>  		obj = list_first_entry(objects,
>  				       struct drm_i915_gem_object,
> @@ -412,9 +423,8 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
>  			has_fenced_gpu_access &&
>  			entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
>  			obj->tiling_mode != I915_TILING_NONE;
> -		need_mappable = need_fence || need_reloc_mappable(obj);
>  
> -		if (need_mappable)
> +		if (need_fence)
>  			list_move(&obj->exec_list, &ordered_objects);
>  		else
>  			list_move_tail(&obj->exec_list, &ordered_objects);
> @@ -444,7 +454,7 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
>  		/* Unbind any ill-fitting objects or pin. */
>  		list_for_each_entry(obj, objects, exec_list) {
>  			struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
> -			bool need_fence, need_mappable;
> +			bool need_fence;
>  
>  			if (!obj->gtt_space)
>  				continue;
> @@ -453,10 +463,9 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
>  				has_fenced_gpu_access &&
>  				entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
>  				obj->tiling_mode != I915_TILING_NONE;
> -			need_mappable = need_fence || need_reloc_mappable(obj);
>  
>  			if ((entry->alignment && obj->gtt_offset & (entry->alignment - 1)) ||
> -			    (need_mappable && !obj->map_and_fenceable))
> +			    (need_fence && !obj->map_and_fenceable))
>  				ret = i915_gem_object_unbind(obj);
>  			else
>  				ret = i915_gem_execbuffer_reserve_object(obj, ring);
> @@ -603,10 +612,16 @@ i915_gem_execbuffer_move_to_gpu(struct intel_ring_buffer *ring,
>  		if (ret)
>  			return ret;
>  
> +		flush_domains |= obj->base.write_domain;
> +
>  		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
>  			i915_gem_clflush_object(obj);
>  
> -		flush_domains |= obj->base.write_domain;

Can we just drop this?

> +		/* Used as an internal marker during relocation processing */
> +		if (obj->base.pending_write_domain & ~I915_GEM_GPU_DOMAINS) {
> +			flush_domains |= obj->base.pending_write_domain & ~I915_GEM_GPU_DOMAINS;
> +			obj->base.pending_write_domain &= I915_GEM_GPU_DOMAINS;
> +		}

We /should/ filter out !GPU_DOMAINS already with return -EINVAL, so why
the changes here? Or can we just drop them?

>  	}
>  
>  	if (flush_domains & I915_GEM_DOMAIN_CPU)
> @@ -923,7 +938,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>  	}
>  
>  	/* Set the pending read domains for the batch buffer to COMMAND */
> -	if (batch_obj->base.pending_write_domain) {
> +	if (batch_obj->base.pending_write_domain & I915_GEM_GPU_DOMAINS) {

ditto.

Cheers, Daniel

>  		DRM_DEBUG("Attempting to use self-modifying batch buffer\n");
>  		ret = -EINVAL;
>  		goto err;
> -- 
> 1.7.10.4
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Patch

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 163bb52..daa5409 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -33,6 +33,9 @@ 
 #include "intel_drv.h"
 #include <linux/dma_remapping.h>
 
+#define __EXEC_OBJECT_HAS_PIN (1<<31)
+#define __EXEC_OBJECT_HAS_FENCE (1<<30)
+
 struct eb_objects {
 	int and;
 	struct hlist_head buckets[0];
@@ -95,10 +98,16 @@  eb_destroy(struct eb_objects *eb)
 static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
 {
 	return (obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
-		!obj->map_and_fenceable ||
 		obj->cache_level != I915_CACHE_NONE);
 }
 
+static inline struct page *
+gtt_offset_to_page(struct drm_i915_gem_object *obj, u32 offset)
+{
+	offset -= obj->gtt_space->start;
+	return i915_gem_object_get_page(obj, offset >> PAGE_SHIFT);
+}
+
 static int
 i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 				   struct eb_objects *eb,
@@ -182,22 +191,20 @@  i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 		return -EFAULT;
 
 	reloc->delta += target_offset;
+	reloc->offset += obj->gtt_offset;
 	if (use_cpu_reloc(obj)) {
-		uint32_t page_offset = reloc->offset & ~PAGE_MASK;
 		char *vaddr;
 
-		ret = i915_gem_object_set_to_cpu_domain(obj, 1);
+		ret = i915_gem_object_set_to_cpu_domain(obj, true);
 		if (ret)
 			return ret;
 
-		vaddr = kmap_atomic(i915_gem_object_get_page(obj,
-							     reloc->offset >> PAGE_SHIFT));
-		*(uint32_t *)(vaddr + page_offset) = reloc->delta;
+		vaddr = kmap_atomic(gtt_offset_to_page(obj, reloc->offset));
+		*(uint32_t *)(vaddr + offset_in_page(reloc->offset)) = reloc->delta;
 		kunmap_atomic(vaddr);
 	} else {
 		struct drm_i915_private *dev_priv = dev->dev_private;
-		uint32_t __iomem *reloc_entry;
-		void __iomem *reloc_page;
+		unsigned page_offset;
 
 		ret = i915_gem_object_set_to_gtt_domain(obj, true);
 		if (ret)
@@ -208,13 +215,28 @@  i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 			return ret;
 
 		/* Map the page containing the relocation we're going to perform.  */
-		reloc->offset += obj->gtt_offset;
-		reloc_page = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping,
-						      reloc->offset & PAGE_MASK);
-		reloc_entry = (uint32_t __iomem *)
-			(reloc_page + (reloc->offset & ~PAGE_MASK));
-		iowrite32(reloc->delta, reloc_entry);
-		io_mapping_unmap_atomic(reloc_page);
+		page_offset = offset_in_page(reloc->offset);
+
+		if (reloc->offset < dev_priv->mm.gtt_mappable_end) {
+			void __iomem *reloc_page;
+
+			reloc_page = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping,
+							      reloc->offset & PAGE_MASK);
+			iowrite32(reloc->delta, reloc_page + page_offset);
+			io_mapping_unmap_atomic(reloc_page);
+		} else {
+			char *vaddr;
+
+			vaddr = kmap_atomic(gtt_offset_to_page(obj, reloc->offset));
+
+			drm_clflush_virt_range(vaddr + page_offset, 4);
+			*(uint32_t *)(vaddr + page_offset) = reloc->delta;
+			drm_clflush_virt_range(vaddr + page_offset, 4);
+
+			kunmap_atomic(vaddr);
+
+			obj->base.pending_write_domain |= I915_GEM_DOMAIN_CPU;
+		}
 	}
 
 	/* and update the user's relocation entry */
@@ -312,16 +334,6 @@  i915_gem_execbuffer_relocate(struct drm_device *dev,
 	return ret;
 }
 
-#define  __EXEC_OBJECT_HAS_PIN (1<<31)
-#define  __EXEC_OBJECT_HAS_FENCE (1<<30)
-
-static int
-need_reloc_mappable(struct drm_i915_gem_object *obj)
-{
-	struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
-	return entry->relocation_count && !use_cpu_reloc(obj);
-}
-
 static int
 i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
 				   struct intel_ring_buffer *ring)
@@ -329,16 +341,15 @@  i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
 	struct drm_i915_private *dev_priv = obj->base.dev->dev_private;
 	struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
 	bool has_fenced_gpu_access = INTEL_INFO(ring->dev)->gen < 4;
-	bool need_fence, need_mappable;
+	bool need_fence;
 	int ret;
 
 	need_fence =
 		has_fenced_gpu_access &&
 		entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
 		obj->tiling_mode != I915_TILING_NONE;
-	need_mappable = need_fence || need_reloc_mappable(obj);
 
-	ret = i915_gem_object_pin(obj, entry->alignment, need_mappable, false);
+	ret = i915_gem_object_pin(obj, entry->alignment, need_fence, false);
 	if (ret)
 		return ret;
 
@@ -401,7 +412,7 @@  i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
 	INIT_LIST_HEAD(&ordered_objects);
 	while (!list_empty(objects)) {
 		struct drm_i915_gem_exec_object2 *entry;
-		bool need_fence, need_mappable;
+		bool need_fence;
 
 		obj = list_first_entry(objects,
 				       struct drm_i915_gem_object,
@@ -412,9 +423,8 @@  i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
 			has_fenced_gpu_access &&
 			entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
 			obj->tiling_mode != I915_TILING_NONE;
-		need_mappable = need_fence || need_reloc_mappable(obj);
 
-		if (need_mappable)
+		if (need_fence)
 			list_move(&obj->exec_list, &ordered_objects);
 		else
 			list_move_tail(&obj->exec_list, &ordered_objects);
@@ -444,7 +454,7 @@  i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
 		/* Unbind any ill-fitting objects or pin. */
 		list_for_each_entry(obj, objects, exec_list) {
 			struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
-			bool need_fence, need_mappable;
+			bool need_fence;
 
 			if (!obj->gtt_space)
 				continue;
@@ -453,10 +463,9 @@  i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
 				has_fenced_gpu_access &&
 				entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
 				obj->tiling_mode != I915_TILING_NONE;
-			need_mappable = need_fence || need_reloc_mappable(obj);
 
 			if ((entry->alignment && obj->gtt_offset & (entry->alignment - 1)) ||
-			    (need_mappable && !obj->map_and_fenceable))
+			    (need_fence && !obj->map_and_fenceable))
 				ret = i915_gem_object_unbind(obj);
 			else
 				ret = i915_gem_execbuffer_reserve_object(obj, ring);
@@ -603,10 +612,16 @@  i915_gem_execbuffer_move_to_gpu(struct intel_ring_buffer *ring,
 		if (ret)
 			return ret;
 
+		flush_domains |= obj->base.write_domain;
+
 		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
 			i915_gem_clflush_object(obj);
 
-		flush_domains |= obj->base.write_domain;
+		/* Used as an internal marker during relocation processing */
+		if (obj->base.pending_write_domain & ~I915_GEM_GPU_DOMAINS) {
+			flush_domains |= obj->base.pending_write_domain & ~I915_GEM_GPU_DOMAINS;
+			obj->base.pending_write_domain &= I915_GEM_GPU_DOMAINS;
+		}
 	}
 
 	if (flush_domains & I915_GEM_DOMAIN_CPU)
@@ -923,7 +938,7 @@  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	}
 
 	/* Set the pending read domains for the batch buffer to COMMAND */
-	if (batch_obj->base.pending_write_domain) {
+	if (batch_obj->base.pending_write_domain & I915_GEM_GPU_DOMAINS) {
 		DRM_DEBUG("Attempting to use self-modifying batch buffer\n");
 		ret = -EINVAL;
 		goto err;