Message ID | 1357642399-7678-3-git-send-email-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
On Tue, 2013-01-08 at 10:53 +0000, Chris Wilson wrote:
> If the object lies outside of the mappable GTT aperture, do not force it
> through the CPU domain for relocations, but simply flush the writes as
> we perform them and then queue a chipset flush.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c | 87 ++++++++++++++++------------
>  1 file changed, 51 insertions(+), 36 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 163bb52..daa5409 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -33,6 +33,9 @@
>  #include "intel_drv.h"
>  #include <linux/dma_remapping.h>
>  
> +#define __EXEC_OBJECT_HAS_PIN (1<<31)
> +#define __EXEC_OBJECT_HAS_FENCE (1<<30)
> +
>  struct eb_objects {
>  	int and;
>  	struct hlist_head buckets[0];
> @@ -95,10 +98,16 @@ eb_destroy(struct eb_objects *eb)
>  static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
>  {
>  	return (obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
> -		!obj->map_and_fenceable ||
>  		obj->cache_level != I915_CACHE_NONE);
>  }
>  
> +static inline struct page *
> +gtt_offset_to_page(struct drm_i915_gem_object *obj, u32 offset)
> +{
> +	offset -= obj->gtt_space->start;
> +	return i915_gem_object_get_page(obj, offset >> PAGE_SHIFT);
> +}
> +
>  static int
>  i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
>  				   struct eb_objects *eb,
> @@ -182,22 +191,20 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
>  		return -EFAULT;
>  
>  	reloc->delta += target_offset;
> +	reloc->offset += obj->gtt_offset;
>  	if (use_cpu_reloc(obj)) {
> -		uint32_t page_offset = reloc->offset & ~PAGE_MASK;
>  		char *vaddr;
>  
> -		ret = i915_gem_object_set_to_cpu_domain(obj, 1);
> +		ret = i915_gem_object_set_to_cpu_domain(obj, true);
>  		if (ret)
>  			return ret;
>  
> -		vaddr = kmap_atomic(i915_gem_object_get_page(obj,
> -							     reloc->offset >> PAGE_SHIFT));
> -		*(uint32_t *)(vaddr + page_offset) = reloc->delta;
> +		vaddr = kmap_atomic(gtt_offset_to_page(obj, reloc->offset));
> +		*(uint32_t *)(vaddr + offset_in_page(reloc->offset)) = reloc->delta;
>  		kunmap_atomic(vaddr);
>  	} else {
>  		struct drm_i915_private *dev_priv = dev->dev_private;
> -		uint32_t __iomem *reloc_entry;
> -		void __iomem *reloc_page;
> +		unsigned page_offset;
>  
>  		ret = i915_gem_object_set_to_gtt_domain(obj, true);
>  		if (ret)
> @@ -208,13 +215,28 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
>  			return ret;
>  
>  		/* Map the page containing the relocation we're going to perform. */
> -		reloc->offset += obj->gtt_offset;
> -		reloc_page = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping,
> -						      reloc->offset & PAGE_MASK);
> -		reloc_entry = (uint32_t __iomem *)
> -			(reloc_page + (reloc->offset & ~PAGE_MASK));
> -		iowrite32(reloc->delta, reloc_entry);
> -		io_mapping_unmap_atomic(reloc_page);
> +		page_offset = offset_in_page(reloc->offset);
> +
> +		if (reloc->offset < dev_priv->mm.gtt_mappable_end) {
> +			void __iomem *reloc_page;
> +
> +			reloc_page = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping,
> +							      reloc->offset & PAGE_MASK);
> +			iowrite32(reloc->delta, reloc_page + page_offset);
> +			io_mapping_unmap_atomic(reloc_page);
> +		} else {
> +			char *vaddr;
> +
> +			vaddr = kmap_atomic(gtt_offset_to_page(obj, reloc->offset));
> +
> +			drm_clflush_virt_range(vaddr + page_offset, 4);
> +			*(uint32_t *)(vaddr + page_offset) = reloc->delta;
> +			drm_clflush_virt_range(vaddr + page_offset, 4);

Discussed this already to some degree with Chris, but I still think the
first cache flush is redundant.

> +
> +			kunmap_atomic(vaddr);
> +
> +			obj->base.pending_write_domain |= I915_GEM_DOMAIN_CPU;
> +		}
>  	}
>  
>  	/* and update the user's relocation entry */
> @@ -312,16 +334,6 @@ i915_gem_execbuffer_relocate(struct drm_device *dev,
>  	return ret;
>  }
>  
> -#define __EXEC_OBJECT_HAS_PIN (1<<31)
> -#define __EXEC_OBJECT_HAS_FENCE (1<<30)
> -
> -static int
> -need_reloc_mappable(struct drm_i915_gem_object *obj)
> -{
> -	struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
> -	return entry->relocation_count && !use_cpu_reloc(obj);
> -}
> -
>  static int
>  i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
>  				   struct intel_ring_buffer *ring)
> @@ -329,16 +341,15 @@ i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
>  	struct drm_i915_private *dev_priv = obj->base.dev->dev_private;
>  	struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
>  	bool has_fenced_gpu_access = INTEL_INFO(ring->dev)->gen < 4;
> -	bool need_fence, need_mappable;
> +	bool need_fence;
>  	int ret;
>  
>  	need_fence =
>  		has_fenced_gpu_access &&
>  		entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
>  		obj->tiling_mode != I915_TILING_NONE;
> -	need_mappable = need_fence || need_reloc_mappable(obj);
>  
> -	ret = i915_gem_object_pin(obj, entry->alignment, need_mappable, false);
> +	ret = i915_gem_object_pin(obj, entry->alignment, need_fence, false);
>  	if (ret)
>  		return ret;
>  
> @@ -401,7 +412,7 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
>  	INIT_LIST_HEAD(&ordered_objects);
>  	while (!list_empty(objects)) {
>  		struct drm_i915_gem_exec_object2 *entry;
> -		bool need_fence, need_mappable;
> +		bool need_fence;
>  
>  		obj = list_first_entry(objects,
>  				       struct drm_i915_gem_object,
> @@ -412,9 +423,8 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
>  			has_fenced_gpu_access &&
>  			entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
>  			obj->tiling_mode != I915_TILING_NONE;
> -		need_mappable = need_fence || need_reloc_mappable(obj);
>  
> -		if (need_mappable)
> +		if (need_fence)
>  			list_move(&obj->exec_list, &ordered_objects);
>  		else
>  			list_move_tail(&obj->exec_list, &ordered_objects);
> @@ -444,7 +454,7 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
>  	/* Unbind any ill-fitting objects or pin. */
>  	list_for_each_entry(obj, objects, exec_list) {
>  		struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
> -		bool need_fence, need_mappable;
> +		bool need_fence;
>  
>  		if (!obj->gtt_space)
>  			continue;
> 
> @@ -453,10 +463,9 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
>  			has_fenced_gpu_access &&
>  			entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
>  			obj->tiling_mode != I915_TILING_NONE;
> -		need_mappable = need_fence || need_reloc_mappable(obj);
>  
>  		if ((entry->alignment && obj->gtt_offset & (entry->alignment - 1)) ||
> -		    (need_mappable && !obj->map_and_fenceable))
> +		    (need_fence && !obj->map_and_fenceable))
>  			ret = i915_gem_object_unbind(obj);
>  		else
>  			ret = i915_gem_execbuffer_reserve_object(obj, ring);
> @@ -603,10 +612,16 @@ i915_gem_execbuffer_move_to_gpu(struct intel_ring_buffer *ring,
>  		if (ret)
>  			return ret;
>  
> +		flush_domains |= obj->base.write_domain;
> +
>  		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
>  			i915_gem_clflush_object(obj);
>  
> -		flush_domains |= obj->base.write_domain;

Looks like an unnecessary move.

> +		/* Used as an internal marker during relocation processing */
> +		if (obj->base.pending_write_domain & ~I915_GEM_GPU_DOMAINS) {
> +			flush_domains |= obj->base.pending_write_domain & ~I915_GEM_GPU_DOMAINS;
> +			obj->base.pending_write_domain &= I915_GEM_GPU_DOMAINS;
> +		}
>  	}
>  
>  	if (flush_domains & I915_GEM_DOMAIN_CPU)
> @@ -923,7 +938,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>  	}
>  
>  	/* Set the pending read domains for the batch buffer to COMMAND */
> -	if (batch_obj->base.pending_write_domain) {
> +	if (batch_obj->base.pending_write_domain & I915_GEM_GPU_DOMAINS) {
>  		DRM_DEBUG("Attempting to use self-modifying batch buffer\n");
>  		ret = -EINVAL;
>  		goto err;
On Mon, Jan 14, 2013 at 8:21 PM, Imre Deak <imre.deak@intel.com> wrote:
>> +			drm_clflush_virt_range(vaddr + page_offset, 4);
>> +			*(uint32_t *)(vaddr + page_offset) = reloc->delta;
>> +			drm_clflush_virt_range(vaddr + page_offset, 4);
>
> Discussed this already to some degree with Chris, but I still think the
> first cache flush is redundant.

Nope, since it's a partial cacheline write, we need to first flush out
any stale data, then write the dword (which loads the latest data from
memory into a cacheline). Then we need to flush the updated cacheline
out into main memory again. Iirc I've mentioned this somewhere in the
part 4 of my gem intro.
-Daniel
On Mon, 14 Jan 2013 21:04:06 +0100, Daniel Vetter <daniel@ffwll.ch> wrote:
> On Mon, Jan 14, 2013 at 8:21 PM, Imre Deak <imre.deak@intel.com> wrote:
> >> +			drm_clflush_virt_range(vaddr + page_offset, 4);
> >> +			*(uint32_t *)(vaddr + page_offset) = reloc->delta;
> >> +			drm_clflush_virt_range(vaddr + page_offset, 4);
> >
> > Discussed this already to some degree with Chris, but I still think the
> > first cache flush is redundant.
>
> Nope, since it's a partial cacheline write, we need to first flush out
> any stale data, then write the dword (which loads the latest data from
> memory into a cacheline). Then we need to flush the updated cacheline
> out into main memory again. Iirc I've mentioned this somewhere in the
> part 4 of my gem intro.

The question was whether there could be any stale data in that aliased
cacheline, and whether or not the code was a continuation of a cargo
cult. My answer was that it was our working knowledge that that flush
is crucial to unconfuse the CPU. In the absence of a definitive
reference for the mysteries of clflush, we should write one.
-Chris
On Mon, Jan 14, 2013 at 9:50 PM, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> On Mon, 14 Jan 2013 21:04:06 +0100, Daniel Vetter <daniel@ffwll.ch> wrote:
>> On Mon, Jan 14, 2013 at 8:21 PM, Imre Deak <imre.deak@intel.com> wrote:
>> >> +			drm_clflush_virt_range(vaddr + page_offset, 4);
>> >> +			*(uint32_t *)(vaddr + page_offset) = reloc->delta;
>> >> +			drm_clflush_virt_range(vaddr + page_offset, 4);
>> >
>> > Discussed this already to some degree with Chris, but I still think the
>> > first cache flush is redundant.
>>
>> Nope, since it's a partial cacheline write, we need to first flush out
>> any stale data, then write the dword (which loads the latest data from
>> memory into a cacheline). Then we need to flush the updated cacheline
>> out into main memory again. Iirc I've mentioned this somewhere in the
>> part 4 of my gem intro.
>
> The question was whether there could be any stale data in that aliased
> cacheline, and whether or not the code was a continuation of a cargo
> cult. My answer was that it was our working knowledge that that flush
> is crucial to unconfuse the CPU. In the absence of a definitive
> reference for the mysteries of clflush, we should write one.

Iirc for pwrite, the clflush _before_ a partial cacheline write is
crucial for correctness. At least I remember writing tests for it,
which successfully blow up if you drop that flush ;-)
-Daniel
On Mon, 2013-01-14 at 21:04 +0100, Daniel Vetter wrote:
> On Mon, Jan 14, 2013 at 8:21 PM, Imre Deak <imre.deak@intel.com> wrote:
> >> +			drm_clflush_virt_range(vaddr + page_offset, 4);
> >> +			*(uint32_t *)(vaddr + page_offset) = reloc->delta;
> >> +			drm_clflush_virt_range(vaddr + page_offset, 4);
> >
> > Discussed this already to some degree with Chris, but I still think the
> > first cache flush is redundant.
>
> Nope, since it's a partial cacheline write, we need to first flush out
> any stale data, then write the dword (which loads the latest data from
> memory into a cacheline). Then we need to flush the updated cacheline
> out into main memory again.

Well, the first flush would first write any valid data in the cache line
to memory and only then invalidate it. But this would have the same
result as just doing away with the first flush: the original cache line
updated by the dword.

And if the cache line was invalid the first flush is a no-op and the
write would reload the cache line from memory as you pointed out.

--Imre
On Mon, Jan 14, 2013 at 10:06 PM, Imre Deak <imre.deak@intel.com> wrote:
> Well, the first flush would first write any valid data in the cache line
> to memory and only then invalidate it. But this would have the same
> result as just doing away with the first flush: the original cache line
> updated by the dword.
>
> And if the cache line was invalid the first flush is a no-op and the
> write would reload the cache line from memory as you pointed out.

The thing is that coherency doesn't work that way if the other side
doesn't send out snoop notices:
1. cpu reads the cacheline into its cache. Note that prefetch is good
   enough for this.
2. gpu writes new values to the same location in memory, which updates
   the main memory, but doesn't change anything in the cpu cache state.
3. cpu writes that dword, and updates its cacheline.
4. clflush writes that cacheline out to main memory.

Note that the gpu write in 2 is now overwritten.

I admit that it's really hard to come up with a real-world scenario
involving relocations (since usually it's the cpu which has last
written the batch, with the gpu only reading it in between). But for
pwrite it's mandatory for correctness, so I don't want to take any
chances.
-Daniel
On Mon, 2013-01-14 at 22:51 +0100, Daniel Vetter wrote:
> On Mon, Jan 14, 2013 at 10:06 PM, Imre Deak <imre.deak@intel.com> wrote:
> > Well, the first flush would first write any valid data in the cache line
> > to memory and only then invalidate it. But this would have the same
> > result as just doing away with the first flush: the original cache line
> > updated by the dword.
> >
> > And if the cache line was invalid the first flush is a no-op and the
> > write would reload the cache line from memory as you pointed out.
>
> The thing is that coherency doesn't work that way if the other side
> doesn't send out snoop notices:
> 1. cpu reads the cacheline into its cache. Note that prefetch is good
>    enough for this.
> 2. gpu writes new values to the same location in memory, which updates
>    the main memory, but doesn't change anything in the cpu cache state.
> 3. cpu writes that dword, and updates its cacheline.
> 4. clflush writes that cacheline out to main memory.
>
> Note that the gpu write in 2 is now overwritten.
>
> I admit that it's really hard to come up with a real-world scenario
> involving relocations (since usually it's the cpu which has last
> written the batch, with the gpu only reading it in between). But for
> pwrite it's mandatory for correctness, so I don't want to take any
> chances.

Ok, I didn't think about GPU-side writes for these buffers, but I guess
it's possible. Thanks for the explanation.

--Imre
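For reference, the ordering discussed in the exchange above maps onto the
relocation write roughly as follows. This is only an annotated sketch of the
flush-write-flush pattern from the patch, assuming kernel context and the
same helpers the patch already uses; the function name and argument list are
invented for illustration.

/*
 * Sketch only: the unmappable-GTT relocation write from the patch,
 * annotated with the coherency steps discussed in the thread above.
 */
static void reloc_write_unsnooped(struct drm_i915_gem_object *obj,
				  struct page *page,
				  unsigned page_offset, u32 delta)
{
	char *vaddr = kmap_atomic(page);

	/* Steps 1-2: the CPU may have (pre)fetched this cacheline before the
	 * GPU last wrote the backing memory, so the cached copy can be stale;
	 * flush it out before touching the line. */
	drm_clflush_virt_range(vaddr + page_offset, 4);

	/* Step 3: the partial write pulls a fresh line from memory and
	 * dirties it in the CPU cache. */
	*(uint32_t *)(vaddr + page_offset) = delta;

	/* Step 4: write the updated line back so the GPU sees the new value. */
	drm_clflush_virt_range(vaddr + page_offset, 4);

	kunmap_atomic(vaddr);

	/* Mark the object so a chipset flush is queued before execution. */
	obj->base.pending_write_domain |= I915_GEM_DOMAIN_CPU;
}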
On Tue, Jan 08, 2013 at 10:53:10AM +0000, Chris Wilson wrote:
> If the object lies outside of the mappable GTT aperture, do not force it
> through the CPU domain for relocations, but simply flush the writes as
> we perform them and then queue a chipset flush.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

A few bikesheds and comments on this one here. First the general ones:

- How much does this hurt when we're unlucky and a batch ends up in
  unmappable? Since we do unconditional clflushes and don't batch them up,
  I expect copywin to go down the toilet a bit ...
- Do we need mitigation a la pwrite, where we try to move objects into
  mappable if it's possible without stalls?
- Performance data is missing ;-)

> ---
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c | 87 ++++++++++++++++------------
>  1 file changed, 51 insertions(+), 36 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 163bb52..daa5409 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -33,6 +33,9 @@
>  #include "intel_drv.h"
>  #include <linux/dma_remapping.h>
>  
> +#define __EXEC_OBJECT_HAS_PIN (1<<31)
> +#define __EXEC_OBJECT_HAS_FENCE (1<<30)
> +
>  struct eb_objects {
>  	int and;
>  	struct hlist_head buckets[0];
> @@ -95,10 +98,16 @@ eb_destroy(struct eb_objects *eb)
>  static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
>  {
>  	return (obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
> -		!obj->map_and_fenceable ||
>  		obj->cache_level != I915_CACHE_NONE);
>  }
>  
> +static inline struct page *
> +gtt_offset_to_page(struct drm_i915_gem_object *obj, u32 offset)
> +{
> +	offset -= obj->gtt_space->start;
> +	return i915_gem_object_get_page(obj, offset >> PAGE_SHIFT);
> +}
> +
>  static int
>  i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
>  				   struct eb_objects *eb,
> @@ -182,22 +191,20 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
>  		return -EFAULT;
>  
>  	reloc->delta += target_offset;
> +	reloc->offset += obj->gtt_offset;

Imo this adjustment here leads to messier code - we only really want the
gtt offset for the gtt relocs. With this we then have to subtract the gtt
offset again in gtt_offset_to_page, which will be even more fun if we ever
get real ppgtt. I think a gtt_reloc_offset temp var instead and leaving
reloc->offset as-is would look better ...

>  	if (use_cpu_reloc(obj)) {
> -		uint32_t page_offset = reloc->offset & ~PAGE_MASK;
>  		char *vaddr;
>  
> -		ret = i915_gem_object_set_to_cpu_domain(obj, 1);
> +		ret = i915_gem_object_set_to_cpu_domain(obj, true);
>  		if (ret)
>  			return ret;
>  
> -		vaddr = kmap_atomic(i915_gem_object_get_page(obj,
> -							     reloc->offset >> PAGE_SHIFT));
> -		*(uint32_t *)(vaddr + page_offset) = reloc->delta;
> +		vaddr = kmap_atomic(gtt_offset_to_page(obj, reloc->offset));
> +		*(uint32_t *)(vaddr + offset_in_page(reloc->offset)) = reloc->delta;
>  		kunmap_atomic(vaddr);
>  	} else {
>  		struct drm_i915_private *dev_priv = dev->dev_private;
> -		uint32_t __iomem *reloc_entry;
> -		void __iomem *reloc_page;
> +		unsigned page_offset;
>  
>  		ret = i915_gem_object_set_to_gtt_domain(obj, true);
>  		if (ret)
> @@ -208,13 +215,28 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
>  			return ret;
>  
>  		/* Map the page containing the relocation we're going to perform. */
> -		reloc->offset += obj->gtt_offset;
> -		reloc_page = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping,
> -						      reloc->offset & PAGE_MASK);
> -		reloc_entry = (uint32_t __iomem *)
> -			(reloc_page + (reloc->offset & ~PAGE_MASK));
> -		iowrite32(reloc->delta, reloc_entry);
> -		io_mapping_unmap_atomic(reloc_page);
> +		page_offset = offset_in_page(reloc->offset);
> +
> +		if (reloc->offset < dev_priv->mm.gtt_mappable_end) {
> +			void __iomem *reloc_page;
> +
> +			reloc_page = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping,
> +							      reloc->offset & PAGE_MASK);
> +			iowrite32(reloc->delta, reloc_page + page_offset);
> +			io_mapping_unmap_atomic(reloc_page);

WARN_ON(!obj->has_global_gtt_mapping); for paranoia, see the reasoning
for it below ...

> +		} else {
> +			char *vaddr;
> +
> +			vaddr = kmap_atomic(gtt_offset_to_page(obj, reloc->offset));
> +
> +			drm_clflush_virt_range(vaddr + page_offset, 4);
> +			*(uint32_t *)(vaddr + page_offset) = reloc->delta;
> +			drm_clflush_virt_range(vaddr + page_offset, 4);
> +
> +			kunmap_atomic(vaddr);
> +
> +			obj->base.pending_write_domain |= I915_GEM_DOMAIN_CPU;
> +		}
>  	}
>  
>  	/* and update the user's relocation entry */
> @@ -312,16 +334,6 @@ i915_gem_execbuffer_relocate(struct drm_device *dev,
>  	return ret;
>  }
>  
> -#define __EXEC_OBJECT_HAS_PIN (1<<31)
> -#define __EXEC_OBJECT_HAS_FENCE (1<<30)
> -
> -static int
> -need_reloc_mappable(struct drm_i915_gem_object *obj)
> -{
> -	struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
> -	return entry->relocation_count && !use_cpu_reloc(obj);

We take the && !use_cpu_reloc to make sure that when we need to do gtt
relocs, the global gtt mapping is in place. Luckily we don't have any
platforms currently where this matters, but if e.g. vlv enables ppgtt, we
have one. So we need to handle this somehow (or risk the wrath of Jesse
when he tries to figure out why ppgtt is broken exactly on vlv).

> -}
> -
>  static int
>  i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
>  				   struct intel_ring_buffer *ring)
> @@ -329,16 +341,15 @@ i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
>  	struct drm_i915_private *dev_priv = obj->base.dev->dev_private;
>  	struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
>  	bool has_fenced_gpu_access = INTEL_INFO(ring->dev)->gen < 4;
> -	bool need_fence, need_mappable;
> +	bool need_fence;
>  	int ret;
>  
>  	need_fence =
>  		has_fenced_gpu_access &&
>  		entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
>  		obj->tiling_mode != I915_TILING_NONE;
> -	need_mappable = need_fence || need_reloc_mappable(obj);
>  
> -	ret = i915_gem_object_pin(obj, entry->alignment, need_mappable, false);
> +	ret = i915_gem_object_pin(obj, entry->alignment, need_fence, false);
>  	if (ret)
>  		return ret;
>  
> @@ -401,7 +412,7 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
>  	INIT_LIST_HEAD(&ordered_objects);
>  	while (!list_empty(objects)) {
>  		struct drm_i915_gem_exec_object2 *entry;
> -		bool need_fence, need_mappable;
> +		bool need_fence;
>  
>  		obj = list_first_entry(objects,
>  				       struct drm_i915_gem_object,
> @@ -412,9 +423,8 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
>  			has_fenced_gpu_access &&
>  			entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
>  			obj->tiling_mode != I915_TILING_NONE;
> -		need_mappable = need_fence || need_reloc_mappable(obj);
>  
> -		if (need_mappable)
> +		if (need_fence)
>  			list_move(&obj->exec_list, &ordered_objects);
>  		else
>  			list_move_tail(&obj->exec_list, &ordered_objects);
> @@ -444,7 +454,7 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
>  	/* Unbind any ill-fitting objects or pin. */
>  	list_for_each_entry(obj, objects, exec_list) {
>  		struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
> -		bool need_fence, need_mappable;
> +		bool need_fence;
>  
>  		if (!obj->gtt_space)
>  			continue;
> 
> @@ -453,10 +463,9 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
>  			has_fenced_gpu_access &&
>  			entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
>  			obj->tiling_mode != I915_TILING_NONE;
> -		need_mappable = need_fence || need_reloc_mappable(obj);
>  
>  		if ((entry->alignment && obj->gtt_offset & (entry->alignment - 1)) ||
> -		    (need_mappable && !obj->map_and_fenceable))
> +		    (need_fence && !obj->map_and_fenceable))
>  			ret = i915_gem_object_unbind(obj);
>  		else
>  			ret = i915_gem_execbuffer_reserve_object(obj, ring);
> @@ -603,10 +612,16 @@ i915_gem_execbuffer_move_to_gpu(struct intel_ring_buffer *ring,
>  		if (ret)
>  			return ret;
>  
> +		flush_domains |= obj->base.write_domain;
> +
>  		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
>  			i915_gem_clflush_object(obj);
>  
> -		flush_domains |= obj->base.write_domain;

Can we just drop this?

> +		/* Used as an internal marker during relocation processing */
> +		if (obj->base.pending_write_domain & ~I915_GEM_GPU_DOMAINS) {
> +			flush_domains |= obj->base.pending_write_domain & ~I915_GEM_GPU_DOMAINS;
> +			obj->base.pending_write_domain &= I915_GEM_GPU_DOMAINS;
> +		}

We /should/ filter out !GPU_DOMAINS already with return -EINVAL, so why
the changes here? Or can we just drop them?

>  	}
>  
>  	if (flush_domains & I915_GEM_DOMAIN_CPU)
> @@ -923,7 +938,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>  	}
>  
>  	/* Set the pending read domains for the batch buffer to COMMAND */
> -	if (batch_obj->base.pending_write_domain) {
> +	if (batch_obj->base.pending_write_domain & I915_GEM_GPU_DOMAINS) {

Ditto.

Cheers, Daniel

>  		DRM_DEBUG("Attempting to use self-modifying batch buffer\n");
>  		ret = -EINVAL;
>  		goto err;
> --
> 1.7.10.4
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
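For illustration, the gtt_reloc_offset idea raised in the review above could
look roughly like the sketch below. This is hypothetical code, not part of
the posted patch: reloc->offset stays object-relative, only the GTT write
path derives the aperture address, and the page lookup no longer needs to
subtract obj->gtt_space->start. The function name is invented for the
example.

/*
 * Hypothetical sketch of the gtt_reloc_offset suggestion; not the code in
 * the posted patch.
 */
static void sketch_reloc_write(struct drm_i915_private *dev_priv,
			       struct drm_i915_gem_object *obj,
			       struct drm_i915_gem_relocation_entry *reloc)
{
	/* Objects are page-aligned in the GTT, so the in-page offset is the
	 * same whether computed from the object-relative or GTT address. */
	u32 gtt_reloc_offset = obj->gtt_offset + reloc->offset;
	unsigned page_offset = offset_in_page(reloc->offset);

	if (gtt_reloc_offset < dev_priv->mm.gtt_mappable_end) {
		void __iomem *reloc_page;

		reloc_page = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping,
						      gtt_reloc_offset & PAGE_MASK);
		iowrite32(reloc->delta, reloc_page + page_offset);
		io_mapping_unmap_atomic(reloc_page);
	} else {
		char *vaddr;

		/* reloc->offset is already object-relative, so no helper that
		 * subtracts the GTT start is needed here. */
		vaddr = kmap_atomic(i915_gem_object_get_page(obj,
						reloc->offset >> PAGE_SHIFT));

		drm_clflush_virt_range(vaddr + page_offset, 4);
		*(uint32_t *)(vaddr + page_offset) = reloc->delta;
		drm_clflush_virt_range(vaddr + page_offset, 4);

		kunmap_atomic(vaddr);

		obj->base.pending_write_domain |= I915_GEM_DOMAIN_CPU;
	}
}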
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 163bb52..daa5409 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -33,6 +33,9 @@
 #include "intel_drv.h"
 #include <linux/dma_remapping.h>
 
+#define __EXEC_OBJECT_HAS_PIN (1<<31)
+#define __EXEC_OBJECT_HAS_FENCE (1<<30)
+
 struct eb_objects {
 	int and;
 	struct hlist_head buckets[0];
@@ -95,10 +98,16 @@ eb_destroy(struct eb_objects *eb)
 static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
 {
 	return (obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
-		!obj->map_and_fenceable ||
 		obj->cache_level != I915_CACHE_NONE);
 }
 
+static inline struct page *
+gtt_offset_to_page(struct drm_i915_gem_object *obj, u32 offset)
+{
+	offset -= obj->gtt_space->start;
+	return i915_gem_object_get_page(obj, offset >> PAGE_SHIFT);
+}
+
 static int
 i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 				   struct eb_objects *eb,
@@ -182,22 +191,20 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 		return -EFAULT;
 
 	reloc->delta += target_offset;
+	reloc->offset += obj->gtt_offset;
 	if (use_cpu_reloc(obj)) {
-		uint32_t page_offset = reloc->offset & ~PAGE_MASK;
 		char *vaddr;
 
-		ret = i915_gem_object_set_to_cpu_domain(obj, 1);
+		ret = i915_gem_object_set_to_cpu_domain(obj, true);
 		if (ret)
 			return ret;
 
-		vaddr = kmap_atomic(i915_gem_object_get_page(obj,
-							     reloc->offset >> PAGE_SHIFT));
-		*(uint32_t *)(vaddr + page_offset) = reloc->delta;
+		vaddr = kmap_atomic(gtt_offset_to_page(obj, reloc->offset));
+		*(uint32_t *)(vaddr + offset_in_page(reloc->offset)) = reloc->delta;
 		kunmap_atomic(vaddr);
 	} else {
 		struct drm_i915_private *dev_priv = dev->dev_private;
-		uint32_t __iomem *reloc_entry;
-		void __iomem *reloc_page;
+		unsigned page_offset;
 
 		ret = i915_gem_object_set_to_gtt_domain(obj, true);
 		if (ret)
@@ -208,13 +215,28 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 			return ret;
 
 		/* Map the page containing the relocation we're going to perform. */
-		reloc->offset += obj->gtt_offset;
-		reloc_page = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping,
-						      reloc->offset & PAGE_MASK);
-		reloc_entry = (uint32_t __iomem *)
-			(reloc_page + (reloc->offset & ~PAGE_MASK));
-		iowrite32(reloc->delta, reloc_entry);
-		io_mapping_unmap_atomic(reloc_page);
+		page_offset = offset_in_page(reloc->offset);
+
+		if (reloc->offset < dev_priv->mm.gtt_mappable_end) {
+			void __iomem *reloc_page;
+
+			reloc_page = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping,
+							      reloc->offset & PAGE_MASK);
+			iowrite32(reloc->delta, reloc_page + page_offset);
+			io_mapping_unmap_atomic(reloc_page);
+		} else {
+			char *vaddr;
+
+			vaddr = kmap_atomic(gtt_offset_to_page(obj, reloc->offset));
+
+			drm_clflush_virt_range(vaddr + page_offset, 4);
+			*(uint32_t *)(vaddr + page_offset) = reloc->delta;
+			drm_clflush_virt_range(vaddr + page_offset, 4);
+
+			kunmap_atomic(vaddr);
+
+			obj->base.pending_write_domain |= I915_GEM_DOMAIN_CPU;
+		}
 	}
 
 	/* and update the user's relocation entry */
@@ -312,16 +334,6 @@ i915_gem_execbuffer_relocate(struct drm_device *dev,
 	return ret;
 }
 
-#define __EXEC_OBJECT_HAS_PIN (1<<31)
-#define __EXEC_OBJECT_HAS_FENCE (1<<30)
-
-static int
-need_reloc_mappable(struct drm_i915_gem_object *obj)
-{
-	struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
-	return entry->relocation_count && !use_cpu_reloc(obj);
-}
-
 static int
 i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
 				   struct intel_ring_buffer *ring)
@@ -329,16 +341,15 @@ i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
 	struct drm_i915_private *dev_priv = obj->base.dev->dev_private;
 	struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
 	bool has_fenced_gpu_access = INTEL_INFO(ring->dev)->gen < 4;
-	bool need_fence, need_mappable;
+	bool need_fence;
 	int ret;
 
 	need_fence =
 		has_fenced_gpu_access &&
 		entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
 		obj->tiling_mode != I915_TILING_NONE;
-	need_mappable = need_fence || need_reloc_mappable(obj);
 
-	ret = i915_gem_object_pin(obj, entry->alignment, need_mappable, false);
+	ret = i915_gem_object_pin(obj, entry->alignment, need_fence, false);
 	if (ret)
 		return ret;
 
@@ -401,7 +412,7 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
 	INIT_LIST_HEAD(&ordered_objects);
 	while (!list_empty(objects)) {
 		struct drm_i915_gem_exec_object2 *entry;
-		bool need_fence, need_mappable;
+		bool need_fence;
 
 		obj = list_first_entry(objects,
 				       struct drm_i915_gem_object,
@@ -412,9 +423,8 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
 			has_fenced_gpu_access &&
 			entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
 			obj->tiling_mode != I915_TILING_NONE;
-		need_mappable = need_fence || need_reloc_mappable(obj);
 
-		if (need_mappable)
+		if (need_fence)
 			list_move(&obj->exec_list, &ordered_objects);
 		else
 			list_move_tail(&obj->exec_list, &ordered_objects);
@@ -444,7 +454,7 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
 	/* Unbind any ill-fitting objects or pin. */
 	list_for_each_entry(obj, objects, exec_list) {
 		struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
-		bool need_fence, need_mappable;
+		bool need_fence;
 
 		if (!obj->gtt_space)
 			continue;
 
@@ -453,10 +463,9 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
 			has_fenced_gpu_access &&
 			entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
 			obj->tiling_mode != I915_TILING_NONE;
-		need_mappable = need_fence || need_reloc_mappable(obj);
 
 		if ((entry->alignment && obj->gtt_offset & (entry->alignment - 1)) ||
-		    (need_mappable && !obj->map_and_fenceable))
+		    (need_fence && !obj->map_and_fenceable))
 			ret = i915_gem_object_unbind(obj);
 		else
 			ret = i915_gem_execbuffer_reserve_object(obj, ring);
@@ -603,10 +612,16 @@ i915_gem_execbuffer_move_to_gpu(struct intel_ring_buffer *ring,
 		if (ret)
 			return ret;
 
+		flush_domains |= obj->base.write_domain;
+
 		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
 			i915_gem_clflush_object(obj);
 
-		flush_domains |= obj->base.write_domain;
+		/* Used as an internal marker during relocation processing */
+		if (obj->base.pending_write_domain & ~I915_GEM_GPU_DOMAINS) {
+			flush_domains |= obj->base.pending_write_domain & ~I915_GEM_GPU_DOMAINS;
+			obj->base.pending_write_domain &= I915_GEM_GPU_DOMAINS;
+		}
 	}
 
 	if (flush_domains & I915_GEM_DOMAIN_CPU)
@@ -923,7 +938,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	}
 
 	/* Set the pending read domains for the batch buffer to COMMAND */
-	if (batch_obj->base.pending_write_domain) {
+	if (batch_obj->base.pending_write_domain & I915_GEM_GPU_DOMAINS) {
 		DRM_DEBUG("Attempting to use self-modifying batch buffer\n");
 		ret = -EINVAL;
 		goto err;
If the object lies outside of the mappable GTT aperture, do not force it
through the CPU domain for relocations, but simply flush the writes as
we perform them and then queue a chipset flush.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 87 ++++++++++++++++------------
 1 file changed, 51 insertions(+), 36 deletions(-)