Message ID | 1409693561-1669-3-git-send-email-jbarnes@virtuousgeek.org (mailing list archive) |
---|---|
State | New, archived |
On Tue, Sep 02, 2014 at 02:32:41PM -0700, Jesse Barnes wrote:
> Use a new reloc type to allow userspace to insert sync points within
> batches before they're submitted. The corresponding fence fds are
> returned in the offset field of the returned reloc tree, and can be
> operated on with the sync fence APIs.
>
> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
> ---
>  drivers/gpu/drm/i915/i915_drv.h            |   4 +
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c | 125 ++++++++++++++++++++++++-----
>  drivers/gpu/drm/i915/i915_sync.c           |  58 ++++++++++---
>  include/uapi/drm/i915_drm.h                |  11 ++-
>  4 files changed, 167 insertions(+), 31 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 6eb119e..410eedf 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2284,6 +2284,10 @@ int i915_sync_init(struct drm_i915_private *dev_priv);
>  void i915_sync_fini(struct drm_i915_private *dev_priv);
>  int i915_sync_create_fence_ioctl(struct drm_device *dev, void *data,
>                                   struct drm_file *file);
> +int i915_sync_fence_create(struct intel_engine_cs *ring,
> +                           struct intel_context *ctx,
> +                           u32 seqno);
> +
>
>  #define PIN_MAPPABLE 0x1
>  #define PIN_NONBLOCK 0x2
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 60998fc..32ec599 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -32,6 +32,7 @@
>  #include "i915_trace.h"
>  #include "intel_drv.h"
>  #include <linux/dma_remapping.h>
> +#include "../../../staging/android/sync.h"
>
>  #define __EXEC_OBJECT_HAS_PIN (1<<31)
>  #define __EXEC_OBJECT_HAS_FENCE (1<<30)
> @@ -262,6 +263,67 @@ static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
>                 !obj->map_and_fenceable ||
>                 obj->cache_level != I915_CACHE_NONE);
>  }
> +static int
> +emit_sync_obj_cpu(struct drm_i915_gem_object *obj,
> +                  struct drm_i915_gem_relocation_entry *reloc)
> +{
> +        uint32_t page_offset = offset_in_page(reloc->offset);
> +        char *vaddr;
> +        int ret;
> +
> +        ret = i915_gem_object_set_to_cpu_domain(obj, true);
> +        if (ret)
> +                return ret;
> +
> +        vaddr = kmap_atomic(i915_gem_object_get_page(obj,
> +                                reloc->offset >> PAGE_SHIFT));
> +        *(uint32_t *)(vaddr + page_offset) = MI_STORE_DWORD_INDEX;
> +        *(uint32_t *)(vaddr + page_offset + 4) =
> +                I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT;
> +        *(uint32_t *)(vaddr + page_offset + 8) =
> +                obj->ring->outstanding_lazy_seqno;
> +        *(uint32_t *)(vaddr + page_offset + 12) = MI_USER_INTERRUPT;
> +
> +        kunmap_atomic(vaddr);
> +
> +        return 0;
> +}
> +
> +static int
> +emit_sync_obj_gtt(struct drm_i915_gem_object *obj,
> +                  struct drm_i915_gem_relocation_entry *reloc)
> +{
> +        struct drm_device *dev = obj->base.dev;
> +        struct drm_i915_private *dev_priv = dev->dev_private;
> +        uint32_t __iomem *reloc_entry;
> +        void __iomem *reloc_page;
> +        int ret;
> +
> +        ret = i915_gem_object_set_to_gtt_domain(obj, true);
> +        if (ret)
> +                return ret;
> +
> +        ret = i915_gem_object_put_fence(obj);
> +        if (ret)
> +                return ret;
> +
> +        /* Map the page containing the relocation we're going to perform. */
> +        reloc->offset += i915_gem_obj_ggtt_offset(obj);
> +        reloc_page = io_mapping_map_atomic_wc(dev_priv->gtt.mappable,
> +                                              reloc->offset & PAGE_MASK);
> +
> +        reloc_entry = (uint32_t __iomem *)
> +                (reloc_page + offset_in_page(reloc->offset));
> +        iowrite32(MI_STORE_DWORD_INDEX, reloc_entry);
> +        iowrite32(I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT,
> +                  reloc_entry);
> +        iowrite32(obj->ring->outstanding_lazy_seqno, reloc_entry);
> +        iowrite32(MI_USER_INTERRUPT, reloc_entry);
> +
> +        io_mapping_unmap_atomic(reloc_page);

These commands are illegal/invalid inside the object, only valid inside
the ring.

> +        return 0;
> +}
>
>  static int
>  relocate_entry_cpu(struct drm_i915_gem_object *obj,
> @@ -349,7 +411,8 @@ relocate_entry_gtt(struct drm_i915_gem_object *obj,
>  static int
>  i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
>                                     struct eb_vmas *eb,
> -                                   struct drm_i915_gem_relocation_entry *reloc)
> +                                   struct drm_i915_gem_relocation_entry *reloc,
> +                                   struct intel_context *ctx)

Hmm. That's a nuisance. But no, you only use it to automatically create
a fence not to patch the batch, so you can just use an object-flag.

This fits neatly into requests.
-Chris
On Wed, 3 Sep 2014 08:01:55 +0100
Chris Wilson <chris@chris-wilson.co.uk> wrote:

> These commands are illegal/invalid inside the object, only valid inside
> the ring.

Hm, we ought to be able to write to non-privileged space with
STORE_DWORD, but that does mean moving to context-specific pages in
process space, or at least adding them to our existing scheme.

I haven't tried MI_USER_INTERRUPT from a batch; if we can't do it from
a non-privileged batch, that nixes one of the other neat features we
could have (fine-grained intra-batch userspace synchronization).

> Hmm. That's a nuisance. But no, you only use it to automatically create
> a fence not to patch the batch, so you can just use an object-flag.
>
> This fits neatly into requests.

Most definitely. What do you think of the potential upside in the DDX
for this, assuming we get dword writes from batches working?
On Wed, Sep 03, 2014 at 08:41:06AM -0700, Jesse Barnes wrote:
> On Wed, 3 Sep 2014 08:01:55 +0100
> Chris Wilson <chris@chris-wilson.co.uk> wrote:
>
> > These commands are illegal/invalid inside the object, only valid inside
> > the ring.
>
> Hm, we ought to be able to write to non-privileged space with
> STORE_DWORD, but that does mean moving to context-specific pages in
> process space, or at least adding them to our existing scheme.

The per-process context page also doesn't exist generically. I certainly
hope that userspace can't overwrite the hws! Imagine if we were using
that for interrupt status reads, or seqno tracking...

> I haven't tried MI_USER_INTERRUPT from a batch; if we can't do it from
> a non-privileged batch, that nixes one of the other neat features we
> could have (fine-grained intra-batch userspace synchronization).

I don't understand how writing the operation into the batch is
beneficial vs writing into the ring, unless you intended to use
something more fine-grained than the batch seqno. You want to get
interrupts from inside batches? Rather than continue the existing scheme
of splitting up batches between fences?

I definitely think we should think twice before allowing userspace to
arbitrarily generate interrupts.

> > > +        return 0;
> > > +}
> > >
> > >  static int
> > >  relocate_entry_cpu(struct drm_i915_gem_object *obj,
> > > @@ -349,7 +411,8 @@ relocate_entry_gtt(struct drm_i915_gem_object *obj,
> > >  static int
> > >  i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
> > >                                     struct eb_vmas *eb,
> > > -                                   struct drm_i915_gem_relocation_entry *reloc)
> > > +                                   struct drm_i915_gem_relocation_entry *reloc,
> > > +                                   struct intel_context *ctx)
> >
> > Hmm. That's a nuisance. But no, you only use it to automatically create
> > a fence not to patch the batch, so you can just use an object-flag.
> >
> > This fits neatly into requests.
>
> Most definitely. What do you think of the potential upside in the DDX
> for this, assuming we get dword writes from batches working?

Negative. You now have relocation overhead, you still need to split
batches to keep the gpu busy and do ring switches, and context switching
between clients, so I don't feel a need for fences from inside a batch.

Getting seqno and a hws in the client would be nice, but if it continues
to require kernel polling, no thanks, I'll just stick to approximately
tracking the active state of surfaces with the heavier accurate queries
sparingly.

About the only thing I could see as being useful is that it would allow
you to reuse a batch buffer multiple times, rather than overallocate a
whole page and keep a pool of such pages.

Am I missing something?
-Chris
On Wed, 3 Sep 2014 17:08:53 +0100
Chris Wilson <chris@chris-wilson.co.uk> wrote:

> The per-process context page also doesn't exist generically. I certainly
> hope that userspace can't overwrite the hws! Imagine if we were using
> that for interrupt status reads, or seqno tracking...

Yeah, I'm thinking of an additional hws that's per-context and userspace
mappable. It could come in handy for userspace-only sync stuff.

> I don't understand how writing the operation into the batch is
> beneficial vs writing into the ring, unless you intended to use
> something more fine-grained than the batch seqno. You want to get
> interrupts from inside batches? Rather than continue the existing scheme
> of splitting up batches between fences?

Yeah, the whole idea here was to avoid flushing batches in order to
emit fences, both to avoid overhead and give userspace more rope.

> I definitely think we should think twice before allowing userspace to
> arbitrarily generate interrupts.
>
> > Most definitely. What do you think of the potential upside in the DDX
> > for this, assuming we get dword writes from batches working?
>
> Negative. You now have relocation overhead, you still need to split
> batches to keep the gpu busy and do ring switches, and context switching
> between clients, so I don't feel a need for fences from inside a batch.
>
> Getting seqno and a hws in the client would be nice, but if it continues
> to require kernel polling, no thanks, I'll just stick to approximately
> tracking the active state of surfaces with the heavier accurate queries
> sparingly.
>
> About the only thing I could see as being useful is that it would allow
> you to reuse a batch buffer multiple times, rather than overallocate a
> whole page and keep a pool of such pages.
>
> Am I missing something?

No, I think that's about right. The need for reloc processing is a
definite downside to this approach, but that could be solved with a new
interface, or by just allowing userspace to map/manage a hws. The
downside there is that the resulting fences wouldn't be shareable. But
requiring a flush for that is probably fine.
On Wed, Sep 3, 2014 at 9:01 PM, Jesse Barnes <jbarnes@virtuousgeek.org> wrote:
> On Wed, 3 Sep 2014 17:08:53 +0100
> Chris Wilson <chris@chris-wilson.co.uk> wrote:
>> On Wed, Sep 03, 2014 at 08:41:06AM -0700, Jesse Barnes wrote:
>> > Hm, we ought to be able to write to non-privileged space with
>> > STORE_DWORD, but that does mean moving to context-specific pages in
>> > process space, or at least adding them to our existing scheme.
>>
>> The per-process context page also doesn't exist generically. I certainly
>> hope that userspace can't overwrite the hws! Imagine if we were using
>> that for interrupt status reads, or seqno tracking...
>
> Yeah, I'm thinking of an additional hws that's per-context and userspace
> mappable. It could come in handy for userspace-only sync stuff.

Userspace can already do seqno writes with MI_FLUSH_DW or PIPE_CONTROL -
lots of igt tests actually do that for correctness checks. So the
only thing really is interrupts, and I think for that we really want
the full request tracking machinery in the kernel (otherwise I fear
we'll have even more fun with lost/spurious interrupts since the hw
guys just seem to not be able to get that right). Which means a full
batch split.

I have no idea how that's supposed to work when userspace does direct
hardware submission. But that's kinda a good reason not to do that
anyway, and at least for now it looks like direct hw submission is for
opencl2 only, with interop with other devices (where sync matters) not
a use-case. For interop with other processes the gpu can always do a
seqno write to some shared page. And busy-looping, but apparently
that's what people want for low-latency. Or at least what designers
seem to think people want ...
-Daniel
On Wed, 3 Sep 2014 21:41:02 +0200
Daniel Vetter <daniel@ffwll.ch> wrote:

> I have no idea how that's supposed to work when userspace does direct
> hardware submission. But that's kinda a good reason not to do that
> anyway, and at least for now it looks like direct hw submission is for
> opencl2 only, with interop with other devices (where sync matters) not
> a use-case. For interop with other processes the gpu can always do a
> seqno write to some shared page. And busy-looping, but apparently
> that's what people want for low-latency. Or at least what designers
> seem to think people want ...

Yeah, I haven't thought about how direct submission will work in terms
of IPC. It may just have to be done in userland with a custom
cooperative mechanism...
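For reference, the CPU side of the "seqno write to a shared page plus busy-looping" interop Daniel describes above could look like the following minimal sketch. It assumes both processes already share a mapping of the page and that a batch has been built to write the target value there (via PIPE_CONTROL or MI_FLUSH_DW); the helper name is hypothetical and not part of any proposed uAPI.

#include <stdint.h>
#include <sched.h>

/*
 * Wait for the GPU to write a seqno >= target into a page shared between
 * two processes (illustrative only, not a kernel or libdrm interface).
 * The comparison is wrap-safe assuming monotonically increasing 32-bit
 * seqnos.
 */
static void wait_for_shared_seqno(volatile uint32_t *shared_seqno,
                                  uint32_t target)
{
        while ((int32_t)(*shared_seqno - target) < 0)
                sched_yield(); /* or a pure busy-loop/pause for lowest latency */
}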
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 6eb119e..410eedf 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2284,6 +2284,10 @@ int i915_sync_init(struct drm_i915_private *dev_priv);
 void i915_sync_fini(struct drm_i915_private *dev_priv);
 int i915_sync_create_fence_ioctl(struct drm_device *dev, void *data,
                                  struct drm_file *file);
+int i915_sync_fence_create(struct intel_engine_cs *ring,
+                           struct intel_context *ctx,
+                           u32 seqno);
+
 
 #define PIN_MAPPABLE 0x1
 #define PIN_NONBLOCK 0x2
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 60998fc..32ec599 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -32,6 +32,7 @@
 #include "i915_trace.h"
 #include "intel_drv.h"
 #include <linux/dma_remapping.h>
+#include "../../../staging/android/sync.h"
 
 #define __EXEC_OBJECT_HAS_PIN (1<<31)
 #define __EXEC_OBJECT_HAS_FENCE (1<<30)
@@ -262,6 +263,67 @@ static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
                !obj->map_and_fenceable ||
                obj->cache_level != I915_CACHE_NONE);
 }
+static int
+emit_sync_obj_cpu(struct drm_i915_gem_object *obj,
+                  struct drm_i915_gem_relocation_entry *reloc)
+{
+        uint32_t page_offset = offset_in_page(reloc->offset);
+        char *vaddr;
+        int ret;
+
+        ret = i915_gem_object_set_to_cpu_domain(obj, true);
+        if (ret)
+                return ret;
+
+        vaddr = kmap_atomic(i915_gem_object_get_page(obj,
+                                reloc->offset >> PAGE_SHIFT));
+        *(uint32_t *)(vaddr + page_offset) = MI_STORE_DWORD_INDEX;
+        *(uint32_t *)(vaddr + page_offset + 4) =
+                I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT;
+        *(uint32_t *)(vaddr + page_offset + 8) =
+                obj->ring->outstanding_lazy_seqno;
+        *(uint32_t *)(vaddr + page_offset + 12) = MI_USER_INTERRUPT;
+
+        kunmap_atomic(vaddr);
+
+        return 0;
+}
+
+static int
+emit_sync_obj_gtt(struct drm_i915_gem_object *obj,
+                  struct drm_i915_gem_relocation_entry *reloc)
+{
+        struct drm_device *dev = obj->base.dev;
+        struct drm_i915_private *dev_priv = dev->dev_private;
+        uint32_t __iomem *reloc_entry;
+        void __iomem *reloc_page;
+        int ret;
+
+        ret = i915_gem_object_set_to_gtt_domain(obj, true);
+        if (ret)
+                return ret;
+
+        ret = i915_gem_object_put_fence(obj);
+        if (ret)
+                return ret;
+
+        /* Map the page containing the relocation we're going to perform. */
+        reloc->offset += i915_gem_obj_ggtt_offset(obj);
+        reloc_page = io_mapping_map_atomic_wc(dev_priv->gtt.mappable,
+                                              reloc->offset & PAGE_MASK);
+
+        reloc_entry = (uint32_t __iomem *)
+                (reloc_page + offset_in_page(reloc->offset));
+        iowrite32(MI_STORE_DWORD_INDEX, reloc_entry);
+        iowrite32(I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT,
+                  reloc_entry);
+        iowrite32(obj->ring->outstanding_lazy_seqno, reloc_entry);
+        iowrite32(MI_USER_INTERRUPT, reloc_entry);
+
+        io_mapping_unmap_atomic(reloc_page);
+
+        return 0;
+}
 
 static int
 relocate_entry_cpu(struct drm_i915_gem_object *obj,
@@ -349,7 +411,8 @@ relocate_entry_gtt(struct drm_i915_gem_object *obj,
 static int
 i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
                                    struct eb_vmas *eb,
-                                   struct drm_i915_gem_relocation_entry *reloc)
+                                   struct drm_i915_gem_relocation_entry *reloc,
+                                   struct intel_context *ctx)
 {
        struct drm_device *dev = obj->base.dev;
        struct drm_gem_object *target_obj;
@@ -433,23 +496,39 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
        if (obj->active && in_atomic())
                return -EFAULT;
 
-       if (use_cpu_reloc(obj))
-               ret = relocate_entry_cpu(obj, reloc, target_offset);
-       else
-               ret = relocate_entry_gtt(obj, reloc, target_offset);
+       if (reloc->write_domain & I915_GEM_DOMAIN_SYNC_OBJ) {
+               int fd;
+
+               /* get a new seqno */
+               intel_ring_begin(obj->ring, 0);
+
+               if (use_cpu_reloc(obj))
+                       ret = emit_sync_obj_cpu(obj, reloc);
+               else
+                       ret = emit_sync_obj_gtt(obj, reloc);
+
+               fd = i915_sync_fence_create(obj->ring, ctx,
+                                           obj->ring->outstanding_lazy_seqno);
+               reloc->presumed_offset = fd;
+       } else {
+               if (use_cpu_reloc(obj))
+                       ret = relocate_entry_cpu(obj, reloc, target_offset);
+               else
+                       ret = relocate_entry_gtt(obj, reloc, target_offset);
+               /* and update the user's relocation entry */
+               reloc->presumed_offset = target_offset;
+       }
 
        if (ret)
                return ret;
 
-       /* and update the user's relocation entry */
-       reloc->presumed_offset = target_offset;
-
        return 0;
 }
 
 static int
 i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
-                                struct eb_vmas *eb)
+                                struct eb_vmas *eb,
+                                struct intel_context *ctx)
 {
 #define N_RELOC(x) ((x) / sizeof(struct drm_i915_gem_relocation_entry))
        struct drm_i915_gem_relocation_entry stack_reloc[N_RELOC(512)];
@@ -473,7 +552,7 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
                do {
                        u64 offset = r->presumed_offset;
 
-                       ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, r);
+                       ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, r, ctx);
                        if (ret)
                                return ret;
 
@@ -496,13 +575,14 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
 static int
 i915_gem_execbuffer_relocate_vma_slow(struct i915_vma *vma,
                                      struct eb_vmas *eb,
-                                     struct drm_i915_gem_relocation_entry *relocs)
+                                     struct drm_i915_gem_relocation_entry *relocs,
+                                     struct intel_context *ctx)
 {
        const struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
        int i, ret;
 
        for (i = 0; i < entry->relocation_count; i++) {
-               ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, &relocs[i]);
+               ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, &relocs[i], ctx);
                if (ret)
                        return ret;
        }
@@ -511,7 +591,7 @@ i915_gem_execbuffer_relocate_vma_slow(struct i915_vma *vma,
 }
 
 static int
-i915_gem_execbuffer_relocate(struct eb_vmas *eb)
+i915_gem_execbuffer_relocate(struct eb_vmas *eb, struct intel_context *ctx)
 {
        struct i915_vma *vma;
        int ret = 0;
@@ -525,7 +605,7 @@ i915_gem_execbuffer_relocate(struct eb_vmas *eb)
         */
        pagefault_disable();
        list_for_each_entry(vma, &eb->vmas, exec_list) {
-               ret = i915_gem_execbuffer_relocate_vma(vma, eb);
+               ret = i915_gem_execbuffer_relocate_vma(vma, eb, ctx);
                if (ret)
                        break;
        }
@@ -664,6 +744,13 @@ i915_gem_execbuffer_reserve(struct intel_engine_cs *ring,
                        obj->tiling_mode != I915_TILING_NONE;
                need_mappable = need_fence || need_reloc_mappable(vma);
 
+               /*
+                * If we're emitting a sync obj, we always need a reloc
+                * pass to write the seqno.
+                */
+               if (entry->flags & EXEC_OBJECT_SYNC_OBJ)
+                       *need_relocs = true;
+
                if (need_mappable)
                        list_move(&vma->exec_list, &ordered_vmas);
                else
@@ -734,7 +821,8 @@ i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
                                  struct drm_file *file,
                                  struct intel_engine_cs *ring,
                                  struct eb_vmas *eb,
-                                 struct drm_i915_gem_exec_object2 *exec)
+                                 struct drm_i915_gem_exec_object2 *exec,
+                                 struct intel_context *ctx)
 {
        struct drm_i915_gem_relocation_entry *reloc;
        struct i915_address_space *vm;
@@ -830,7 +918,7 @@ i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
        list_for_each_entry(vma, &eb->vmas, exec_list) {
                int offset = vma->exec_entry - exec;
                ret = i915_gem_execbuffer_relocate_vma_slow(vma, eb,
-                                                           reloc + reloc_offset[offset]);
+                                                           reloc + reloc_offset[offset], ctx);
                if (ret)
                        goto err;
        }
@@ -1340,17 +1428,18 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 
        /* Move the objects en-masse into the GTT, evicting if necessary. */
        need_relocs = (args->flags & I915_EXEC_NO_RELOC) == 0;
+
        ret = i915_gem_execbuffer_reserve(ring, &eb->vmas, &need_relocs);
        if (ret)
                goto err;
 
        /* The objects are in their final locations, apply the relocations. */
        if (need_relocs)
-               ret = i915_gem_execbuffer_relocate(eb);
+               ret = i915_gem_execbuffer_relocate(eb, ctx);
        if (ret) {
                if (ret == -EFAULT) {
                        ret = i915_gem_execbuffer_relocate_slow(dev, args, file, ring,
-                                                               eb, exec);
+                                                               eb, exec, ctx);
                        BUG_ON(!mutex_is_locked(&dev->struct_mutex));
                }
                if (ret)
diff --git a/drivers/gpu/drm/i915/i915_sync.c b/drivers/gpu/drm/i915/i915_sync.c
index 4938616..bd54fca 100644
--- a/drivers/gpu/drm/i915/i915_sync.c
+++ b/drivers/gpu/drm/i915/i915_sync.c
@@ -195,32 +195,72 @@ static struct fence_ops i915_fence_ops = {
        .timeline_value_str = i915_fence_timeline_value_str,
 };
 
-static struct fence *i915_fence_create(struct intel_engine_cs *ring,
-                                      struct intel_context *ctx)
+static struct i915_fence *__i915_fence_create(struct intel_engine_cs *ring,
+                                             struct intel_context *ctx,
+                                             u32 seqno)
 {
        struct i915_fence *fence;
-       int ret;
 
        fence = kzalloc(sizeof(*fence), GFP_KERNEL);
        if (!fence)
                return NULL;
 
+       fence->ring = ring;
+       fence->ctx = ctx;
+       fence->seqno = ring->outstanding_lazy_seqno;
+       fence_init(&fence->base, &i915_fence_ops, &fence_lock, ctx->user_handle,
+                  fence->seqno);
+
+       return fence;
+}
+
+static struct fence *i915_fence_create(struct intel_engine_cs *ring,
+                                      struct intel_context *ctx)
+{
+       struct i915_fence *fence;
+       int ret;
+
        ret = ring->add_request(ring);
        if (ret) {
                DRM_ERROR("add_request failed\n");
-               fence_free((struct fence *)fence);
                return NULL;
        }
 
-       fence->ring = ring;
-       fence->ctx = ctx;
-       fence->seqno = ring->outstanding_lazy_seqno;
-       fence_init(&fence->base, &i915_fence_ops, &fence_lock, ctx->user_handle,
-                  fence->seqno);
+       fence = __i915_fence_create(ring, ctx, ring->outstanding_lazy_seqno);
 
        return &fence->base;
 }
 
+int i915_sync_fence_create(struct intel_engine_cs *ring,
+                          struct intel_context *ctx,
+                          u32 seqno)
+{
+       struct i915_fence *fence;
+       struct sync_fence *sfence;
+       char name[64];
+       int fd = get_unused_fd_flags(O_CLOEXEC);
+
+       fence = __i915_fence_create(ring, ctx, seqno);
+       if (!fence) {
+               fd = -ENOMEM;
+               goto err;
+       }
+
+       snprintf(name, sizeof(name), "0x%08x:0x%08x",
+                ctx->user_handle, seqno);
+       sfence = sync_fence_create_dma(name, &fence->base);
+       if (!sfence) {
+               fence_free((struct fence *)fence);
+               fd = -ENOMEM;
+               goto err;
+       }
+
+       sync_fence_install(sfence, fd);
+
+err:
+       return fd;
+}
+
 /**
  * i915_sync_create_fence_ioctl - fence creation function
  * @dev: drm device
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 65bd271..edadab2 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -585,6 +585,8 @@ struct drm_i915_gem_relocation_entry {
 #define I915_GEM_DOMAIN_VERTEX          0x00000020
 /** GTT domain - aperture and scanout */
 #define I915_GEM_DOMAIN_GTT             0x00000040
+/** Sync object - special for inline fences */
+#define I915_GEM_DOMAIN_SYNC_OBJ        0x00000080
 /** @} */
 
 struct drm_i915_gem_exec_object {
@@ -661,10 +663,11 @@ struct drm_i915_gem_exec_object2 {
         */
        __u64 offset;
 
-#define EXEC_OBJECT_NEEDS_FENCE (1<<0)
-#define EXEC_OBJECT_NEEDS_GTT   (1<<1)
-#define EXEC_OBJECT_WRITE       (1<<2)
-#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_WRITE<<1)
+#define EXEC_OBJECT_NEEDS_FENCE (1<<0) /* requires fence register */
+#define EXEC_OBJECT_NEEDS_GTT   (1<<1) /* needs global GTT mapping */
+#define EXEC_OBJECT_WRITE       (1<<2) /* object will be written */
+#define EXEC_OBJECT_SYNC_OBJ    (1<<3) /* emit a sync obj instead */
+#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_SYNC_OBJ<<1)
        __u64 flags;
 
        __u64 rsvd1;
Use a new reloc type to allow userspace to insert sync points within
batches before they're submitted. The corresponding fence fds are
returned in the offset field of the returned reloc tree, and can be
operated on with the sync fence APIs.

Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/gpu/drm/i915/i915_drv.h            |   4 +
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 125 ++++++++++++++++++++++++-----
 drivers/gpu/drm/i915/i915_sync.c           |  58 ++++++++++---
 include/uapi/drm/i915_drm.h                |  11 ++-
 4 files changed, 167 insertions(+), 31 deletions(-)
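For context, a rough sketch of how userspace might drive the interface proposed in this patch follows. It relies on the I915_GEM_DOMAIN_SYNC_OBJ write domain and EXEC_OBJECT_SYNC_OBJ flag added above, and on the fence fd coming back in the relocation's presumed_offset field as in the posted code; the helper name, the reservation of four dwords in the batch, and the single-reloc setup are illustrative assumptions rather than a settled ABI.

#include <poll.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <xf86drm.h>
#include <i915_drm.h>

/* Insert a sync point at byte offset 'sync_offset' inside the batch object
 * and wait for it to signal. 'execbuf' and 'batch_obj' are assumed to be
 * already set up for a normal execbuffer2 submission. */
static int emit_sync_point_and_wait(int drm_fd, uint32_t batch_handle,
                                    struct drm_i915_gem_execbuffer2 *execbuf,
                                    struct drm_i915_gem_exec_object2 *batch_obj,
                                    uint32_t sync_offset)
{
        struct drm_i915_gem_relocation_entry sync_reloc;
        struct pollfd pfd;
        int fence_fd, ret;

        /* The kernel patches MI_STORE_DWORD_INDEX / seqno / MI_USER_INTERRUPT
         * into the four dwords reserved at sync_offset, so the batch only
         * needs to leave room there (e.g. pre-filled with MI_NOOP). */
        memset(&sync_reloc, 0, sizeof(sync_reloc));
        sync_reloc.target_handle = batch_handle;
        sync_reloc.offset = sync_offset;
        sync_reloc.write_domain = I915_GEM_DOMAIN_SYNC_OBJ;  /* proposed domain */

        /* A real driver would append this to its existing relocation list. */
        batch_obj->relocs_ptr = (uintptr_t)&sync_reloc;
        batch_obj->relocation_count = 1;
        batch_obj->flags |= EXEC_OBJECT_SYNC_OBJ;             /* proposed flag */

        ret = drmIoctl(drm_fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, execbuf);
        if (ret)
                return ret;

        /* Per the commit message, the fence fd is returned through the reloc
         * tree; the posted code writes it to presumed_offset. */
        fence_fd = (int)sync_reloc.presumed_offset;

        /* Sync fence fds signal POLLIN once the sync point has passed. */
        pfd.fd = fence_fd;
        pfd.events = POLLIN;
        ret = poll(&pfd, 1, -1);

        close(fence_fd);
        return ret < 0 ? ret : 0;
}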