
[2/2] drm/i915: allow sync points within batches

Message ID 1409693561-1669-3-git-send-email-jbarnes@virtuousgeek.org (mailing list archive)
State New, archived

Commit Message

Jesse Barnes Sept. 2, 2014, 9:32 p.m. UTC
Use a new reloc type to allow userspace to insert sync points within
batches before they're submitted.  The corresponding fence fds are
returned in the offset field of the returned reloc tree, and can be
operated on with the sync fence APIs.

Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/gpu/drm/i915/i915_drv.h            |   4 +
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 125 ++++++++++++++++++++++++-----
 drivers/gpu/drm/i915/i915_sync.c           |  58 ++++++++++---
 include/uapi/drm/i915_drm.h                |  11 ++-
 4 files changed, 167 insertions(+), 31 deletions(-)
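
A rough userspace sketch of the flow described above: reserve a few dwords in the batch, mark a reloc with the sync-obj write domain, and read the fence fd back out of presumed_offset after submission.  I915_GEM_DOMAIN_SYNC_OBJ and EXEC_OBJECT_SYNC_OBJ come from the patch below; the handles, offsets and libdrm glue are placeholders rather than part of this series.

/*
 * Illustrative only: drives the sync-point reloc added by this patch.
 * Assumes uapi headers with this series applied; error handling trimmed.
 */
#include <stdint.h>
#include <string.h>
#include <poll.h>
#include <xf86drm.h>
#include <drm/i915_drm.h>

static int emit_batch_with_sync_point(int drm_fd, uint32_t batch_handle,
				      uint32_t batch_len, uint32_t sync_offset)
{
	struct drm_i915_gem_relocation_entry reloc;
	struct drm_i915_gem_exec_object2 obj;
	struct drm_i915_gem_execbuffer2 execbuf;
	struct pollfd pfd;
	int fence_fd;

	memset(&reloc, 0, sizeof(reloc));
	reloc.target_handle = batch_handle;	/* target lookup still happens */
	reloc.offset = sync_offset;		/* 4 dwords reserved here for the seqno write */
	reloc.write_domain = I915_GEM_DOMAIN_SYNC_OBJ;	/* sync point, not an address patch */

	memset(&obj, 0, sizeof(obj));
	obj.handle = batch_handle;
	obj.relocation_count = 1;
	obj.relocs_ptr = (uintptr_t)&reloc;
	obj.flags = EXEC_OBJECT_SYNC_OBJ;	/* forces the reloc pass that emits the seqno */

	memset(&execbuf, 0, sizeof(execbuf));
	execbuf.buffers_ptr = (uintptr_t)&obj;
	execbuf.buffer_count = 1;
	execbuf.batch_len = batch_len;
	execbuf.flags = I915_EXEC_RENDER;

	if (drmIoctl(drm_fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf))
		return -1;

	/* The fence fd comes back in the reloc's presumed_offset field. */
	fence_fd = (int)reloc.presumed_offset;

	/* A sync fence fd is pollable; wait here for the sync point to signal. */
	pfd.fd = fence_fd;
	pfd.events = POLLIN;
	poll(&pfd, 1, -1);

	return fence_fd;
}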

Comments

Chris Wilson Sept. 3, 2014, 7:01 a.m. UTC | #1
On Tue, Sep 02, 2014 at 02:32:41PM -0700, Jesse Barnes wrote:
> Use a new reloc type to allow userspace to insert sync points within
> batches before they're submitted.  The corresponding fence fds are
> returned in the offset field of the returned reloc tree, and can be
> operated on with the sync fence APIs.
> 
> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
> ---
>  drivers/gpu/drm/i915/i915_drv.h            |   4 +
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c | 125 ++++++++++++++++++++++++-----
>  drivers/gpu/drm/i915/i915_sync.c           |  58 ++++++++++---
>  include/uapi/drm/i915_drm.h                |  11 ++-
>  4 files changed, 167 insertions(+), 31 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 6eb119e..410eedf 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2284,6 +2284,10 @@ int i915_sync_init(struct drm_i915_private *dev_priv);
>  void i915_sync_fini(struct drm_i915_private *dev_priv);
>  int i915_sync_create_fence_ioctl(struct drm_device *dev, void *data,
>  				 struct drm_file *file);
> +int i915_sync_fence_create(struct intel_engine_cs *ring,
> +			   struct intel_context *ctx,
> +			   u32 seqno);
> +
>  
>  #define PIN_MAPPABLE 0x1
>  #define PIN_NONBLOCK 0x2
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 60998fc..32ec599 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -32,6 +32,7 @@
>  #include "i915_trace.h"
>  #include "intel_drv.h"
>  #include <linux/dma_remapping.h>
> +#include "../../../staging/android/sync.h"
>  
>  #define  __EXEC_OBJECT_HAS_PIN (1<<31)
>  #define  __EXEC_OBJECT_HAS_FENCE (1<<30)
> @@ -262,6 +263,67 @@ static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
>  		!obj->map_and_fenceable ||
>  		obj->cache_level != I915_CACHE_NONE);
>  }
> +static int
> +emit_sync_obj_cpu(struct drm_i915_gem_object *obj,
> +		  struct drm_i915_gem_relocation_entry *reloc)
> +{
> +	uint32_t page_offset = offset_in_page(reloc->offset);
> +	char *vaddr;
> +	int ret;
> +
> +	ret = i915_gem_object_set_to_cpu_domain(obj, true);
> +	if (ret)
> +		return ret;
> +
> +	vaddr = kmap_atomic(i915_gem_object_get_page(obj,
> +				reloc->offset >> PAGE_SHIFT));
> +	*(uint32_t *)(vaddr + page_offset) = MI_STORE_DWORD_INDEX;
> +	*(uint32_t *)(vaddr + page_offset + 4) =
> +		I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT;
> +	*(uint32_t *)(vaddr + page_offset + 8) =
> +		obj->ring->outstanding_lazy_seqno;
> +	*(uint32_t *)(vaddr + page_offset + 12) = MI_USER_INTERRUPT;
> +
> +	kunmap_atomic(vaddr);
> +
> +	return 0;
> +}
> +
> +static int
> +emit_sync_obj_gtt(struct drm_i915_gem_object *obj,
> +		  struct drm_i915_gem_relocation_entry *reloc)
> +{
> +	struct drm_device *dev = obj->base.dev;
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	uint32_t __iomem *reloc_entry;
> +	void __iomem *reloc_page;
> +	int ret;
> +
> +	ret = i915_gem_object_set_to_gtt_domain(obj, true);
> +	if (ret)
> +		return ret;
> +
> +	ret = i915_gem_object_put_fence(obj);
> +	if (ret)
> +		return ret;
> +
> +	/* Map the page containing the relocation we're going to perform.  */
> +	reloc->offset += i915_gem_obj_ggtt_offset(obj);
> +	reloc_page = io_mapping_map_atomic_wc(dev_priv->gtt.mappable,
> +			reloc->offset & PAGE_MASK);
> +
> +	reloc_entry = (uint32_t __iomem *)
> +		(reloc_page + offset_in_page(reloc->offset));
> +	iowrite32(MI_STORE_DWORD_INDEX, reloc_entry);
> +	iowrite32(I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT,
> +		  reloc_entry);
> +	iowrite32(obj->ring->outstanding_lazy_seqno, reloc_entry);
> +	iowrite32(MI_USER_INTERRUPT, reloc_entry);
> +
> +	io_mapping_unmap_atomic(reloc_page);

These commands are illegal/invalid inside the object, only valid inside
the ring.

> +	return 0;
> +}
>  
>  static int
>  relocate_entry_cpu(struct drm_i915_gem_object *obj,
> @@ -349,7 +411,8 @@ relocate_entry_gtt(struct drm_i915_gem_object *obj,
>  static int
>  i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
>  				   struct eb_vmas *eb,
> -				   struct drm_i915_gem_relocation_entry *reloc)
> +				   struct drm_i915_gem_relocation_entry *reloc,
> +				   struct intel_context *ctx)

Hmm. That's a nuisance. But no, you only use it to automatically create
a fence not to patch the batch, so you can just use an object-flag.

This fits neatly into requests.
-Chris
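
(For illustration only, the object-flag alternative suggested here might look something like the sketch below on the kernel side: no batch patching, just a fence created against the request's seqno when the flag is seen.  EXEC_OBJECT_SYNC_OBJ and i915_sync_fence_create() are taken from the patch; the helper name and the use of rsvd2 to hand the fd back to userspace are hypothetical.)

/*
 * Sketch only; would live next to the reservation/relocation code in
 * i915_gem_execbuffer.c.  Fence creation piggybacks on the request seqno,
 * so nothing is written into the batch object.
 */
static int
i915_gem_execbuffer_emit_sync_fences(struct intel_engine_cs *ring,
				     struct intel_context *ctx,
				     struct list_head *vmas)
{
	struct i915_vma *vma;

	list_for_each_entry(vma, vmas, exec_list) {
		struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
		int fd;

		if (!(entry->flags & EXEC_OBJECT_SYNC_OBJ))
			continue;

		/* Signals when the whole request retires; no batch patching. */
		fd = i915_sync_fence_create(ring, ctx,
					    ring->outstanding_lazy_seqno);
		if (fd < 0)
			return fd;

		entry->rsvd2 = fd;	/* hypothetical return path for the fd */
	}

	return 0;
}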
Jesse Barnes Sept. 3, 2014, 3:41 p.m. UTC | #2
On Wed, 3 Sep 2014 08:01:55 +0100
Chris Wilson <chris@chris-wilson.co.uk> wrote:

> On Tue, Sep 02, 2014 at 02:32:41PM -0700, Jesse Barnes wrote:
> > Use a new reloc type to allow userspace to insert sync points within
> > batches before they're submitted.  The corresponding fence fds are
> > returned in the offset field of the returned reloc tree, and can be
> > operated on with the sync fence APIs.
> > 
> > Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
> > ---
> >  drivers/gpu/drm/i915/i915_drv.h            |   4 +
> >  drivers/gpu/drm/i915/i915_gem_execbuffer.c | 125 ++++++++++++++++++++++++-----
> >  drivers/gpu/drm/i915/i915_sync.c           |  58 ++++++++++---
> >  include/uapi/drm/i915_drm.h                |  11 ++-
> >  4 files changed, 167 insertions(+), 31 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index 6eb119e..410eedf 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -2284,6 +2284,10 @@ int i915_sync_init(struct drm_i915_private *dev_priv);
> >  void i915_sync_fini(struct drm_i915_private *dev_priv);
> >  int i915_sync_create_fence_ioctl(struct drm_device *dev, void *data,
> >  				 struct drm_file *file);
> > +int i915_sync_fence_create(struct intel_engine_cs *ring,
> > +			   struct intel_context *ctx,
> > +			   u32 seqno);
> > +
> >  
> >  #define PIN_MAPPABLE 0x1
> >  #define PIN_NONBLOCK 0x2
> > diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > index 60998fc..32ec599 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > @@ -32,6 +32,7 @@
> >  #include "i915_trace.h"
> >  #include "intel_drv.h"
> >  #include <linux/dma_remapping.h>
> > +#include "../../../staging/android/sync.h"
> >  
> >  #define  __EXEC_OBJECT_HAS_PIN (1<<31)
> >  #define  __EXEC_OBJECT_HAS_FENCE (1<<30)
> > @@ -262,6 +263,67 @@ static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
> >  		!obj->map_and_fenceable ||
> >  		obj->cache_level != I915_CACHE_NONE);
> >  }
> > +static int
> > +emit_sync_obj_cpu(struct drm_i915_gem_object *obj,
> > +		  struct drm_i915_gem_relocation_entry *reloc)
> > +{
> > +	uint32_t page_offset = offset_in_page(reloc->offset);
> > +	char *vaddr;
> > +	int ret;
> > +
> > +	ret = i915_gem_object_set_to_cpu_domain(obj, true);
> > +	if (ret)
> > +		return ret;
> > +
> > +	vaddr = kmap_atomic(i915_gem_object_get_page(obj,
> > +				reloc->offset >> PAGE_SHIFT));
> > +	*(uint32_t *)(vaddr + page_offset) = MI_STORE_DWORD_INDEX;
> > +	*(uint32_t *)(vaddr + page_offset + 4) =
> > +		I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT;
> > +	*(uint32_t *)(vaddr + page_offset + 8) =
> > +		obj->ring->outstanding_lazy_seqno;
> > +	*(uint32_t *)(vaddr + page_offset + 12) = MI_USER_INTERRUPT;
> > +
> > +	kunmap_atomic(vaddr);
> > +
> > +	return 0;
> > +}
> > +
> > +static int
> > +emit_sync_obj_gtt(struct drm_i915_gem_object *obj,
> > +		  struct drm_i915_gem_relocation_entry *reloc)
> > +{
> > +	struct drm_device *dev = obj->base.dev;
> > +	struct drm_i915_private *dev_priv = dev->dev_private;
> > +	uint32_t __iomem *reloc_entry;
> > +	void __iomem *reloc_page;
> > +	int ret;
> > +
> > +	ret = i915_gem_object_set_to_gtt_domain(obj, true);
> > +	if (ret)
> > +		return ret;
> > +
> > +	ret = i915_gem_object_put_fence(obj);
> > +	if (ret)
> > +		return ret;
> > +
> > +	/* Map the page containing the relocation we're going to perform.  */
> > +	reloc->offset += i915_gem_obj_ggtt_offset(obj);
> > +	reloc_page = io_mapping_map_atomic_wc(dev_priv->gtt.mappable,
> > +			reloc->offset & PAGE_MASK);
> > +
> > +	reloc_entry = (uint32_t __iomem *)
> > +		(reloc_page + offset_in_page(reloc->offset));
> > +	iowrite32(MI_STORE_DWORD_INDEX, reloc_entry);
> > +	iowrite32(I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT,
> > +		  reloc_entry);
> > +	iowrite32(obj->ring->outstanding_lazy_seqno, reloc_entry);
> > +	iowrite32(MI_USER_INTERRUPT, reloc_entry);
> > +
> > +	io_mapping_unmap_atomic(reloc_page);
> 
> These commands are illegal/invalid inside the object, only valid inside
> the ring.

Hm, we ought to be able to write to non-privileged space with
STORE_DWORD, but that does mean moving to context-specific pages in
process space, or at least adding them to our existing scheme.

I haven't tried MI_USER_INTERRUPT from a batch; if we can't do it from
a non-privileged batch, that nixes one of the other neat features we
could have (fine-grained intra-batch userspace synchronization).

> > +	return 0;
> > +}
> >  
> >  static int
> >  relocate_entry_cpu(struct drm_i915_gem_object *obj,
> > @@ -349,7 +411,8 @@ relocate_entry_gtt(struct drm_i915_gem_object *obj,
> >  static int
> >  i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
> >  				   struct eb_vmas *eb,
> > -				   struct drm_i915_gem_relocation_entry *reloc)
> > +				   struct drm_i915_gem_relocation_entry *reloc,
> > +				   struct intel_context *ctx)
> 
> Hmm. That's a nuisance. But no, you only use it to automatically create
> a fence not to patch the batch, so you can just use an object-flag.
> 
> This fits neatly into requests.

Most definitely.  What do you think of the potential upside in the DDX
for this, assuming we get dword writes from batches working?
Chris Wilson Sept. 3, 2014, 4:08 p.m. UTC | #3
On Wed, Sep 03, 2014 at 08:41:06AM -0700, Jesse Barnes wrote:
> On Wed, 3 Sep 2014 08:01:55 +0100
> Chris Wilson <chris@chris-wilson.co.uk> wrote:
> 
> > These commands are illegal/invalid inside the object, only valid inside
> > the ring.
> 
> Hm, we ought to be able to write to non-privileged space with
> STORE_DWORD, but that does mean moving to context-specific pages in
> process space, or at least adding them to our existing scheme.

The per-process context page also doesn't exist generically. I certainly
hope that userspace can't overwrite the hws! Imagine if we were using
that for interrupt status reads, or seqno tracking...
 
> I haven't tried MI_USER_INTERRUPT from a batch; if we can't do it from
> a non-privileged batch, that nixes one of the other neat features we
> could have (fine-grained intra-batch userspace synchronization).

I don't understand how writing the operation into the batch is
beneficial vs. writing into the ring, unless you intended to use
something more fine-grained than the batch seqno. You want to get
interrupts from inside batches? Rather than continue the existing scheme
of splitting up batches between fences?

I definitely think we should think twice before allowing userspace to
arbitrarily generate interrupts.

> > > +	return 0;
> > > +}
> > >  
> > >  static int
> > >  relocate_entry_cpu(struct drm_i915_gem_object *obj,
> > > @@ -349,7 +411,8 @@ relocate_entry_gtt(struct drm_i915_gem_object *obj,
> > >  static int
> > >  i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
> > >  				   struct eb_vmas *eb,
> > > -				   struct drm_i915_gem_relocation_entry *reloc)
> > > +				   struct drm_i915_gem_relocation_entry *reloc,
> > > +				   struct intel_context *ctx)
> > 
> > Hmm. That's a nuisance. But no, you only use it to automatically create
> > a fence not to patch the batch, so you can just use an object-flag.
> > 
> > This fits neatly into requests.
> 
> Most definitely.  What do you think of the potential upside in the DDX
> for this, assuming we get dword writes from batches working?

Negative. You now have relocation overhead, you still need to split
batches to keep the gpu busy, do ring switches, and context switch
between clients, so I don't feel a need for fences from inside a batch.

Getting a seqno and a hws in the client would be nice, but if it continues
to require kernel polling, no thanks; I'll just stick to approximately
tracking the active state of surfaces, using the heavier accurate queries
sparingly.

About the only thing I could see as being useful is that it would allow
you to reuse a batch buffer multiple times, rather than overallocate a
whole page and keep a pool of such pages.

Am I missing something?
-Chris
Jesse Barnes Sept. 3, 2014, 7:01 p.m. UTC | #4
On Wed, 3 Sep 2014 17:08:53 +0100
Chris Wilson <chris@chris-wilson.co.uk> wrote:

> On Wed, Sep 03, 2014 at 08:41:06AM -0700, Jesse Barnes wrote:
> > On Wed, 3 Sep 2014 08:01:55 +0100
> > Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > 
> > > These commands are illegal/invalid inside the object, only valid inside
> > > the ring.
> > 
> > Hm, we ought to be able to write to non-privileged space with
> > STORE_DWORD, but that does mean moving to context-specific pages in
> > process space, or at least adding them to our existing scheme.
> 
> The per-process context page also doesn't exist generically. I certainly
> hope that userspace can't overwrite the hws! Imagine if we were using
> that for interrupt status reads, or seqno tracking...

Yeah I'm thinking of an additional hws that's per-context and userspace
mappable.  It could come in handy for userspace only sync stuff.

>  
> > I haven't tried MI_USER_INTERRUPT from a batch; if we can't do it from
> > a non-privileged batch, that nixes one of the other neat features we
> > could have (fine-grained intra-batch userspace synchronization).
> 
> I don't understand how writing the operation into the batch is
> beneficial vs. writing into the ring, unless you intended to use
> something more fine-grained than the batch seqno. You want to get
> interrupts from inside batches? Rather than continue the existing scheme
> of splitting up batches between fences?

Yeah, the whole idea here was to avoid flushing batches in order to
emit fences, both to avoid overhead and give userspace more rope.

> 
> I definitely think we should think twice before allowing userspace to
> arbitrarily generate interrupts.
> 
> > > > +	return 0;
> > > > +}
> > > >  
> > > >  static int
> > > >  relocate_entry_cpu(struct drm_i915_gem_object *obj,
> > > > @@ -349,7 +411,8 @@ relocate_entry_gtt(struct drm_i915_gem_object *obj,
> > > >  static int
> > > >  i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
> > > >  				   struct eb_vmas *eb,
> > > > -				   struct drm_i915_gem_relocation_entry *reloc)
> > > > +				   struct drm_i915_gem_relocation_entry *reloc,
> > > > +				   struct intel_context *ctx)
> > > 
> > > Hmm. That's a nuisance. But no, you only use it to automatically create
> > > a fence not to patch the batch, so you can just use an object-flag.
> > > 
> > > This fits neatly into requests.
> > 
> > Most definitely.  What do you think of the potential upside in the DDX
> > for this, assuming we get dword writes from batches working?
> 
> Negative. You now have relocation overhead, you still need to split
> batches to keep the gpu busy, do ring switches, and context switch
> between clients, so I don't feel a need for fences from inside a batch.
> 
> Getting a seqno and a hws in the client would be nice, but if it continues
> to require kernel polling, no thanks; I'll just stick to approximately
> tracking the active state of surfaces, using the heavier accurate queries
> sparingly.
> 
> About the only thing I could see as being useful is that it would allow
> you to reuse a batch buffer multiple times, rather than overallocate a
> whole page and keep a pool of such pages.
> 
> Am I missing something?

No, I think that's about right.  The need for reloc processing is a
definite downside to this approach, but that could be solved with a new
interface, or by just allowing userspace to map/manage a hws.  The
downside there is that the resulting fences wouldn't be shareable.  But
requiring a flush for that is probably fine.
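
(A minimal sketch of the userspace-only sync that a mappable per-context status page would enable, assuming the batch writes increasing seqnos into the page, e.g. via an MI_STORE_DWORD_IMM or PIPE_CONTROL post-sync write, and that the mapping already exists; none of this is implemented by the series.)

/*
 * "hws" is a client-visible, GPU-writable status page; slot is the dword
 * this client uses for its sync points.  Pure busy-wait, no interrupts.
 */
#include <stdint.h>
#include <xmmintrin.h>	/* _mm_pause() */

static void wait_for_seqno(volatile uint32_t *hws, unsigned int slot,
			   uint32_t seqno)
{
	/* i915-style wraparound-safe comparison */
	while ((int32_t)(hws[slot] - seqno) < 0)
		_mm_pause();	/* cheap spin; fine for latency, bad for power */
}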
Daniel Vetter Sept. 3, 2014, 7:41 p.m. UTC | #5
On Wed, Sep 3, 2014 at 9:01 PM, Jesse Barnes <jbarnes@virtuousgeek.org> wrote:
> On Wed, 3 Sep 2014 17:08:53 +0100
> Chris Wilson <chris@chris-wilson.co.uk> wrote:
>> On Wed, Sep 03, 2014 at 08:41:06AM -0700, Jesse Barnes wrote:
>> > On Wed, 3 Sep 2014 08:01:55 +0100
>> > Chris Wilson <chris@chris-wilson.co.uk> wrote:
>> >
>> > > These commands are illegal/invalid inside the object, only valid inside
>> > > the ring.
>> >
>> > Hm, we ought to be able to write to non-privileged space with
>> > STORE_DWORD, but that does mean moving to context-specific pages in
>> > process space, or at least adding them to our existing scheme.
>>
>> The per-process context page also doesn't exist generically. I certainly
>> hope that userspace can't overwrite the hws! Imagine if we were using
>> that for interrupt status reads, or seqno tracking...
>
> Yeah I'm thinking of an additional hws that's per-context and userspace
> mappable.  It could come in handy for userspace only sync stuff.

Userspace can already do seqno writes with MI_FLUSH_DW or PIPE_CONTROL
- lots of igt tests actually do that for correctness checks. So the
only thing really is interrupts, and I think for that we really want
the full request tracking machinery in the kernel (otherwise I fear
we'll have even more fun with lost/spurious interrupts since the hw
guys just seem to not be able to get that right). Which means a full
batch split.

I have no idea how that's supposed to work when userspace does direct
hardware submission. But that's kind of a good reason not to do that
anyway, and at least for now it looks like direct hw submission is for
OpenCL 2 only, with interop with other devices (where sync matters) not
a use case. For interop with other processes the gpu can always do a
seqno write to some shared page. And busy-looping, but apparently
that's what people want for low-latency. Or at least what designers
seem to think people want ...
-Daniel
Jesse Barnes Sept. 3, 2014, 7:48 p.m. UTC | #6
On Wed, 3 Sep 2014 21:41:02 +0200
Daniel Vetter <daniel@ffwll.ch> wrote:

> On Wed, Sep 3, 2014 at 9:01 PM, Jesse Barnes <jbarnes@virtuousgeek.org> wrote:
> > On Wed, 3 Sep 2014 17:08:53 +0100
> > Chris Wilson <chris@chris-wilson.co.uk> wrote:
> >> On Wed, Sep 03, 2014 at 08:41:06AM -0700, Jesse Barnes wrote:
> >> > On Wed, 3 Sep 2014 08:01:55 +0100
> >> > Chris Wilson <chris@chris-wilson.co.uk> wrote:
> >> >
> >> > > These commands are illegal/invalid inside the object, only valid inside
> >> > > the ring.
> >> >
> >> > Hm, we ought to be able to write to non-privileged space with
> >> > STORE_DWORD, but that does mean moving to context-specific pages in
> >> > process space, or at least adding them to our existing scheme.
> >>
> >> The per-process context page also doesn't exist generically. I certainly
> >> hope that userspace can't overwrite the hws! Imagine if we were using
> >> that for interrupt status reads, or seqno tracking...
> >
> > Yeah I'm thinking of an additional hws that's per-context and userspace
> > mappable.  It could come in handy for userspace only sync stuff.
> 
> Userspace can already do seqno writes with MI_FLUSH_DW or PIPE_CONTROL
> - lots of igt tests actually do that for correctness checks. So the
> only thing really is interrupts, and I think for that we really want
> the full request tracking machinery in the kernel (otherwise I fear
> we'll have even more fun with lost/spurious interrupts since the hw
> guys just seem to not be able to get that right). Which means a full
> batch split.
> 
> I have no idea how that's supposed to work when userspace does direct
> hardware submission. But that's kind of a good reason not to do that
> anyway, and at least for now it looks like direct hw submission is for
> OpenCL 2 only, with interop with other devices (where sync matters) not
> a use case. For interop with other processes the gpu can always do a
> seqno write to some shared page. And busy-looping, but apparently
> that's what people want for low-latency. Or at least what designers
> seem to think people want ...

Yeah, I haven't thought about how direct submission will work in terms of
IPC.  It may just have to be done in userland with a custom cooperative
mechanism...
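
(One conceivable shape for such a cooperative userland mechanism, sketched purely for illustration: a seqno in a shared page plus a futex so cross-process waiters can block rather than spin.  The producing side is assumed to update the seqno and then wake waiters itself; nothing here is part of the series.)

#define _GNU_SOURCE
#include <stdint.h>
#include <limits.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

static int futex(uint32_t *uaddr, int op, uint32_t val)
{
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

/* Producer: publish a new sync point value and wake any waiters. */
static void sync_signal(uint32_t *seqno, uint32_t value)
{
	__atomic_store_n(seqno, value, __ATOMIC_RELEASE);
	futex(seqno, FUTEX_WAKE, INT_MAX);
}

/* Consumer: sleep until the shared seqno reaches the wanted value. */
static void sync_wait(uint32_t *seqno, uint32_t wanted)
{
	uint32_t cur;

	while ((int32_t)((cur = __atomic_load_n(seqno, __ATOMIC_ACQUIRE)) - wanted) < 0)
		futex(seqno, FUTEX_WAIT, cur);	/* wakes on signal or value change */
}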

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 6eb119e..410eedf 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2284,6 +2284,10 @@  int i915_sync_init(struct drm_i915_private *dev_priv);
 void i915_sync_fini(struct drm_i915_private *dev_priv);
 int i915_sync_create_fence_ioctl(struct drm_device *dev, void *data,
 				 struct drm_file *file);
+int i915_sync_fence_create(struct intel_engine_cs *ring,
+			   struct intel_context *ctx,
+			   u32 seqno);
+
 
 #define PIN_MAPPABLE 0x1
 #define PIN_NONBLOCK 0x2
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 60998fc..32ec599 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -32,6 +32,7 @@ 
 #include "i915_trace.h"
 #include "intel_drv.h"
 #include <linux/dma_remapping.h>
+#include "../../../staging/android/sync.h"
 
 #define  __EXEC_OBJECT_HAS_PIN (1<<31)
 #define  __EXEC_OBJECT_HAS_FENCE (1<<30)
@@ -262,6 +263,67 @@  static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
 		!obj->map_and_fenceable ||
 		obj->cache_level != I915_CACHE_NONE);
 }
+static int
+emit_sync_obj_cpu(struct drm_i915_gem_object *obj,
+		  struct drm_i915_gem_relocation_entry *reloc)
+{
+	uint32_t page_offset = offset_in_page(reloc->offset);
+	char *vaddr;
+	int ret;
+
+	ret = i915_gem_object_set_to_cpu_domain(obj, true);
+	if (ret)
+		return ret;
+
+	vaddr = kmap_atomic(i915_gem_object_get_page(obj,
+				reloc->offset >> PAGE_SHIFT));
+	*(uint32_t *)(vaddr + page_offset) = MI_STORE_DWORD_INDEX;
+	*(uint32_t *)(vaddr + page_offset + 4) =
+		I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT;
+	*(uint32_t *)(vaddr + page_offset + 8) =
+		obj->ring->outstanding_lazy_seqno;
+	*(uint32_t *)(vaddr + page_offset + 12) = MI_USER_INTERRUPT;
+
+	kunmap_atomic(vaddr);
+
+	return 0;
+}
+
+static int
+emit_sync_obj_gtt(struct drm_i915_gem_object *obj,
+		  struct drm_i915_gem_relocation_entry *reloc)
+{
+	struct drm_device *dev = obj->base.dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	uint32_t __iomem *reloc_entry;
+	void __iomem *reloc_page;
+	int ret;
+
+	ret = i915_gem_object_set_to_gtt_domain(obj, true);
+	if (ret)
+		return ret;
+
+	ret = i915_gem_object_put_fence(obj);
+	if (ret)
+		return ret;
+
+	/* Map the page containing the relocation we're going to perform.  */
+	reloc->offset += i915_gem_obj_ggtt_offset(obj);
+	reloc_page = io_mapping_map_atomic_wc(dev_priv->gtt.mappable,
+			reloc->offset & PAGE_MASK);
+
+	reloc_entry = (uint32_t __iomem *)
+		(reloc_page + offset_in_page(reloc->offset));
+	iowrite32(MI_STORE_DWORD_INDEX, reloc_entry);
+	iowrite32(I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT,
+		  reloc_entry);
+	iowrite32(obj->ring->outstanding_lazy_seqno, reloc_entry);
+	iowrite32(MI_USER_INTERRUPT, reloc_entry);
+
+	io_mapping_unmap_atomic(reloc_page);
+
+	return 0;
+}
 
 static int
 relocate_entry_cpu(struct drm_i915_gem_object *obj,
@@ -349,7 +411,8 @@  relocate_entry_gtt(struct drm_i915_gem_object *obj,
 static int
 i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 				   struct eb_vmas *eb,
-				   struct drm_i915_gem_relocation_entry *reloc)
+				   struct drm_i915_gem_relocation_entry *reloc,
+				   struct intel_context *ctx)
 {
 	struct drm_device *dev = obj->base.dev;
 	struct drm_gem_object *target_obj;
@@ -433,23 +496,39 @@  i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 	if (obj->active && in_atomic())
 		return -EFAULT;
 
-	if (use_cpu_reloc(obj))
-		ret = relocate_entry_cpu(obj, reloc, target_offset);
-	else
-		ret = relocate_entry_gtt(obj, reloc, target_offset);
+	if (reloc->write_domain & I915_GEM_DOMAIN_SYNC_OBJ) {
+		int fd;
+
+		/* get a new seqno */
+		intel_ring_begin(obj->ring, 0);
+
+		if (use_cpu_reloc(obj))
+			ret = emit_sync_obj_cpu(obj, reloc);
+		else
+			ret = emit_sync_obj_gtt(obj, reloc);
+
+		fd = i915_sync_fence_create(obj->ring, ctx,
+					    obj->ring->outstanding_lazy_seqno);
+		reloc->presumed_offset = fd;
+	} else {
+		if (use_cpu_reloc(obj))
+			ret = relocate_entry_cpu(obj, reloc, target_offset);
+		else
+			ret = relocate_entry_gtt(obj, reloc, target_offset);
+		/* and update the user's relocation entry */
+		reloc->presumed_offset = target_offset;
+	}
 
 	if (ret)
 		return ret;
 
-	/* and update the user's relocation entry */
-	reloc->presumed_offset = target_offset;
-
 	return 0;
 }
 
 static int
 i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
-				 struct eb_vmas *eb)
+				 struct eb_vmas *eb,
+				 struct intel_context *ctx)
 {
 #define N_RELOC(x) ((x) / sizeof(struct drm_i915_gem_relocation_entry))
 	struct drm_i915_gem_relocation_entry stack_reloc[N_RELOC(512)];
@@ -473,7 +552,7 @@  i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
 		do {
 			u64 offset = r->presumed_offset;
 
-			ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, r);
+			ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, r, ctx);
 			if (ret)
 				return ret;
 
@@ -496,13 +575,14 @@  i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
 static int
 i915_gem_execbuffer_relocate_vma_slow(struct i915_vma *vma,
 				      struct eb_vmas *eb,
-				      struct drm_i915_gem_relocation_entry *relocs)
+				      struct drm_i915_gem_relocation_entry *relocs,
+				      struct intel_context *ctx)
 {
 	const struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
 	int i, ret;
 
 	for (i = 0; i < entry->relocation_count; i++) {
-		ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, &relocs[i]);
+		ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, &relocs[i], ctx);
 		if (ret)
 			return ret;
 	}
@@ -511,7 +591,7 @@  i915_gem_execbuffer_relocate_vma_slow(struct i915_vma *vma,
 }
 
 static int
-i915_gem_execbuffer_relocate(struct eb_vmas *eb)
+i915_gem_execbuffer_relocate(struct eb_vmas *eb, struct intel_context *ctx)
 {
 	struct i915_vma *vma;
 	int ret = 0;
@@ -525,7 +605,7 @@  i915_gem_execbuffer_relocate(struct eb_vmas *eb)
 	 */
 	pagefault_disable();
 	list_for_each_entry(vma, &eb->vmas, exec_list) {
-		ret = i915_gem_execbuffer_relocate_vma(vma, eb);
+		ret = i915_gem_execbuffer_relocate_vma(vma, eb, ctx);
 		if (ret)
 			break;
 	}
@@ -664,6 +744,13 @@  i915_gem_execbuffer_reserve(struct intel_engine_cs *ring,
 			obj->tiling_mode != I915_TILING_NONE;
 		need_mappable = need_fence || need_reloc_mappable(vma);
 
+		/*
+		 * If we're emitting a sync obj, we always need a reloc
+		 * pass to write the seqno.
+		 */
+		if (entry->flags & EXEC_OBJECT_SYNC_OBJ)
+			*need_relocs = true;
+
 		if (need_mappable)
 			list_move(&vma->exec_list, &ordered_vmas);
 		else
@@ -734,7 +821,8 @@  i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
 				  struct drm_file *file,
 				  struct intel_engine_cs *ring,
 				  struct eb_vmas *eb,
-				  struct drm_i915_gem_exec_object2 *exec)
+				  struct drm_i915_gem_exec_object2 *exec,
+				  struct intel_context *ctx)
 {
 	struct drm_i915_gem_relocation_entry *reloc;
 	struct i915_address_space *vm;
@@ -830,7 +918,7 @@  i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
 	list_for_each_entry(vma, &eb->vmas, exec_list) {
 		int offset = vma->exec_entry - exec;
 		ret = i915_gem_execbuffer_relocate_vma_slow(vma, eb,
-							    reloc + reloc_offset[offset]);
+							    reloc + reloc_offset[offset], ctx);
 		if (ret)
 			goto err;
 	}
@@ -1340,17 +1428,18 @@  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 
 	/* Move the objects en-masse into the GTT, evicting if necessary. */
 	need_relocs = (args->flags & I915_EXEC_NO_RELOC) == 0;
+
 	ret = i915_gem_execbuffer_reserve(ring, &eb->vmas, &need_relocs);
 	if (ret)
 		goto err;
 
 	/* The objects are in their final locations, apply the relocations. */
 	if (need_relocs)
-		ret = i915_gem_execbuffer_relocate(eb);
+		ret = i915_gem_execbuffer_relocate(eb, ctx);
 	if (ret) {
 		if (ret == -EFAULT) {
 			ret = i915_gem_execbuffer_relocate_slow(dev, args, file, ring,
-								eb, exec);
+								eb, exec, ctx);
 			BUG_ON(!mutex_is_locked(&dev->struct_mutex));
 		}
 		if (ret)
diff --git a/drivers/gpu/drm/i915/i915_sync.c b/drivers/gpu/drm/i915/i915_sync.c
index 4938616..bd54fca 100644
--- a/drivers/gpu/drm/i915/i915_sync.c
+++ b/drivers/gpu/drm/i915/i915_sync.c
@@ -195,32 +195,72 @@  static struct fence_ops i915_fence_ops = {
 	.timeline_value_str =	i915_fence_timeline_value_str,
 };
 
-static struct fence *i915_fence_create(struct intel_engine_cs *ring,
-				       struct intel_context *ctx)
+static struct i915_fence *__i915_fence_create(struct intel_engine_cs *ring,
+					      struct intel_context *ctx,
+					      u32 seqno)
 {
 	struct i915_fence *fence;
-	int ret;
 
 	fence = kzalloc(sizeof(*fence), GFP_KERNEL);
 	if (!fence)
 		return NULL;
 
+	fence->ring = ring;
+	fence->ctx = ctx;
+	fence->seqno = ring->outstanding_lazy_seqno;
+	fence_init(&fence->base, &i915_fence_ops, &fence_lock, ctx->user_handle,
+		   fence->seqno);
+
+	return fence;
+}
+
+static struct fence *i915_fence_create(struct intel_engine_cs *ring,
+				       struct intel_context *ctx)
+{
+	struct i915_fence *fence;
+	int ret;
+
 	ret = ring->add_request(ring);
 	if (ret) {
 		DRM_ERROR("add_request failed\n");
-		fence_free((struct fence *)fence);
 		return NULL;
 	}
 
-	fence->ring = ring;
-	fence->ctx = ctx;
-	fence->seqno = ring->outstanding_lazy_seqno;
-	fence_init(&fence->base, &i915_fence_ops, &fence_lock, ctx->user_handle,
-		   fence->seqno);
+	fence = __i915_fence_create(ring, ctx, ring->outstanding_lazy_seqno);
 
 	return &fence->base;
 }
 
+int i915_sync_fence_create(struct intel_engine_cs *ring,
+			   struct intel_context *ctx,
+			   u32 seqno)
+{
+	struct i915_fence *fence;
+	struct sync_fence *sfence;
+	char name[64];
+	int fd = get_unused_fd_flags(O_CLOEXEC);
+
+	fence = __i915_fence_create(ring, ctx, seqno);
+	if (!fence) {
+		fd = -ENOMEM;
+		goto err;
+	}
+
+	snprintf(name, sizeof(name), "0x%08x:0x%08x",
+		 ctx->user_handle, seqno);
+	sfence = sync_fence_create_dma(name, &fence->base);
+	if (!sfence) {
+		fence_free((struct fence *)fence);
+		fd = -ENOMEM;
+		goto err;
+	}
+
+	sync_fence_install(sfence, fd);
+
+err:
+	return fd;
+}
+
 /**
  * i915_sync_create_fence_ioctl - fence creation function
  * @dev: drm device
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 65bd271..edadab2 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -585,6 +585,8 @@  struct drm_i915_gem_relocation_entry {
 #define I915_GEM_DOMAIN_VERTEX		0x00000020
 /** GTT domain - aperture and scanout */
 #define I915_GEM_DOMAIN_GTT		0x00000040
+/** Sync object - special for inline fences */
+#define I915_GEM_DOMAIN_SYNC_OBJ	0x00000080
 /** @} */
 
 struct drm_i915_gem_exec_object {
@@ -661,10 +663,11 @@  struct drm_i915_gem_exec_object2 {
 	 */
 	__u64 offset;
 
-#define EXEC_OBJECT_NEEDS_FENCE (1<<0)
-#define EXEC_OBJECT_NEEDS_GTT	(1<<1)
-#define EXEC_OBJECT_WRITE	(1<<2)
-#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_WRITE<<1)
+#define EXEC_OBJECT_NEEDS_FENCE (1<<0) /* requires fence register */
+#define EXEC_OBJECT_NEEDS_GTT	(1<<1) /* needs global GTT mapping */
+#define EXEC_OBJECT_WRITE	(1<<2) /* object will be written */
+#define EXEC_OBJECT_SYNC_OBJ	(1<<3) /* emit a sync obj instead */
+#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_SYNC_OBJ<<1)
 	__u64 flags;
 
 	__u64 rsvd1;