[RFC,v2] drm/i915: Android native sync support
diff mbox

Message ID 1422011594-22095-1-git-send-email-tvrtko.ursulin@linux.intel.com
State New, archived
Headers show

Commit Message

Tvrtko Ursulin Jan. 23, 2015, 11:13 a.m. UTC
From: Jesse Barnes <jbarnes@virtuousgeek.org>

Add Android native sync support with fences exported as file descriptors via
the execbuf ioctl (rsvd2 field is used).

This is a continuation of Jesse Barnes's previous work, squashed to arrive at
the final destination, cleaned up, with some fixes and preliminary light
testing.

GEM requests are extended with fence structures which are associated with
Android sync fences exported to user space via file descriptors. Fences which
are waited upon, and while exported to userspace, are referenced and added to
the irq_queue so they are signalled when requests are completed. There is no
overhead apart from the where fences are not requested.

Based on patches by Jesse Barnes:
   drm/i915: Android sync points for i915 v3
   drm/i915: add fences to the request struct
   drm/i915: sync fence fixes/updates

To do:
   * Extend driver data with context id / ring id (TBD).

v2:
   * Code review comments. (Chris Wilson)
   * ring->add_request() was a wrong thing to call - rebase on top of John
     Harrison's (drm/i915: Early alloc request) to ensure correct request is
     present before creating a fence.
   * Take a request reference from signalling path as well to ensure request
     sticks around while fence is on the request completion wait queue.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
Cc: John Harrison <John.C.Harrison@Intel.com>
---
 drivers/gpu/drm/i915/Kconfig               |  14 ++
 drivers/gpu/drm/i915/Makefile              |   1 +
 drivers/gpu/drm/i915/i915_drv.h            |  25 +++
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  24 +++
 drivers/gpu/drm/i915/i915_sync.c           | 241 +++++++++++++++++++++++++++++
 include/uapi/drm/i915_drm.h                |   8 +-
 6 files changed, 310 insertions(+), 3 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/i915_sync.c

Comments

Chris Wilson Jan. 23, 2015, 11:27 a.m. UTC | #1
On Fri, Jan 23, 2015 at 11:13:14AM +0000, Tvrtko Ursulin wrote:
> From: Jesse Barnes <jbarnes@virtuousgeek.org>
> 
> Add Android native sync support with fences exported as file descriptors via
> the execbuf ioctl (rsvd2 field is used).
> 
> This is a continuation of Jesse Barnes's previous work, squashed to arrive at
> the final destination, cleaned up, with some fixes and preliminary light
> testing.
> 
> GEM requests are extended with fence structures which are associated with
> Android sync fences exported to user space via file descriptors. Fences which
> are waited upon, and while exported to userspace, are referenced and added to
> the irq_queue so they are signalled when requests are completed. There is no
> overhead apart from the where fences are not requested.
> 
> Based on patches by Jesse Barnes:
>    drm/i915: Android sync points for i915 v3
>    drm/i915: add fences to the request struct
>    drm/i915: sync fence fixes/updates
> 
> To do:
>    * Extend driver data with context id / ring id (TBD).
> 
> v2:
>    * Code review comments. (Chris Wilson)
>    * ring->add_request() was a wrong thing to call - rebase on top of John
>      Harrison's (drm/i915: Early alloc request) to ensure correct request is
>      present before creating a fence.
>    * Take a request reference from signalling path as well to ensure request
>      sticks around while fence is on the request completion wait queue.

Ok, in this arrangement, attaching a fence to the execbuf is rather meh
as it is just a special cased version of attaching a fence to a bo.
-Chris
Tvrtko Ursulin Jan. 23, 2015, 2:02 p.m. UTC | #2
On 01/23/2015 11:27 AM, Chris Wilson wrote:
> On Fri, Jan 23, 2015 at 11:13:14AM +0000, Tvrtko Ursulin wrote:
>> From: Jesse Barnes <jbarnes@virtuousgeek.org>
>>
>> Add Android native sync support with fences exported as file descriptors via
>> the execbuf ioctl (rsvd2 field is used).
>>
>> This is a continuation of Jesse Barnes's previous work, squashed to arrive at
>> the final destination, cleaned up, with some fixes and preliminary light
>> testing.
>>
>> GEM requests are extended with fence structures which are associated with
>> Android sync fences exported to user space via file descriptors. Fences which
>> are waited upon, and while exported to userspace, are referenced and added to
>> the irq_queue so they are signalled when requests are completed. There is no
>> overhead apart from the where fences are not requested.
>>
>> Based on patches by Jesse Barnes:
>>     drm/i915: Android sync points for i915 v3
>>     drm/i915: add fences to the request struct
>>     drm/i915: sync fence fixes/updates
>>
>> To do:
>>     * Extend driver data with context id / ring id (TBD).
>>
>> v2:
>>     * Code review comments. (Chris Wilson)
>>     * ring->add_request() was a wrong thing to call - rebase on top of John
>>       Harrison's (drm/i915: Early alloc request) to ensure correct request is
>>       present before creating a fence.
>>     * Take a request reference from signalling path as well to ensure request
>>       sticks around while fence is on the request completion wait queue.
>
> Ok, in this arrangement, attaching a fence to the execbuf is rather meh
> as it is just a special cased version of attaching a fence to a bo.

Better meh than "no"! :D

My understanding is this is what people want, with the future input 
fence extension (scheduler).

Anyway.. v2 is broken since it unreferences requests without holding the 
mutex, more so from irq context so I need to rework that a bit.

Regards,

Tvrtko
Daniel Vetter Jan. 23, 2015, 3:53 p.m. UTC | #3
On Fri, Jan 23, 2015 at 02:02:44PM +0000, Tvrtko Ursulin wrote:
> 
> On 01/23/2015 11:27 AM, Chris Wilson wrote:
> >On Fri, Jan 23, 2015 at 11:13:14AM +0000, Tvrtko Ursulin wrote:
> >>From: Jesse Barnes <jbarnes@virtuousgeek.org>
> >>
> >>Add Android native sync support with fences exported as file descriptors via
> >>the execbuf ioctl (rsvd2 field is used).
> >>
> >>This is a continuation of Jesse Barnes's previous work, squashed to arrive at
> >>the final destination, cleaned up, with some fixes and preliminary light
> >>testing.
> >>
> >>GEM requests are extended with fence structures which are associated with
> >>Android sync fences exported to user space via file descriptors. Fences which
> >>are waited upon, and while exported to userspace, are referenced and added to
> >>the irq_queue so they are signalled when requests are completed. There is no
> >>overhead apart from the where fences are not requested.
> >>
> >>Based on patches by Jesse Barnes:
> >>    drm/i915: Android sync points for i915 v3
> >>    drm/i915: add fences to the request struct
> >>    drm/i915: sync fence fixes/updates
> >>
> >>To do:
> >>    * Extend driver data with context id / ring id (TBD).
> >>
> >>v2:
> >>    * Code review comments. (Chris Wilson)
> >>    * ring->add_request() was a wrong thing to call - rebase on top of John
> >>      Harrison's (drm/i915: Early alloc request) to ensure correct request is
> >>      present before creating a fence.
> >>    * Take a request reference from signalling path as well to ensure request
> >>      sticks around while fence is on the request completion wait queue.
> >
> >Ok, in this arrangement, attaching a fence to the execbuf is rather meh
> >as it is just a special cased version of attaching a fence to a bo.
> 
> Better meh than "no"! :D
> 
> My understanding is this is what people want, with the future input fence
> extension (scheduler).
> 
> Anyway.. v2 is broken since it unreferences requests without holding the
> mutex, more so from irq context so I need to rework that a bit.

Yeah that's kind the big behaviour difference (at least as I see it)
between explicit sync and implicit sync:
- with implicit sync the kernel attachs sync points/requests to buffers
  and userspace just asks about idle/business of buffers. Synchronization
  between different users is all handled behind userspace's back in the
  kernel.

- explicit sync attaches sync points to individual bits of work and makes
  them explicit objects userspace can get at and pass around. Userspace
  uses these separate things to inquire about when something is
  done/idle/busy and has its own mapping between explicit sync objects and
  the different pieces of memory affected by each. Synchronization between
  different clients is handled explicitly by passing sync objects around
  each time some rendering is done.

The bigger driver for explicit sync (besides "nvidia likes it sooooo much
that everyone uses it a lot") seems to be a) shitty gpu drivers without
proper bo managers (*cough*android*cough*) and svm, where there's simply
no buffer objects any more to attach sync information to.

So attaching explicit sync objects to buffers doesn't make that much
sense by default. Except when we want to mix explicit and implicit
userspace, then we need to grab sync objects out of dma-bufs and attach
random sync objects to them at the border. Iirc Maarten had dma-buf
patches for exactly this.
-Daniel
Tvrtko Ursulin Jan. 23, 2015, 4:49 p.m. UTC | #4
On 01/23/2015 03:53 PM, Daniel Vetter wrote:
> On Fri, Jan 23, 2015 at 02:02:44PM +0000, Tvrtko Ursulin wrote:
>>
>> On 01/23/2015 11:27 AM, Chris Wilson wrote:
>>> On Fri, Jan 23, 2015 at 11:13:14AM +0000, Tvrtko Ursulin wrote:
>>>> From: Jesse Barnes <jbarnes@virtuousgeek.org>
>>>>
>>>> Add Android native sync support with fences exported as file descriptors via
>>>> the execbuf ioctl (rsvd2 field is used).
>>>>
>>>> This is a continuation of Jesse Barnes's previous work, squashed to arrive at
>>>> the final destination, cleaned up, with some fixes and preliminary light
>>>> testing.
>>>>
>>>> GEM requests are extended with fence structures which are associated with
>>>> Android sync fences exported to user space via file descriptors. Fences which
>>>> are waited upon, and while exported to userspace, are referenced and added to
>>>> the irq_queue so they are signalled when requests are completed. There is no
>>>> overhead apart from the where fences are not requested.
>>>>
>>>> Based on patches by Jesse Barnes:
>>>>     drm/i915: Android sync points for i915 v3
>>>>     drm/i915: add fences to the request struct
>>>>     drm/i915: sync fence fixes/updates
>>>>
>>>> To do:
>>>>     * Extend driver data with context id / ring id (TBD).
>>>>
>>>> v2:
>>>>     * Code review comments. (Chris Wilson)
>>>>     * ring->add_request() was a wrong thing to call - rebase on top of John
>>>>       Harrison's (drm/i915: Early alloc request) to ensure correct request is
>>>>       present before creating a fence.
>>>>     * Take a request reference from signalling path as well to ensure request
>>>>       sticks around while fence is on the request completion wait queue.
>>>
>>> Ok, in this arrangement, attaching a fence to the execbuf is rather meh
>>> as it is just a special cased version of attaching a fence to a bo.
>>
>> Better meh than "no"! :D
>>
>> My understanding is this is what people want, with the future input fence
>> extension (scheduler).
>>
>> Anyway.. v2 is broken since it unreferences requests without holding the
>> mutex, more so from irq context so I need to rework that a bit.
>
> Yeah that's kind the big behaviour difference (at least as I see it)
> between explicit sync and implicit sync:
> - with implicit sync the kernel attachs sync points/requests to buffers
>    and userspace just asks about idle/business of buffers. Synchronization
>    between different users is all handled behind userspace's back in the
>    kernel.
>
> - explicit sync attaches sync points to individual bits of work and makes
>    them explicit objects userspace can get at and pass around. Userspace
>    uses these separate things to inquire about when something is
>    done/idle/busy and has its own mapping between explicit sync objects and
>    the different pieces of memory affected by each. Synchronization between
>    different clients is handled explicitly by passing sync objects around
>    each time some rendering is done.
>
> The bigger driver for explicit sync (besides "nvidia likes it sooooo much
> that everyone uses it a lot") seems to be a) shitty gpu drivers without
> proper bo managers (*cough*android*cough*) and svm, where there's simply
> no buffer objects any more to attach sync information to.
>
> So attaching explicit sync objects to buffers doesn't make that much
> sense by default. Except when we want to mix explicit and implicit
> userspace, then we need to grab sync objects out of dma-bufs and attach
> random sync objects to them at the border. Iirc Maarten had dma-buf
> patches for exactly this.

Could you please translate this into something understandable by 
newcomers? :)

In the context of Jesse's work I took over - there my impressions was 
you were mostly happy just needed converting from dedicated ioctl to 
execbuf?

Then how does future scheduling come into picture in your views above?

Submitting something with an input fence meaning "execute this work when 
this fence has been done" - when mixing and matching fences from 
different sources etc. How do you envisage that to work in implicit world?

Regards,

Tvrtko
Chris Wilson Jan. 23, 2015, 5:30 p.m. UTC | #5
On Fri, Jan 23, 2015 at 04:53:48PM +0100, Daniel Vetter wrote:
> Yeah that's kind the big behaviour difference (at least as I see it)
> between explicit sync and implicit sync:
> - with implicit sync the kernel attachs sync points/requests to buffers
>   and userspace just asks about idle/business of buffers. Synchronization
>   between different users is all handled behind userspace's back in the
>   kernel.
> 
> - explicit sync attaches sync points to individual bits of work and makes
>   them explicit objects userspace can get at and pass around. Userspace
>   uses these separate things to inquire about when something is
>   done/idle/busy and has its own mapping between explicit sync objects and
>   the different pieces of memory affected by each. Synchronization between
>   different clients is handled explicitly by passing sync objects around
>   each time some rendering is done.
> 
> The bigger driver for explicit sync (besides "nvidia likes it sooooo much
> that everyone uses it a lot") seems to be a) shitty gpu drivers without
> proper bo managers (*cough*android*cough*) and svm, where there's simply
> no buffer objects any more to attach sync information to.

Actually, mesa would really like much finer granularity than at batch
boundaries. Having a sync object for a batch boundary itself is very meh
and not a substantive improvement on what it possible today, but being
able to convert the implicit sync into an explicit fence object is
interesting and lends a layer of abstraction that could make it more
versatile. Most importantly, it allows me to defer the overhead of fence
creation until I actually want to sleep on a completion. Also Jesse
originally supporting inserting fences inside a batch, which looked
interesting if impractical.
-Chris
Daniel Vetter Jan. 24, 2015, 9:41 a.m. UTC | #6
On Fri, Jan 23, 2015 at 6:30 PM, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> On Fri, Jan 23, 2015 at 04:53:48PM +0100, Daniel Vetter wrote:
>> Yeah that's kind the big behaviour difference (at least as I see it)
>> between explicit sync and implicit sync:
>> - with implicit sync the kernel attachs sync points/requests to buffers
>>   and userspace just asks about idle/business of buffers. Synchronization
>>   between different users is all handled behind userspace's back in the
>>   kernel.
>>
>> - explicit sync attaches sync points to individual bits of work and makes
>>   them explicit objects userspace can get at and pass around. Userspace
>>   uses these separate things to inquire about when something is
>>   done/idle/busy and has its own mapping between explicit sync objects and
>>   the different pieces of memory affected by each. Synchronization between
>>   different clients is handled explicitly by passing sync objects around
>>   each time some rendering is done.
>>
>> The bigger driver for explicit sync (besides "nvidia likes it sooooo much
>> that everyone uses it a lot") seems to be a) shitty gpu drivers without
>> proper bo managers (*cough*android*cough*) and svm, where there's simply
>> no buffer objects any more to attach sync information to.
>
> Actually, mesa would really like much finer granularity than at batch
> boundaries. Having a sync object for a batch boundary itself is very meh
> and not a substantive improvement on what it possible today, but being
> able to convert the implicit sync into an explicit fence object is
> interesting and lends a layer of abstraction that could make it more
> versatile. Most importantly, it allows me to defer the overhead of fence
> creation until I actually want to sleep on a completion. Also Jesse
> originally supporting inserting fences inside a batch, which looked
> interesting if impractical.

If want to allow the kernel to stall on fences (in e.g. the scheduler)
only the kernel should be allowed to create fences imo. At least
current fences assume that they _will_ signal eventually, and for i915
fences we have the hangcheck to ensure this is the case. In-batch
fences and lazy fence creation (beyond just delaying the fd allocation
to avoid too many fds flying around) is therefore a no-go.

For that kind of fine-grained sync between gpu and cpu workloads the
solutions thus far (at least what I've seen) is just busy-looping.
Usually those workloads have a few order more sync pionts than frames
we tend to render, so blocking isn't terrible efficient anyway.
-Daniel
Daniel Vetter Jan. 24, 2015, 9:47 a.m. UTC | #7
On Fri, Jan 23, 2015 at 5:49 PM, Tvrtko Ursulin
<tvrtko.ursulin@linux.intel.com> wrote:
> Could you please translate this into something understandable by newcomers?
> :)

I don't know which parts are confusing without questions so please ask
them ... the questions below about scheduler interactions seem fairly
advanced ;-)

> In the context of Jesse's work I took over - there my impressions was you
> were mostly happy just needed converting from dedicated ioctl to execbuf?

Yeah that's all we need to get started.

> Then how does future scheduling come into picture in your views above?

Scheduler just takes a pile of fences from the execbuf code and
doesn't care that much whether they come from implicit or explicit
syncing modes. Which means we'll always combine all the fences (both
implicit and explicit) when figuring out depencies.

> Submitting something with an input fence meaning "execute this work when
> this fence has been done" - when mixing and matching fences from different
> sources etc. How do you envisage that to work in implicit world?

execbuf always needs to obey the implicit fences since if it doesn't
our shrinker will start to unmap objects which are still in use. Not
good.

So if you want to do an execbuf with _only_ explicit syncing then you
need to lie about the write domains in your relocations: The kernel
allows parallel reads nowadays, so if you never set a write domain
there will be no implicit syncing (except with the shrinker, and as
explained that is mandatory anyway). Only write domains create
implicit sync depencies between batches, so if there's none you can
control all the depencies between batches with explicit fences.
-Daniel
Chris Wilson Jan. 24, 2015, 4:08 p.m. UTC | #8
On Sat, Jan 24, 2015 at 10:41:46AM +0100, Daniel Vetter wrote:
> On Fri, Jan 23, 2015 at 6:30 PM, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > On Fri, Jan 23, 2015 at 04:53:48PM +0100, Daniel Vetter wrote:
> >> Yeah that's kind the big behaviour difference (at least as I see it)
> >> between explicit sync and implicit sync:
> >> - with implicit sync the kernel attachs sync points/requests to buffers
> >>   and userspace just asks about idle/business of buffers. Synchronization
> >>   between different users is all handled behind userspace's back in the
> >>   kernel.
> >>
> >> - explicit sync attaches sync points to individual bits of work and makes
> >>   them explicit objects userspace can get at and pass around. Userspace
> >>   uses these separate things to inquire about when something is
> >>   done/idle/busy and has its own mapping between explicit sync objects and
> >>   the different pieces of memory affected by each. Synchronization between
> >>   different clients is handled explicitly by passing sync objects around
> >>   each time some rendering is done.
> >>
> >> The bigger driver for explicit sync (besides "nvidia likes it sooooo much
> >> that everyone uses it a lot") seems to be a) shitty gpu drivers without
> >> proper bo managers (*cough*android*cough*) and svm, where there's simply
> >> no buffer objects any more to attach sync information to.
> >
> > Actually, mesa would really like much finer granularity than at batch
> > boundaries. Having a sync object for a batch boundary itself is very meh
> > and not a substantive improvement on what it possible today, but being
> > able to convert the implicit sync into an explicit fence object is
> > interesting and lends a layer of abstraction that could make it more
> > versatile. Most importantly, it allows me to defer the overhead of fence
> > creation until I actually want to sleep on a completion. Also Jesse
> > originally supporting inserting fences inside a batch, which looked
> > interesting if impractical.
> 
> If want to allow the kernel to stall on fences (in e.g. the scheduler)
> only the kernel should be allowed to create fences imo. At least
> current fences assume that they _will_ signal eventually, and for i915
> fences we have the hangcheck to ensure this is the case. In-batch
> fences and lazy fence creation (beyond just delaying the fd allocation
> to avoid too many fds flying around) is therefore a no-go.

Lazy fence creation (i.e. attaching a fence to a buffer) just means
creating the fd for an existing request (which is derived from the
fence). Or if the buffer is read or write idle, then you just create the
fence as already-signaled. And yes, it is just to avoid the death by a
thousand file descriptors and especially creating one every batch.

> For that kind of fine-grained sync between gpu and cpu workloads the
> solutions thus far (at least what I've seen) is just busy-looping.
> Usually those workloads have a few order more sync pionts than frames
> we tend to render, so blocking isn't terrible efficient anyway.

Heck, I think a full-fledged fence fd per batch is still more overhead
than I want.
-Chris
Daniel Vetter Jan. 26, 2015, 7:52 a.m. UTC | #9
On Sat, Jan 24, 2015 at 04:08:32PM +0000, Chris Wilson wrote:
> On Sat, Jan 24, 2015 at 10:41:46AM +0100, Daniel Vetter wrote:
> > On Fri, Jan 23, 2015 at 6:30 PM, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > > On Fri, Jan 23, 2015 at 04:53:48PM +0100, Daniel Vetter wrote:
> > >> Yeah that's kind the big behaviour difference (at least as I see it)
> > >> between explicit sync and implicit sync:
> > >> - with implicit sync the kernel attachs sync points/requests to buffers
> > >>   and userspace just asks about idle/business of buffers. Synchronization
> > >>   between different users is all handled behind userspace's back in the
> > >>   kernel.
> > >>
> > >> - explicit sync attaches sync points to individual bits of work and makes
> > >>   them explicit objects userspace can get at and pass around. Userspace
> > >>   uses these separate things to inquire about when something is
> > >>   done/idle/busy and has its own mapping between explicit sync objects and
> > >>   the different pieces of memory affected by each. Synchronization between
> > >>   different clients is handled explicitly by passing sync objects around
> > >>   each time some rendering is done.
> > >>
> > >> The bigger driver for explicit sync (besides "nvidia likes it sooooo much
> > >> that everyone uses it a lot") seems to be a) shitty gpu drivers without
> > >> proper bo managers (*cough*android*cough*) and svm, where there's simply
> > >> no buffer objects any more to attach sync information to.
> > >
> > > Actually, mesa would really like much finer granularity than at batch
> > > boundaries. Having a sync object for a batch boundary itself is very meh
> > > and not a substantive improvement on what it possible today, but being
> > > able to convert the implicit sync into an explicit fence object is
> > > interesting and lends a layer of abstraction that could make it more
> > > versatile. Most importantly, it allows me to defer the overhead of fence
> > > creation until I actually want to sleep on a completion. Also Jesse
> > > originally supporting inserting fences inside a batch, which looked
> > > interesting if impractical.
> > 
> > If want to allow the kernel to stall on fences (in e.g. the scheduler)
> > only the kernel should be allowed to create fences imo. At least
> > current fences assume that they _will_ signal eventually, and for i915
> > fences we have the hangcheck to ensure this is the case. In-batch
> > fences and lazy fence creation (beyond just delaying the fd allocation
> > to avoid too many fds flying around) is therefore a no-go.
> 
> Lazy fence creation (i.e. attaching a fence to a buffer) just means
> creating the fd for an existing request (which is derived from the
> fence). Or if the buffer is read or write idle, then you just create the
> fence as already-signaled. And yes, it is just to avoid the death by a
> thousand file descriptors and especially creating one every batch.

I think the problem will be platforms that want full explicit fence (like
android) but allow delayed creation of the fence fd from a gl sync object
(like the android egl extension allows).

I'm not sure yet how to best expose that really since just creating a
fence from the implicit request attached to the batch might upset the
interface purists with the mix in implicit and explicit fencing ;-) Hence
why I think for now we should just do the eager fd creation at execbuf
until ppl scream (well maybe not merge this patch until ppl scream ...).

> > For that kind of fine-grained sync between gpu and cpu workloads the
> > solutions thus far (at least what I've seen) is just busy-looping.
> > Usually those workloads have a few order more sync pionts than frames
> > we tend to render, so blocking isn't terrible efficient anyway.
> 
> Heck, I think a full-fledged fence fd per batch is still more overhead
> than I want.

One idea that crossed my mind is to expose the 2nd interrupt source to
userspace somehow (we have pipe_control/mi_flush_dw and
mi_user_interrupt). Then we could use that, maybe with some
wakeupfiltering to allow userspace to block a bit more efficient.

But my gut feel still says that most likely a bit of busy-looping won't
hurt in such a case with very fine-grained synchronization. There should
be a full blocking kernel request nearby. And often the check is for "busy
or not" only anyway, and that can already be done with seqno writes from
batchbuffers to a per-ctx bo that userspace manages.
-Daniel
Chris Wilson Jan. 26, 2015, 9:08 a.m. UTC | #10
On Mon, Jan 26, 2015 at 08:52:39AM +0100, Daniel Vetter wrote:
> I think the problem will be platforms that want full explicit fence (like
> android) but allow delayed creation of the fence fd from a gl sync object
> (like the android egl extension allows).
> 
> I'm not sure yet how to best expose that really since just creating a
> fence from the implicit request attached to the batch might upset the
> interface purists with the mix in implicit and explicit fencing ;-) Hence
> why I think for now we should just do the eager fd creation at execbuf
> until ppl scream (well maybe not merge this patch until ppl scream ...).

Everything we do is buffer centric. Even in the future with random bits
of memory, we will still use buffers behind the scenes. From an
interface perspective, it is clearer to me if we say "give me a fence for
this buffer". Exactly the same way as we say "is this buffer busy" or
"wait on this buffer". The change is that we now hand back an fd to slot
into an event loop. That, to me, is a much smaller evolutionary step of
the existing API, and yet more versatile than just attaching one to the
execbuf.
-Chris
Tvrtko Ursulin Jan. 26, 2015, 11:08 a.m. UTC | #11
On 01/24/2015 09:47 AM, Daniel Vetter wrote:
> On Fri, Jan 23, 2015 at 5:49 PM, Tvrtko Ursulin
> <tvrtko.ursulin@linux.intel.com> wrote:
>> Could you please translate this into something understandable by newcomers?
>> :)
>
> I don't know which parts are confusing without questions so please ask
> them ... the questions below about scheduler interactions seem fairly
> advanced ;-)

I suppose for starters, when I took over this work I read the last 
thread from December.  There you were saying to Jesse to convert the 
separate ioctl into execbuf interface for in & out fences.

That's what I did I thought. So is that not the fashion of the day any 
more or I misunderstood something?

People will scream very soon for something along these lines anyway 
since it is kind of packaged with the scheduler I am told and both will 
be very much desired soon.

Idea is to allow submitting of work and not block in userspace rather 
let the scheduler queue and shuffle depending on fences.

Regards,

Tvrtko
Daniel Vetter Jan. 28, 2015, 9:20 a.m. UTC | #12
On Mon, Jan 26, 2015 at 11:08:43AM +0000, Tvrtko Ursulin wrote:
> 
> On 01/24/2015 09:47 AM, Daniel Vetter wrote:
> >On Fri, Jan 23, 2015 at 5:49 PM, Tvrtko Ursulin
> ><tvrtko.ursulin@linux.intel.com> wrote:
> >>Could you please translate this into something understandable by newcomers?
> >>:)
> >
> >I don't know which parts are confusing without questions so please ask
> >them ... the questions below about scheduler interactions seem fairly
> >advanced ;-)
> 
> I suppose for starters, when I took over this work I read the last thread
> from December.  There you were saying to Jesse to convert the separate ioctl
> into execbuf interface for in & out fences.
> 
> That's what I did I thought. So is that not the fashion of the day any more
> or I misunderstood something?
> 
> People will scream very soon for something along these lines anyway since it
> is kind of packaged with the scheduler I am told and both will be very much
> desired soon.

That's still the proper fashion imo and what I've thought I've explained
...

About the screaming: The real blocker here is destaging android fences.
I've been telling this to everyone for a while already, but can't hurt to
reinforce ;-)

> Idea is to allow submitting of work and not block in userspace rather let
> the scheduler queue and shuffle depending on fences.

Well that's also possible without explicit fences, but not if your
userspace assumes explicit fences exist ofc.
-Daniel
Daniel Vetter Jan. 28, 2015, 9:22 a.m. UTC | #13
On Mon, Jan 26, 2015 at 09:08:03AM +0000, Chris Wilson wrote:
> On Mon, Jan 26, 2015 at 08:52:39AM +0100, Daniel Vetter wrote:
> > I think the problem will be platforms that want full explicit fence (like
> > android) but allow delayed creation of the fence fd from a gl sync object
> > (like the android egl extension allows).
> > 
> > I'm not sure yet how to best expose that really since just creating a
> > fence from the implicit request attached to the batch might upset the
> > interface purists with the mix in implicit and explicit fencing ;-) Hence
> > why I think for now we should just do the eager fd creation at execbuf
> > until ppl scream (well maybe not merge this patch until ppl scream ...).
> 
> Everything we do is buffer centric. Even in the future with random bits
> of memory, we will still use buffers behind the scenes. From an
> interface perspective, it is clearer to me if we say "give me a fence for
> this buffer". Exactly the same way as we say "is this buffer busy" or
> "wait on this buffer". The change is that we now hand back an fd to slot
> into an event loop. That, to me, is a much smaller evolutionary step of
> the existing API, and yet more versatile than just attaching one to the
> execbuf.

The problem is that big parts of the world do not subscribe to that buffer
centric view of gfx. Imo since those parts will be the primary users of
this interface we should try to fit their ideas as much as feasible. Later
on (if we need it) we can add some glue to tie in the buffer-centric
implicit model with the explicit model.
-Daniel
Chris Wilson Jan. 28, 2015, 9:23 a.m. UTC | #14
On Wed, Jan 28, 2015 at 10:22:15AM +0100, Daniel Vetter wrote:
> On Mon, Jan 26, 2015 at 09:08:03AM +0000, Chris Wilson wrote:
> > On Mon, Jan 26, 2015 at 08:52:39AM +0100, Daniel Vetter wrote:
> > > I think the problem will be platforms that want full explicit fence (like
> > > android) but allow delayed creation of the fence fd from a gl sync object
> > > (like the android egl extension allows).
> > > 
> > > I'm not sure yet how to best expose that really since just creating a
> > > fence from the implicit request attached to the batch might upset the
> > > interface purists with the mix in implicit and explicit fencing ;-) Hence
> > > why I think for now we should just do the eager fd creation at execbuf
> > > until ppl scream (well maybe not merge this patch until ppl scream ...).
> > 
> > Everything we do is buffer centric. Even in the future with random bits
> > of memory, we will still use buffers behind the scenes. From an
> > interface perspective, it is clearer to me if we say "give me a fence for
> > this buffer". Exactly the same way as we say "is this buffer busy" or
> > "wait on this buffer". The change is that we now hand back an fd to slot
> > into an event loop. That, to me, is a much smaller evolutionary step of
> > the existing API, and yet more versatile than just attaching one to the
> > execbuf.
> 
> The problem is that big parts of the world do not subscribe to that buffer
> centric view of gfx. Imo since those parts will be the primary users of
> this interface we should try to fit their ideas as much as feasible. Later
> on (if we need it) we can add some glue to tie in the buffer-centric
> implicit model with the explicit model.

They won't be using execbuffer either. So what's your point?
-Chris
Daniel Vetter Jan. 28, 2015, 9:50 a.m. UTC | #15
On Wed, Jan 28, 2015 at 09:23:46AM +0000, Chris Wilson wrote:
> On Wed, Jan 28, 2015 at 10:22:15AM +0100, Daniel Vetter wrote:
> > On Mon, Jan 26, 2015 at 09:08:03AM +0000, Chris Wilson wrote:
> > > On Mon, Jan 26, 2015 at 08:52:39AM +0100, Daniel Vetter wrote:
> > > > I think the problem will be platforms that want full explicit fence (like
> > > > android) but allow delayed creation of the fence fd from a gl sync object
> > > > (like the android egl extension allows).
> > > > 
> > > > I'm not sure yet how to best expose that really since just creating a
> > > > fence from the implicit request attached to the batch might upset the
> > > > interface purists with the mix in implicit and explicit fencing ;-) Hence
> > > > why I think for now we should just do the eager fd creation at execbuf
> > > > until ppl scream (well maybe not merge this patch until ppl scream ...).
> > > 
> > > Everything we do is buffer centric. Even in the future with random bits
> > > of memory, we will still use buffers behind the scenes. From an
> > > interface perspective, it is clearer to me if we say "give me a fence for
> > > this buffer". Exactly the same way as we say "is this buffer busy" or
> > > "wait on this buffer". The change is that we now hand back an fd to slot
> > > into an event loop. That, to me, is a much smaller evolutionary step of
> > > the existing API, and yet more versatile than just attaching one to the
> > > execbuf.
> > 
> > The problem is that big parts of the world do not subscribe to that buffer
> > centric view of gfx. Imo since those parts will be the primary users of
> > this interface we should try to fit their ideas as much as feasible. Later
> > on (if we need it) we can add some glue to tie in the buffer-centric
> > implicit model with the explicit model.
> 
> They won't be using execbuffer either. So what's your point?

Android very much will user execbuffer. And even the in-flight buffered
svm stuff will keep on using execbuf (just without any relocs).

And once we indeed can make the case (plus have the hw) for direct
userspace submission I think the kernel shouldn't be in the business of
creating fence objects: The ring will be fully under control of
userspace, and that's the only place we could insert a seqno into. So
again the same trust issues.
-Daniel
Chris Wilson Jan. 28, 2015, 10:07 a.m. UTC | #16
On Wed, Jan 28, 2015 at 10:50:18AM +0100, Daniel Vetter wrote:
> On Wed, Jan 28, 2015 at 09:23:46AM +0000, Chris Wilson wrote:
> > On Wed, Jan 28, 2015 at 10:22:15AM +0100, Daniel Vetter wrote:
> > > On Mon, Jan 26, 2015 at 09:08:03AM +0000, Chris Wilson wrote:
> > > > On Mon, Jan 26, 2015 at 08:52:39AM +0100, Daniel Vetter wrote:
> > > > > I think the problem will be platforms that want full explicit fence (like
> > > > > android) but allow delayed creation of the fence fd from a gl sync object
> > > > > (like the android egl extension allows).
> > > > > 
> > > > > I'm not sure yet how to best expose that really since just creating a
> > > > > fence from the implicit request attached to the batch might upset the
> > > > > interface purists with the mix in implicit and explicit fencing ;-) Hence
> > > > > why I think for now we should just do the eager fd creation at execbuf
> > > > > until ppl scream (well maybe not merge this patch until ppl scream ...).
> > > > 
> > > > Everything we do is buffer centric. Even in the future with random bits
> > > > of memory, we will still use buffers behind the scenes. From an
> > > > interface perspective, it is clearer to me if we say "give me a fence for
> > > > this buffer". Exactly the same way as we say "is this buffer busy" or
> > > > "wait on this buffer". The change is that we now hand back an fd to slot
> > > > into an event loop. That, to me, is a much smaller evolutionary step of
> > > > the existing API, and yet more versatile than just attaching one to the
> > > > execbuf.
> > > 
> > > The problem is that big parts of the world do not subscribe to that buffer
> > > centric view of gfx. Imo since those parts will be the primary users of
> > > this interface we should try to fit their ideas as much as feasible. Later
> > > on (if we need it) we can add some glue to tie in the buffer-centric
> > > implicit model with the explicit model.
> > 
> > They won't be using execbuffer either. So what's your point?
> 
> Android very much will user execbuffer. And even the in-flight buffered
> svm stuff will keep on using execbuf (just without any relocs).

So still buffer-centric and would benefit from the more general and more
explict fencing rather than just execbuf. If you look at the throttling
in mesa, you can already see a place that would rather create a fence on
a buffer rather than its very approximate execbuf tracking.
 
> And once we indeed can make the case (plus have the hw) for direct
> userspace submission I think the kernel shouldn't be in the business of
> creating fence objects: The ring will be fully under control of
> userspace, and that's the only place we could insert a seqno into. So
> again the same trust issues.

No buffers, no requests, nothing for the kernel to do. No impact on
designing an API that is useful today.
-Chris
Jesse Barnes Feb. 25, 2015, 8:46 p.m. UTC | #17
On 01/28/2015 02:07 AM, Chris Wilson wrote:
> On Wed, Jan 28, 2015 at 10:50:18AM +0100, Daniel Vetter wrote:
>> On Wed, Jan 28, 2015 at 09:23:46AM +0000, Chris Wilson wrote:
>>> On Wed, Jan 28, 2015 at 10:22:15AM +0100, Daniel Vetter wrote:
>>>> On Mon, Jan 26, 2015 at 09:08:03AM +0000, Chris Wilson wrote:
>>>>> On Mon, Jan 26, 2015 at 08:52:39AM +0100, Daniel Vetter wrote:
>>>>>> I think the problem will be platforms that want full explicit fence (like
>>>>>> android) but allow delayed creation of the fence fd from a gl sync object
>>>>>> (like the android egl extension allows).
>>>>>>
>>>>>> I'm not sure yet how to best expose that really since just creating a
>>>>>> fence from the implicit request attached to the batch might upset the
>>>>>> interface purists with the mix in implicit and explicit fencing ;-) Hence
>>>>>> why I think for now we should just do the eager fd creation at execbuf
>>>>>> until ppl scream (well maybe not merge this patch until ppl scream ...).
>>>>>
>>>>> Everything we do is buffer centric. Even in the future with random bits
>>>>> of memory, we will still use buffers behind the scenes. From an
>>>>> interface perspective, it is clearer to me if we say "give me a fence for
>>>>> this buffer". Exactly the same way as we say "is this buffer busy" or
>>>>> "wait on this buffer". The change is that we now hand back an fd to slot
>>>>> into an event loop. That, to me, is a much smaller evolutionary step of
>>>>> the existing API, and yet more versatile than just attaching one to the
>>>>> execbuf.
>>>>
>>>> The problem is that big parts of the world do not subscribe to that buffer
>>>> centric view of gfx. Imo since those parts will be the primary users of
>>>> this interface we should try to fit their ideas as much as feasible. Later
>>>> on (if we need it) we can add some glue to tie in the buffer-centric
>>>> implicit model with the explicit model.
>>>
>>> They won't be using execbuffer either. So what's your point?
>>
>> Android very much will user execbuffer. And even the in-flight buffered
>> svm stuff will keep on using execbuf (just without any relocs).
> 
> So still buffer-centric and would benefit from the more general and more
> explict fencing rather than just execbuf. If you look at the throttling
> in mesa, you can already see a place that would rather create a fence on
> a buffer rather than its very approximate execbuf tracking.
>  
>> And once we indeed can make the case (plus have the hw) for direct
>> userspace submission I think the kernel shouldn't be in the business of
>> creating fence objects: The ring will be fully under control of
>> userspace, and that's the only place we could insert a seqno into. So
>> again the same trust issues.
> 
> No buffers, no requests, nothing for the kernel to do. No impact on
> designing an API that is useful today.

If Mesa really wants this, we should investigate intra-batch fences
again, both with and without buffer tracking (because afaik Mesa wants
bufferless SVM too).

But you said you think an fd is too heavyweight even?  What do you mean
by that?  Or were you just preferring re-use of an existing object (i.e.
the buffer) that we already track & pass around?

Jesse
Chris Wilson Feb. 26, 2015, 9:13 a.m. UTC | #18
On Wed, Feb 25, 2015 at 12:46:31PM -0800, Jesse Barnes wrote:
> On 01/28/2015 02:07 AM, Chris Wilson wrote:
> > On Wed, Jan 28, 2015 at 10:50:18AM +0100, Daniel Vetter wrote:
> >> On Wed, Jan 28, 2015 at 09:23:46AM +0000, Chris Wilson wrote:
> >>> On Wed, Jan 28, 2015 at 10:22:15AM +0100, Daniel Vetter wrote:
> >>>> On Mon, Jan 26, 2015 at 09:08:03AM +0000, Chris Wilson wrote:
> >>>>> On Mon, Jan 26, 2015 at 08:52:39AM +0100, Daniel Vetter wrote:
> >>>>>> I think the problem will be platforms that want full explicit fence (like
> >>>>>> android) but allow delayed creation of the fence fd from a gl sync object
> >>>>>> (like the android egl extension allows).
> >>>>>>
> >>>>>> I'm not sure yet how to best expose that really since just creating a
> >>>>>> fence from the implicit request attached to the batch might upset the
> >>>>>> interface purists with the mix in implicit and explicit fencing ;-) Hence
> >>>>>> why I think for now we should just do the eager fd creation at execbuf
> >>>>>> until ppl scream (well maybe not merge this patch until ppl scream ...).
> >>>>>
> >>>>> Everything we do is buffer centric. Even in the future with random bits
> >>>>> of memory, we will still use buffers behind the scenes. From an
> >>>>> interface perspective, it is clearer to me if we say "give me a fence for
> >>>>> this buffer". Exactly the same way as we say "is this buffer busy" or
> >>>>> "wait on this buffer". The change is that we now hand back an fd to slot
> >>>>> into an event loop. That, to me, is a much smaller evolutionary step of
> >>>>> the existing API, and yet more versatile than just attaching one to the
> >>>>> execbuf.
> >>>>
> >>>> The problem is that big parts of the world do not subscribe to that buffer
> >>>> centric view of gfx. Imo since those parts will be the primary users of
> >>>> this interface we should try to fit their ideas as much as feasible. Later
> >>>> on (if we need it) we can add some glue to tie in the buffer-centric
> >>>> implicit model with the explicit model.
> >>>
> >>> They won't be using execbuffer either. So what's your point?
> >>
> >> Android very much will user execbuffer. And even the in-flight buffered
> >> svm stuff will keep on using execbuf (just without any relocs).
> > 
> > So still buffer-centric and would benefit from the more general and more
> > explict fencing rather than just execbuf. If you look at the throttling
> > in mesa, you can already see a place that would rather create a fence on
> > a buffer rather than its very approximate execbuf tracking.
> >  
> >> And once we indeed can make the case (plus have the hw) for direct
> >> userspace submission I think the kernel shouldn't be in the business of
> >> creating fence objects: The ring will be fully under control of
> >> userspace, and that's the only place we could insert a seqno into. So
> >> again the same trust issues.
> > 
> > No buffers, no requests, nothing for the kernel to do. No impact on
> > designing an API that is useful today.
> 
> If Mesa really wants this, we should investigate intra-batch fences
> again, both with and without buffer tracking (because afaik Mesa wants
> bufferless SVM too).
> 
> But you said you think an fd is too heavyweight even?  What do you mean
> by that?  Or were you just preferring re-use of an existing object (i.e.
> the buffer) that we already track & pass around?

Mostly it is burn from X using select() and so we see fd handling very
high on the profiles when all X has to do is flip.

However, we can and do have up to several thousand batches in flight,
and many more pending retirement from userspace. That is a scary
prospect if I wanted to replace the signalling on the buffer with
individual fences, both from the scaling issue and running into resource
limits (i.e. back to the reason why our bo are currently fd-less).
-Chris

Patch
diff mbox

diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
index 4e39ab3..abafe68 100644
--- a/drivers/gpu/drm/i915/Kconfig
+++ b/drivers/gpu/drm/i915/Kconfig
@@ -43,6 +43,20 @@  config DRM_I915_KMS
 
 	  If in doubt, say "Y".
 
+config DRM_I915_SYNC
+	bool "Enable explicit sync support"
+	depends on DRM_I915
+	default y if STAGING
+	select STAGING
+	select ANDROID
+	select SYNC
+	help
+	  Choose this option to enable Android native sync support and the
+	  corresponding i915 driver code to expose it.  Slightly increases
+	  driver size and pulls in sync support from staging.
+
+	  If in doubt, say "Y".
+
 config DRM_I915_FBDEV
 	bool "Enable legacy fbdev support for the modesetting intel driver"
 	depends on DRM_I915
diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 16e3dc3..bcff481 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -58,6 +58,7 @@  i915-y += intel_audio.o \
 	  intel_sprite.o
 i915-$(CONFIG_ACPI)		+= intel_acpi.o intel_opregion.o
 i915-$(CONFIG_DRM_I915_FBDEV)	+= intel_fbdev.o
+i915-$(CONFIG_DRM_I915_SYNC)	+= i915_sync.o
 
 # modesetting output/encoder code
 i915-y += dvo_ch7017.o \
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 023f422..75729ab 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -49,6 +49,7 @@ 
 #include <linux/intel-iommu.h>
 #include <linux/kref.h>
 #include <linux/pm_qos.h>
+#include <linux/fence.h>
 
 /* General customization:
  */
@@ -2092,6 +2093,13 @@  struct drm_i915_gem_request {
 	struct list_head client_list;
 
 	uint32_t uniq;
+
+#ifdef CONFIG_DRM_I915_SYNC
+	/* core fence obj for this request, may be exported */
+	struct fence fence;
+
+	wait_queue_t wait;
+#endif
 };
 
 void i915_gem_request_free(struct kref *req_ref);
@@ -2583,6 +2591,23 @@  void i915_init_vm(struct drm_i915_private *dev_priv,
 void i915_gem_free_object(struct drm_gem_object *obj);
 void i915_gem_vma_destroy(struct i915_vma *vma);
 
+/* i915_sync.c */
+struct sync_fence;
+
+#ifdef CONFIG_DRM_I915_SYNC
+int i915_create_sync_fence_ring(struct intel_engine_cs *ring,
+				struct intel_context *ctx,
+				struct sync_fence **sync_fence, int *fence_fd);
+#else
+static inline
+int i915_create_sync_fence_ring(struct intel_engine_cs *ring,
+				struct intel_context *ctx,
+				struct sync_fence **sync_fence, int *fence_fd)
+{
+	return -ENODEV;
+}
+#endif
+
 #define PIN_MAPPABLE 0x1
 #define PIN_NONBLOCK 0x2
 #define PIN_GLOBAL 0x4
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 2a3d1a9..b04157a 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -32,6 +32,7 @@ 
 #include "i915_trace.h"
 #include "intel_drv.h"
 #include <linux/dma_remapping.h>
+#include "../../../staging/android/sync.h"
 
 #define  __EXEC_OBJECT_HAS_PIN (1<<31)
 #define  __EXEC_OBJECT_HAS_FENCE (1<<30)
@@ -1356,6 +1357,8 @@  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	u32 flags;
 	int ret;
 	bool need_relocs;
+	struct sync_fence *sync_fence = NULL;
+	int fence_fd = -1;
 
 	if (!i915_gem_check_execbuffer(args))
 		return -EINVAL;
@@ -1509,8 +1512,29 @@  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	if (ret)
 		goto req_err;
 
+	if (args->flags & I915_EXEC_FENCE_OUT) {
+		ret = i915_create_sync_fence_ring(ring, ctx,
+						  &sync_fence, &fence_fd);
+		if (ret)
+			goto req_err;
+	}
+
 	ret = dev_priv->gt.do_execbuf(dev, file, ring, ctx, args,
 				      &eb->vmas, batch_obj, exec_start, flags);
+	if (ret)
+		goto exec_err;
+
+	if (sync_fence) {
+		sync_fence_install(sync_fence, fence_fd);
+		args->rsvd2 = fence_fd;
+		sync_fence = NULL;
+	}
+
+exec_err:
+	if (sync_fence) {
+		fput(sync_fence->file);
+		put_unused_fd(fence_fd);
+	}
 
 req_err:
 	/*
diff --git a/drivers/gpu/drm/i915/i915_sync.c b/drivers/gpu/drm/i915/i915_sync.c
new file mode 100644
index 0000000..27c0c15
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_sync.c
@@ -0,0 +1,241 @@ 
+/*
+ * Copyright © 2013-2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Jesse Barnes <jbarnes@virtuousgeek.org>
+ *    Tvrtko Ursulin <tvrtko.ursulin@intel.com>
+ *
+ */
+
+#include <drm/drmP.h>
+#include <drm/drm_vma_manager.h>
+#include <drm/i915_drm.h>
+#include "i915_drv.h"
+#include "i915_trace.h"
+#include "intel_drv.h"
+#include <linux/oom.h>
+#include <linux/shmem_fs.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/pci.h>
+#include <linux/dma-buf.h>
+#include "../../../staging/android/sync.h"
+
+static DEFINE_SPINLOCK(fence_lock);
+
+/*
+ * i915 Android native sync fences.
+ *
+ * We implement sync points in terms of i915 seqnos. They're exposed through
+ * the DRM_IOCTL_I915_GEM_EXECBUFFER2 ioctl, and can be mixed and matched
+ * with other Android timelines and aggregated into sync_fences, etc.
+ *
+ * TODO:
+ *   * Display engine fences.
+ *   * Extend driver data with context id / ring id.
+ */
+
+#define to_i915_request(x) container_of(x, struct drm_i915_gem_request, fence)
+
+static const char *i915_fence_get_driver_name(struct fence *fence)
+{
+	return "i915";
+}
+
+static const char *i915_fence_ring_get_timeline_name(struct fence *fence)
+{
+	struct drm_i915_gem_request *req = to_i915_request(fence);
+
+	return req->ring->name;
+}
+
+static int
+i915_fence_ring_check(wait_queue_t *wait, unsigned mode, int flags, void *key)
+{
+	struct drm_i915_gem_request *req = wait->private;
+	struct intel_engine_cs *ring = req->ring;
+
+	if (!i915_gem_request_completed(req, false))
+		return 0;
+
+	fence_signal(&req->fence);
+
+	__remove_wait_queue(&ring->irq_queue, wait);
+	i915_gem_request_unreference(req);
+	ring->irq_put(ring);
+
+	return 0;
+}
+
+static bool i915_fence_ring_enable_signaling(struct fence *fence)
+{
+	struct drm_i915_gem_request *req = to_i915_request(fence);
+	struct intel_engine_cs *ring = req->ring;
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+	wait_queue_t *wait = &req->wait;
+
+	/* queue fence wait queue on irq queue and get fence */
+	if (i915_gem_request_completed(req, false) ||
+	    i915_terminally_wedged(&dev_priv->gpu_error))
+		return false;
+
+	if (!ring->irq_get(ring))
+		return false;
+
+	if (i915_gem_request_completed(req, false)) {
+		ring->irq_put(ring);
+		return true;
+	}
+
+	wait->flags = 0;
+	wait->private = req;
+	wait->func = i915_fence_ring_check;
+
+	i915_gem_request_reference(req);
+
+	__add_wait_queue(&ring->irq_queue, wait);
+
+	return true;
+}
+
+static bool i915_fence_ring_signaled(struct fence *fence)
+{
+	struct drm_i915_gem_request *req = to_i915_request(fence);
+
+	return i915_gem_request_completed(req, false);
+}
+
+static signed long
+i915_fence_ring_wait(struct fence *fence, bool intr, signed long timeout_js)
+{
+	struct drm_i915_gem_request *req = to_i915_request(fence);
+	struct drm_i915_private *dev_priv = req->ring->dev->dev_private;
+	int ret;
+	s64 timeout;
+
+	timeout = jiffies_to_nsecs(timeout_js);
+
+	ret = __i915_wait_request(req,
+				  atomic_read(&dev_priv->gpu_error.reset_counter),
+				  intr, &timeout, NULL);
+	if (ret == -ETIME)
+		return 0;
+
+	return ret;
+}
+
+static int
+i915_fence_ring_fill_driver_data(struct fence *fence, void *data, int size)
+{
+	struct drm_i915_gem_request *req = to_i915_request(fence);
+
+	if (size < sizeof(req->seqno))
+		return -ENOMEM;
+
+	memcpy(data, &req->seqno,
+	       sizeof(req->seqno));
+
+	return sizeof(req->seqno);
+}
+
+static void i915_fence_ring_value_str(struct fence *fence, char *str, int size)
+{
+	struct drm_i915_gem_request *req = to_i915_request(fence);
+
+	snprintf(str, size, "%u", req->seqno);
+}
+
+static void
+i915_fence_ring_timeline_value_str(struct fence *fence, char *str, int size)
+{
+	struct drm_i915_gem_request *req = to_i915_request(fence);
+	struct intel_engine_cs *ring = req->ring;
+
+	snprintf(str, size, "%u", ring->get_seqno(ring, false));
+}
+
+static void i915_fence_release(struct fence *fence)
+{
+	struct drm_i915_gem_request *req = to_i915_request(fence);
+	struct drm_device *dev = req->ring->dev;
+
+	mutex_lock(&dev->struct_mutex);
+	i915_gem_request_unreference(req);
+	mutex_unlock(&dev->struct_mutex);
+}
+
+static struct fence_ops i915_fence_ring_ops = {
+	.get_driver_name =	i915_fence_get_driver_name,
+	.get_timeline_name =	i915_fence_ring_get_timeline_name,
+	.enable_signaling =	i915_fence_ring_enable_signaling,
+	.signaled =		i915_fence_ring_signaled,
+	.wait =			i915_fence_ring_wait,
+	.fill_driver_data =	i915_fence_ring_fill_driver_data,
+	.fence_value_str =	i915_fence_ring_value_str,
+	.timeline_value_str =	i915_fence_ring_timeline_value_str,
+	.release =		i915_fence_release
+};
+
+int i915_create_sync_fence_ring(struct intel_engine_cs *ring,
+				struct intel_context *ctx,
+				struct sync_fence **sync_fence, int *fence_fd)
+{
+	struct drm_i915_gem_request *req;
+	struct sync_fence *sfence;
+	char ring_name[6] = "ring0";
+	int fd;
+
+	if (!intel_ring_initialized(ring)) {
+		DRM_DEBUG("Ring not initialized!\n");
+		return -EAGAIN;
+	}
+
+	req = ring->outstanding_lazy_request;
+	if (WARN_ON_ONCE(!req))
+		return -ECANCELED;
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0) {
+		DRM_DEBUG("No available file descriptor!\n");
+		return fd;
+	}
+
+	i915_gem_request_reference(req);
+
+	fence_init(&req->fence, &i915_fence_ring_ops, &fence_lock,
+		   ctx->user_handle, req->seqno);
+
+	ring_name[4] += ring->id;
+	sfence = sync_fence_create_dma(ring_name, &req->fence);
+	if (!sfence)
+		goto err;
+
+	*sync_fence = sfence;
+	*fence_fd = fd;
+
+	return 0;
+
+err:
+	i915_gem_request_unreference(req);
+	put_unused_fd(fd);
+	return -ENOMEM;
+}
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 2e559f6e..c197770 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -246,7 +246,7 @@  typedef struct _drm_i915_sarea {
 #define DRM_IOCTL_I915_HWS_ADDR		DRM_IOW(DRM_COMMAND_BASE + DRM_I915_HWS_ADDR, struct drm_i915_gem_init)
 #define DRM_IOCTL_I915_GEM_INIT		DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_INIT, struct drm_i915_gem_init)
 #define DRM_IOCTL_I915_GEM_EXECBUFFER	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER, struct drm_i915_gem_execbuffer)
-#define DRM_IOCTL_I915_GEM_EXECBUFFER2	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER2, struct drm_i915_gem_execbuffer2)
+#define DRM_IOCTL_I915_GEM_EXECBUFFER2	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER2, struct drm_i915_gem_execbuffer2)
 #define DRM_IOCTL_I915_GEM_PIN		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_PIN, struct drm_i915_gem_pin)
 #define DRM_IOCTL_I915_GEM_UNPIN	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_UNPIN, struct drm_i915_gem_unpin)
 #define DRM_IOCTL_I915_GEM_BUSY		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_BUSY, struct drm_i915_gem_busy)
@@ -718,7 +718,7 @@  struct drm_i915_gem_execbuffer2 {
 #define I915_EXEC_CONSTANTS_REL_SURFACE (2<<6) /* gen4/5 only */
 	__u64 flags;
 	__u64 rsvd1; /* now used for context info */
-	__u64 rsvd2;
+	__u64 rsvd2; /* now used for fence fd */
 };
 
 /** Resets the SO write offset registers for transform feedback on gen7. */
@@ -750,7 +750,9 @@  struct drm_i915_gem_execbuffer2 {
  */
 #define I915_EXEC_HANDLE_LUT		(1<<12)
 
-#define __I915_EXEC_UNKNOWN_FLAGS -(I915_EXEC_HANDLE_LUT<<1)
+#define I915_EXEC_FENCE_OUT		(1<<13)
+
+#define __I915_EXEC_UNKNOWN_FLAGS -(I915_EXEC_FENCE_OUT<<1)
 
 #define I915_EXEC_CONTEXT_ID_MASK	(0xffffffff)
 #define i915_execbuffer2_set_context_id(eb2, context) \