diff mbox

[RFC,9/9] drm/i915: Add sync framework support to execbuff IOCTL

Message ID 1437143483-6234-10-git-send-email-John.C.Harrison@Intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

John Harrison July 17, 2015, 2:31 p.m. UTC
From: John Harrison <John.C.Harrison@Intel.com>

Various projects desire a mechanism for managing dependencies between
work items asynchronously. This can also include work items across
completely different and independent systems. For example, an
application wants to retrieve a frame from a video-in device,
use it for rendering on a GPU, then send it to the video-out device
for display all without having to stall waiting for completion along
the way. The sync framework allows this. It encapsulates
synchronisation events in file descriptors. The application can
request a sync point for the completion of each piece of work. Drivers
should also take sync points in with each new work request and not
schedule the work to start until the sync has been signalled.

This patch adds sync framework support to the exec buffer IOCTL. A
sync point can be passed in to stall execution of the batch buffer
until signalled. And a sync point can be returned after each batch
buffer submission which will be signalled upon that batch buffer's
completion.

At present, the input sync point is simply waited on synchronously
inside the exec buffer IOCTL call. Once the GPU scheduler arrives,
this will be handled asynchronously inside the scheduler and the IOCTL
can return without having to wait.

Note also that the scheduler will re-order the execution of batch
buffers, e.g. because a batch buffer is stalled on a sync point and
cannot be submitted yet but other, independent, batch buffers are
being presented to the driver. This means that the timeline within the
sync points returned cannot be global to the engine. Instead they must
be kept per context per engine (the scheduler may not re-order batches
within a context). Hence the timeline cannot be based on the existing
seqno values but must be a new implementation.

This patch is a port of work by several people that has been pulled
across from Android. It has been updated several times across several
patches. Rather than attempt to port each individual patch, this
version is the finished product as a single patch. The various
contributors/authors along the way (in addition to myself) were:
  Satyanantha RamaGopal M <rama.gopal.m.satyanantha@intel.com>
  Tvrtko Ursulin <tvrtko.ursulin@intel.com>
  Michel Thierry <michel.thierry@intel.com>
  Arun Siluvery <arun.siluvery@linux.intel.com>

[new patch in series]

For: VIZ-5190
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h            |  6 ++
 drivers/gpu/drm/i915/i915_gem.c            | 84 ++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 90 ++++++++++++++++++++++++++++--
 include/uapi/drm/i915_drm.h                | 16 +++++-
 4 files changed, 188 insertions(+), 8 deletions(-)

Comments

Tvrtko Ursulin July 27, 2015, 1 p.m. UTC | #1
On 07/17/2015 03:31 PM, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
>
> Various projects desire a mechanism for managing dependencies between
> work items asynchronously. This can also include work items across
> complete different and independent systems. For example, an
> application wants to retreive a frame from a video in device,
> using it for rendering on a GPU then send it to the video out device
> for display all without having to stall waiting for completion along
> the way. The sync framework allows this. It encapsulates
> synchronisation events in file descriptors. The application can
> request a sync point for the completion of each piece of work. Drivers
> should also take sync points in with each new work request and not
> schedule the work to start until the sync has been signalled.
>
> This patch adds sync framework support to the exec buffer IOCTL. A
> sync point can be passed in to stall execution of the batch buffer
> until signalled. And a sync point can be returned after each batch
> buffer submission which will be signalled upon that batch buffer's
> completion.
>
> At present, the input sync point is simply waited on synchronously
> inside the exec buffer IOCTL call. Once the GPU scheduler arrives,
> this will be handled asynchronously inside the scheduler and the IOCTL
> can return without having to wait.
>
> Note also that the scheduler will re-order the execution of batch
> buffers, e.g. because a batch buffer is stalled on a sync point and
> cannot be submitted yet but other, independent, batch buffers are
> being presented to the driver. This means that the timeline within the
> sync points returned cannot be global to the engine. Instead they must
> be kept per context per engine (the scheduler may not re-order batches
> within a context). Hence the timeline cannot be based on the existing
> seqno values but must be a new implementation.
>
> This patch is a port of work by several people that has been pulled
> across from Android. It has been updated several times across several
> patches. Rather than attempt to port each individual patch, this
> version is the finished product as a single patch. The various
> contributors/authors along the way (in addition to myself) were:
>    Satyanantha RamaGopal M <rama.gopal.m.satyanantha@intel.com>
>    Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>    Michel Thierry <michel.thierry@intel.com>
>    Arun Siluvery <arun.siluvery@linux.intel.com>
>
> [new patch in series]
>
> For: VIZ-5190
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> ---
>   drivers/gpu/drm/i915/i915_drv.h            |  6 ++
>   drivers/gpu/drm/i915/i915_gem.c            | 84 ++++++++++++++++++++++++++++
>   drivers/gpu/drm/i915/i915_gem_execbuffer.c | 90 ++++++++++++++++++++++++++++--
>   include/uapi/drm/i915_drm.h                | 16 +++++-
>   4 files changed, 188 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index d7f1aa5..cf6b7cd 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2168,6 +2168,7 @@ struct drm_i915_gem_request {
>   	struct list_head delay_free_list;
>   	bool cancelled;
>   	bool irq_enabled;
> +	bool fence_external;
>
>   	/** On Which ring this request was generated */
>   	struct drm_i915_private *i915;
> @@ -2252,6 +2253,11 @@ void i915_gem_request_notify(struct intel_engine_cs *ring);
>   int i915_create_fence_timeline(struct drm_device *dev,
>   			       struct intel_context *ctx,
>   			       struct intel_engine_cs *ring);
> +#ifdef CONFIG_SYNC
> +struct sync_fence;
> +int i915_create_sync_fence(struct drm_i915_gem_request *req, int *fence_fd);
> +bool i915_safe_to_ignore_fence(struct intel_engine_cs *ring, struct sync_fence *fence);
> +#endif
>
>   static inline bool i915_gem_request_completed(struct drm_i915_gem_request *req)
>   {
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 3f20087..de93422 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -37,6 +37,9 @@
>   #include <linux/swap.h>
>   #include <linux/pci.h>
>   #include <linux/dma-buf.h>
> +#ifdef CONFIG_SYNC
> +#include <../drivers/staging/android/sync.h>
> +#endif
>
>   #define RQ_BUG_ON(expr)
>
> @@ -2549,6 +2552,15 @@ void __i915_add_request(struct drm_i915_gem_request *request,
>   	 */
>   	i915_gem_request_submit(request);
>
> +	/*
> +	 * If an external sync point has been requested for this request then
> +	 * it can be waited on without the driver's knowledge, i.e. without
> +	 * calling __i915_wait_request(). Thus interrupts must be enabled
> +	 * from the start rather than only on demand.
> +	 */
> +	if (request->fence_external)
> +		i915_gem_request_enable_interrupt(request);

Maybe then fence_exported would be clearer, fence_external at first 
sounds like it is coming from another driver or something.

> +
>   	if (i915.enable_execlists)
>   		ret = ring->emit_request(request);
>   	else {
> @@ -2857,6 +2869,78 @@ static uint32_t i915_fence_timeline_get_next_seqno(struct i915_fence_timeline *t
>   	return seqno;
>   }
>
> +#ifdef CONFIG_SYNC
> +int i915_create_sync_fence(struct drm_i915_gem_request *req, int *fence_fd)
> +{
> +	char ring_name[] = "i915_ring0";
> +	struct sync_fence *sync_fence;
> +	int fd;
> +
> +	fd = get_unused_fd_flags(O_CLOEXEC);
> +	if (fd < 0) {
> +		DRM_DEBUG("No available file descriptors!\n");
> +		*fence_fd = -1;
> +		return fd;
> +	}
> +
> +	ring_name[9] += req->ring->id;
> +	sync_fence = sync_fence_create_dma(ring_name, &req->fence);

This will call ->enable_signalling so perhaps you could enable 
interrupts in there for exported fences. Maybe it would be a tiny bit 
more logically grouped. (Rather than have _add_request do it.)

> +	if (!sync_fence) {
> +		put_unused_fd(fd);
> +		*fence_fd = -1;
> +		return -ENOMEM;
> +	}
> +
> +	sync_fence_install(sync_fence, fd);
> +	*fence_fd = fd;
> +
> +	// Necessary??? Who does the put???
> +	fence_get(&req->fence);

sync_fence_release?

> +
> +	req->fence_external = true;
> +
> +	return 0;
> +}
> +
> +bool i915_safe_to_ignore_fence(struct intel_engine_cs *ring, struct sync_fence *sync_fence)
> +{
> +	struct fence *dma_fence;
> +	struct drm_i915_gem_request *req;
> +	bool ignore;
> +	int i;
> +
> +	if (atomic_read(&sync_fence->status) != 0)
> +		return true;
> +
> +	ignore = true;
> +	for(i = 0; i < sync_fence->num_fences; i++) {
> +		dma_fence = sync_fence->cbs[i].sync_pt;
> +
> +		/* No need to worry about dead points: */
> +		if (fence_is_signaled(dma_fence))
> +			continue;
> +
> +		/* Can't ignore other people's points: */
> +		if(dma_fence->ops != &i915_gem_request_fops) {
> +			ignore = false;
> +			break;

The same as return false and then don't need bool ignore at all.

> +		}
> +
> +		req = container_of(dma_fence, typeof(*req), fence);
> +
> +		/* Can't ignore points on other rings: */
> +		if (req->ring != ring) {
> +			ignore = false;
> +			break;
> +		}
> +
> +		/* Same ring means guaranteed to be in order so ignore it. */
> +	}
> +
> +	return ignore;
> +}
> +#endif
> +
>   int i915_gem_request_alloc(struct intel_engine_cs *ring,
>   			   struct intel_context *ctx,
>   			   struct drm_i915_gem_request **req_out)
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 923a3c4..b1a1659 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -26,6 +26,7 @@
>    *
>    */
>
> +#include <linux/syscalls.h>
>   #include <drm/drmP.h>
>   #include <drm/i915_drm.h>
>   #include "i915_drv.h"
> @@ -33,6 +34,9 @@
>   #include "intel_drv.h"
>   #include <linux/dma_remapping.h>
>   #include <linux/uaccess.h>
> +#ifdef CONFIG_SYNC
> +#include <../drivers/staging/android/sync.h>
> +#endif
>
>   #define  __EXEC_OBJECT_HAS_PIN (1<<31)
>   #define  __EXEC_OBJECT_HAS_FENCE (1<<30)
> @@ -1403,6 +1407,35 @@ eb_get_batch(struct eb_vmas *eb)
>   	return vma->obj;
>   }
>
> +#ifdef CONFIG_SYNC

I don't expect you'll be able to get away with ifdef's in the code like 
this so for non-RFC it will have to be cleaned up.

> +static int i915_early_fence_wait(struct intel_engine_cs *ring, int fence_fd)
> +{
> +	struct sync_fence *fence;
> +	int ret = 0;
> +
> +	if (fence_fd < 0) {
> +		DRM_ERROR("Invalid wait fence fd %d on ring %d\n", fence_fd,
> +			  (int) ring->id);
> +		return 1;
> +	}
> +
> +	fence = sync_fence_fdget(fence_fd);
> +	if (fence == NULL) {
> +		DRM_ERROR("Invalid wait fence %d on ring %d\n", fence_fd,
> +			  (int) ring->id);

These two should be DRM_DEBUG to prevent userspace from spamming the 
logs too easily.

> +		return 1;
> +	}
> +
> +	if (atomic_read(&fence->status) == 0) {
> +		if (!i915_safe_to_ignore_fence(ring, fence))
> +			ret = sync_fence_wait(fence, 1000);

I expect you have to wait indefinitely here, not just for one second.

> +	}
> +
> +	sync_fence_put(fence);
> +	return ret;
> +}
> +#endif
> +
>   static int
>   i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>   		       struct drm_file *file,
> @@ -1422,6 +1455,18 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>   	u32 dispatch_flags;
>   	int ret;
>   	bool need_relocs;
> +	int fd_fence_complete = -1;
> +#ifdef CONFIG_SYNC
> +	int fd_fence_wait = lower_32_bits(args->rsvd2);
> +#endif
> +
> +	/*
> +	 * Make sure an broken fence handle is not returned no matter
> +	 * how early an error might be hit. Note that rsvd2 has to be
> +	 * saved away first because it is also an input parameter!
> +	 */
> +	if (args->flags & I915_EXEC_CREATE_FENCE)
> +		args->rsvd2 = (__u64) -1;
>
>   	if (!i915_gem_check_execbuffer(args))
>   		return -EINVAL;
> @@ -1505,6 +1550,19 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>   		dispatch_flags |= I915_DISPATCH_RS;
>   	}
>
> +#ifdef CONFIG_SYNC
> +	/*
> +	 * Without a GPU scheduler, any fence waits must be done up front.
> +	 */
> +	if (args->flags & I915_EXEC_WAIT_FENCE) {
> +		ret = i915_early_fence_wait(ring, fd_fence_wait);
> +		if (ret < 0)
> +			return ret;
> +
> +		args->flags &= ~I915_EXEC_WAIT_FENCE;
> +	}
> +#endif
> +
>   	intel_runtime_pm_get(dev_priv);
>
>   	ret = i915_mutex_lock_interruptible(dev);
> @@ -1652,6 +1710,27 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>   	params->batch_obj               = batch_obj;
>   	params->ctx                     = ctx;
>
> +#ifdef CONFIG_SYNC
> +	if (args->flags & I915_EXEC_CREATE_FENCE) {
> +		/*
> +		 * Caller has requested a sync fence.
> +		 * User interrupts will be enabled to make sure that
> +		 * the timeline is signalled on completion.
> +		 */
> +		ret = i915_create_sync_fence(params->request,
> +					     &fd_fence_complete);
> +		if (ret) {
> +			DRM_ERROR("Fence creation failed for ring %d, ctx %p\n",
> +				  ring->id, ctx);
> +			args->rsvd2 = (__u64) -1;
> +			goto err;
> +		}
> +
> +		/* Return the fence through the rsvd2 field */
> +		args->rsvd2 = (__u64) fd_fence_complete;
> +	}
> +#endif
> +
>   	ret = dev_priv->gt.execbuf_submit(params, args, &eb->vmas);
>
>   err_batch_unpin:
> @@ -1683,6 +1762,12 @@ pre_mutex_err:
>   	/* intel_gpu_busy should also get a ref, so it will free when the device
>   	 * is really idle. */
>   	intel_runtime_pm_put(dev_priv);
> +
> +	if (fd_fence_complete != -1) {
> +		sys_close(fd_fence_complete);

I am not sure calling system call functions from driver code will be 
allowed. That's why I was doing fd_install only when sure everything 
went OK.

Regards,

Tvrtko
John Harrison Oct. 28, 2015, 1:01 p.m. UTC | #2
On 27/07/2015 14:00, Tvrtko Ursulin wrote:
> On 07/17/2015 03:31 PM, John.C.Harrison@Intel.com wrote:
>> From: John Harrison <John.C.Harrison@Intel.com>
>>
>> Various projects desire a mechanism for managing dependencies between
>> work items asynchronously. This can also include work items across
>> complete different and independent systems. For example, an
>> application wants to retreive a frame from a video in device,
>> using it for rendering on a GPU then send it to the video out device
>> for display all without having to stall waiting for completion along
>> the way. The sync framework allows this. It encapsulates
>> synchronisation events in file descriptors. The application can
>> request a sync point for the completion of each piece of work. Drivers
>> should also take sync points in with each new work request and not
>> schedule the work to start until the sync has been signalled.
>>
>> This patch adds sync framework support to the exec buffer IOCTL. A
>> sync point can be passed in to stall execution of the batch buffer
>> until signalled. And a sync point can be returned after each batch
>> buffer submission which will be signalled upon that batch buffer's
>> completion.
>>
>> At present, the input sync point is simply waited on synchronously
>> inside the exec buffer IOCTL call. Once the GPU scheduler arrives,
>> this will be handled asynchronously inside the scheduler and the IOCTL
>> can return without having to wait.
>>
>> Note also that the scheduler will re-order the execution of batch
>> buffers, e.g. because a batch buffer is stalled on a sync point and
>> cannot be submitted yet but other, independent, batch buffers are
>> being presented to the driver. This means that the timeline within the
>> sync points returned cannot be global to the engine. Instead they must
>> be kept per context per engine (the scheduler may not re-order batches
>> within a context). Hence the timeline cannot be based on the existing
>> seqno values but must be a new implementation.
>>
>> This patch is a port of work by several people that has been pulled
>> across from Android. It has been updated several times across several
>> patches. Rather than attempt to port each individual patch, this
>> version is the finished product as a single patch. The various
>> contributors/authors along the way (in addition to myself) were:
>>    Satyanantha RamaGopal M <rama.gopal.m.satyanantha@intel.com>
>>    Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>    Michel Thierry <michel.thierry@intel.com>
>>    Arun Siluvery <arun.siluvery@linux.intel.com>
>>
>> [new patch in series]
>>
>> For: VIZ-5190
>> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>> ---
>>   drivers/gpu/drm/i915/i915_drv.h            |  6 ++
>>   drivers/gpu/drm/i915/i915_gem.c            | 84 
>> ++++++++++++++++++++++++++++
>>   drivers/gpu/drm/i915/i915_gem_execbuffer.c | 90 
>> ++++++++++++++++++++++++++++--
>>   include/uapi/drm/i915_drm.h                | 16 +++++-
>>   4 files changed, 188 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h 
>> b/drivers/gpu/drm/i915/i915_drv.h
>> index d7f1aa5..cf6b7cd 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -2168,6 +2168,7 @@ struct drm_i915_gem_request {
>>       struct list_head delay_free_list;
>>       bool cancelled;
>>       bool irq_enabled;
>> +    bool fence_external;
>>
>>       /** On Which ring this request was generated */
>>       struct drm_i915_private *i915;
>> @@ -2252,6 +2253,11 @@ void i915_gem_request_notify(struct 
>> intel_engine_cs *ring);
>>   int i915_create_fence_timeline(struct drm_device *dev,
>>                      struct intel_context *ctx,
>>                      struct intel_engine_cs *ring);
>> +#ifdef CONFIG_SYNC
>> +struct sync_fence;
>> +int i915_create_sync_fence(struct drm_i915_gem_request *req, int 
>> *fence_fd);
>> +bool i915_safe_to_ignore_fence(struct intel_engine_cs *ring, struct 
>> sync_fence *fence);
>> +#endif
>>
>>   static inline bool i915_gem_request_completed(struct 
>> drm_i915_gem_request *req)
>>   {
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c 
>> b/drivers/gpu/drm/i915/i915_gem.c
>> index 3f20087..de93422 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -37,6 +37,9 @@
>>   #include <linux/swap.h>
>>   #include <linux/pci.h>
>>   #include <linux/dma-buf.h>
>> +#ifdef CONFIG_SYNC
>> +#include <../drivers/staging/android/sync.h>
>> +#endif
>>
>>   #define RQ_BUG_ON(expr)
>>
>> @@ -2549,6 +2552,15 @@ void __i915_add_request(struct 
>> drm_i915_gem_request *request,
>>        */
>>       i915_gem_request_submit(request);
>>
>> +    /*
>> +     * If an external sync point has been requested for this request 
>> then
>> +     * it can be waited on without the driver's knowledge, i.e. without
>> +     * calling __i915_wait_request(). Thus interrupts must be enabled
>> +     * from the start rather than only on demand.
>> +     */
>> +    if (request->fence_external)
>> +        i915_gem_request_enable_interrupt(request);
>
> Maybe then fence_exported would be clearer, fence_external at first 
> sounds like it is coming from another driver or something.
Turns out it is not necessary anyway as mentioned below.

>> +
>>       if (i915.enable_execlists)
>>           ret = ring->emit_request(request);
>>       else {
>> @@ -2857,6 +2869,78 @@ static uint32_t 
>> i915_fence_timeline_get_next_seqno(struct i915_fence_timeline *t
>>       return seqno;
>>   }
>>
>> +#ifdef CONFIG_SYNC
>> +int i915_create_sync_fence(struct drm_i915_gem_request *req, int 
>> *fence_fd)
>> +{
>> +    char ring_name[] = "i915_ring0";
>> +    struct sync_fence *sync_fence;
>> +    int fd;
>> +
>> +    fd = get_unused_fd_flags(O_CLOEXEC);
>> +    if (fd < 0) {
>> +        DRM_DEBUG("No available file descriptors!\n");
>> +        *fence_fd = -1;
>> +        return fd;
>> +    }
>> +
>> +    ring_name[9] += req->ring->id;
>> +    sync_fence = sync_fence_create_dma(ring_name, &req->fence);
>
> This will call ->enable_signalling so perhaps you could enable 
> interrupts in there for exported fences. Maybe it would be a tiny bit 
> more logically grouped. (Rather than have _add_request do it.)

Yeah, hadn't quite spotted this first time around. It now all happens 
'magically' without needing any explicit code - just some explicit 
comments to say that the behind the scenes magick is a) happening and b) 
necessary.

>
>> +    if (!sync_fence) {
>> +        put_unused_fd(fd);
>> +        *fence_fd = -1;
>> +        return -ENOMEM;
>> +    }
>> +
>> +    sync_fence_install(sync_fence, fd);
>> +    *fence_fd = fd;
>> +
>> +    // Necessary??? Who does the put???
>> +    fence_get(&req->fence);
>
> sync_fence_release?
Yes but where? Does the driver need to call this? Is it userland's 
responsibility? Does it happen automatically when the fd is closed? Do 
we even need to do the _get() in the first place? It seems to be working 
in that I don't get any unexpected free of the fence and I don't get 
huge numbers of leaked fences. However, it would be nice to know how it 
is working!

>
>> +
>> +    req->fence_external = true;
>> +
>> +    return 0;
>> +}
>> +
>> +bool i915_safe_to_ignore_fence(struct intel_engine_cs *ring, struct 
>> sync_fence *sync_fence)
>> +{
>> +    struct fence *dma_fence;
>> +    struct drm_i915_gem_request *req;
>> +    bool ignore;
>> +    int i;
>> +
>> +    if (atomic_read(&sync_fence->status) != 0)
>> +        return true;
>> +
>> +    ignore = true;
>> +    for(i = 0; i < sync_fence->num_fences; i++) {
>> +        dma_fence = sync_fence->cbs[i].sync_pt;
>> +
>> +        /* No need to worry about dead points: */
>> +        if (fence_is_signaled(dma_fence))
>> +            continue;
>> +
>> +        /* Can't ignore other people's points: */
>> +        if(dma_fence->ops != &i915_gem_request_fops) {
>> +            ignore = false;
>> +            break;
>
> The same as return false and then don't need bool ignore at all.
Yeah, left over from when there was cleanup to be done at function exit 
time. The cleanup code was removed but the single exit point was not.

>
>> +        }
>> +
>> +        req = container_of(dma_fence, typeof(*req), fence);
>> +
>> +        /* Can't ignore points on other rings: */
>> +        if (req->ring != ring) {
>> +            ignore = false;
>> +            break;
>> +        }
>> +
>> +        /* Same ring means guaranteed to be in order so ignore it. */
>> +    }
>> +
>> +    return ignore;
>> +}
>> +#endif
>> +
>>   int i915_gem_request_alloc(struct intel_engine_cs *ring,
>>                  struct intel_context *ctx,
>>                  struct drm_i915_gem_request **req_out)
>> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c 
>> b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
>> index 923a3c4..b1a1659 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
>> @@ -26,6 +26,7 @@
>>    *
>>    */
>>
>> +#include <linux/syscalls.h>
>>   #include <drm/drmP.h>
>>   #include <drm/i915_drm.h>
>>   #include "i915_drv.h"
>> @@ -33,6 +34,9 @@
>>   #include "intel_drv.h"
>>   #include <linux/dma_remapping.h>
>>   #include <linux/uaccess.h>
>> +#ifdef CONFIG_SYNC
>> +#include <../drivers/staging/android/sync.h>
>> +#endif
>>
>>   #define  __EXEC_OBJECT_HAS_PIN (1<<31)
>>   #define  __EXEC_OBJECT_HAS_FENCE (1<<30)
>> @@ -1403,6 +1407,35 @@ eb_get_batch(struct eb_vmas *eb)
>>       return vma->obj;
>>   }
>>
>> +#ifdef CONFIG_SYNC
>
> I don't expect you'll be able to get away with ifdef's in the code 
> like this so for non-RFC it will have to be cleaned up.

Not a lot of choice at the moment. The sync_* code is all #ifdef 
CONFIG_SYNC so any code that references it must be likewise. As to how 
we get the CONFIG_SYNC tag removed, that is another discussion...

>
>> +static int i915_early_fence_wait(struct intel_engine_cs *ring, int 
>> fence_fd)
>> +{
>> +    struct sync_fence *fence;
>> +    int ret = 0;
>> +
>> +    if (fence_fd < 0) {
>> +        DRM_ERROR("Invalid wait fence fd %d on ring %d\n", fence_fd,
>> +              (int) ring->id);
>> +        return 1;
>> +    }
>> +
>> +    fence = sync_fence_fdget(fence_fd);
>> +    if (fence == NULL) {
>> +        DRM_ERROR("Invalid wait fence %d on ring %d\n", fence_fd,
>> +              (int) ring->id);
>
> These two should be DRM_DEBUG to prevent userspace from spamming the 
> logs too easily.
>
>> +        return 1;
>> +    }
>> +
>> +    if (atomic_read(&fence->status) == 0) {
>> +        if (!i915_safe_to_ignore_fence(ring, fence))
>> +            ret = sync_fence_wait(fence, 1000);
>
> I expect you have to wait indefinitely here, not just for one second.
Causing the driver to wait indefinitely under userland control is surely 
a Bad Thing(tm)? Okay, this is done before acquiring the mutex lock and 
presumably the wait can be interrupted, e.g. if the user land process 
gets a KILL signal. It still seems a bad idea to wait forever. Various 
bits of Android generally use a timeout of either 1s or 3s.

Daniel or anyone else, any views of driver time outs?

>
>> +    }
>> +
>> +    sync_fence_put(fence);
>> +    return ret;
>> +}
>> +#endif
>> +
>>   static int
>>   i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>>                  struct drm_file *file,
>> @@ -1422,6 +1455,18 @@ i915_gem_do_execbuffer(struct drm_device *dev, 
>> void *data,
>>       u32 dispatch_flags;
>>       int ret;
>>       bool need_relocs;
>> +    int fd_fence_complete = -1;
>> +#ifdef CONFIG_SYNC
>> +    int fd_fence_wait = lower_32_bits(args->rsvd2);
>> +#endif
>> +
>> +    /*
>> +     * Make sure an broken fence handle is not returned no matter
>> +     * how early an error might be hit. Note that rsvd2 has to be
>> +     * saved away first because it is also an input parameter!
>> +     */
>> +    if (args->flags & I915_EXEC_CREATE_FENCE)
>> +        args->rsvd2 = (__u64) -1;
>>
>>       if (!i915_gem_check_execbuffer(args))
>>           return -EINVAL;
>> @@ -1505,6 +1550,19 @@ i915_gem_do_execbuffer(struct drm_device *dev, 
>> void *data,
>>           dispatch_flags |= I915_DISPATCH_RS;
>>       }
>>
>> +#ifdef CONFIG_SYNC
>> +    /*
>> +     * Without a GPU scheduler, any fence waits must be done up front.
>> +     */
>> +    if (args->flags & I915_EXEC_WAIT_FENCE) {
>> +        ret = i915_early_fence_wait(ring, fd_fence_wait);
>> +        if (ret < 0)
>> +            return ret;
>> +
>> +        args->flags &= ~I915_EXEC_WAIT_FENCE;
>> +    }
>> +#endif
>> +
>>       intel_runtime_pm_get(dev_priv);
>>
>>       ret = i915_mutex_lock_interruptible(dev);
>> @@ -1652,6 +1710,27 @@ i915_gem_do_execbuffer(struct drm_device *dev, 
>> void *data,
>>       params->batch_obj               = batch_obj;
>>       params->ctx                     = ctx;
>>
>> +#ifdef CONFIG_SYNC
>> +    if (args->flags & I915_EXEC_CREATE_FENCE) {
>> +        /*
>> +         * Caller has requested a sync fence.
>> +         * User interrupts will be enabled to make sure that
>> +         * the timeline is signalled on completion.
>> +         */
>> +        ret = i915_create_sync_fence(params->request,
>> +                         &fd_fence_complete);
>> +        if (ret) {
>> +            DRM_ERROR("Fence creation failed for ring %d, ctx %p\n",
>> +                  ring->id, ctx);
>> +            args->rsvd2 = (__u64) -1;
>> +            goto err;
>> +        }
>> +
>> +        /* Return the fence through the rsvd2 field */
>> +        args->rsvd2 = (__u64) fd_fence_complete;
>> +    }
>> +#endif
>> +
>>       ret = dev_priv->gt.execbuf_submit(params, args, &eb->vmas);
>>
>>   err_batch_unpin:
>> @@ -1683,6 +1762,12 @@ pre_mutex_err:
>>       /* intel_gpu_busy should also get a ref, so it will free when 
>> the device
>>        * is really idle. */
>>       intel_runtime_pm_put(dev_priv);
>> +
>> +    if (fd_fence_complete != -1) {
>> +        sys_close(fd_fence_complete);
>
> I am not sure calling system call functions from driver code will be 
> allowed. that's why I was doing fd_install only when sure everything 
> went OK.

Daniel or others, any thoughts? Is the clean up allowed in the driver? 
Is there an alternative driver friendly option? It makes the sync 
creating code cleaner if we can do everything in one place rather than 
do some processing up front and some at the end.

>
> Regards,
>
> Tvrtko
Tvrtko Ursulin Oct. 28, 2015, 2:31 p.m. UTC | #3
On 28/10/15 13:01, John Harrison wrote:
> On 27/07/2015 14:00, Tvrtko Ursulin wrote:

[snip]

>>> +    if (!sync_fence) {
>>> +        put_unused_fd(fd);
>>> +        *fence_fd = -1;
>>> +        return -ENOMEM;
>>> +    }
>>> +
>>> +    sync_fence_install(sync_fence, fd);
>>> +    *fence_fd = fd;
>>> +
>>> +    // Necessary??? Who does the put???
>>> +    fence_get(&req->fence);
>>
>> sync_fence_release?
> Yes but where? Does the driver need to call this? Is it userland's
> responsibility? Does it happen automatically when the fd is closed? Do
> we even need to do the _get() in the first place? It seems to be working
> in that I don't get any unexpected free of the fence and I don't get
> huge numbers of leaked fences. However, it would be nice to know how it
> is working!

When the fd is closed, implicitly or explicitly (close(2)/exit(2)/any 
process termination), kernel will automatically call sync_fence_release 
via the file_operations set in the inode->f_ops at sync_fence creation time

(Actually not when the fd is closed but when the last instance of struct 
file * in question goes away.)

>>> +        return 1;
>>> +    }
>>> +
>>> +    if (atomic_read(&fence->status) == 0) {
>>> +        if (!i915_safe_to_ignore_fence(ring, fence))
>>> +            ret = sync_fence_wait(fence, 1000);
>>
>> I expect you have to wait indefinitely here, not just for one second.
> Causing the driver to wait indefinitely under userland control is surely
> a Bad Thing(tm)? Okay, this is done before acquiring the mutex lock and
> presumably the wait can be interrupted, e.g. if the user land process
> gets a KILL signal. It still seems a bad idea to wait forever. Various
> bits of Android generally use a timeout of either 1s or 3s.
>
> Daniel or anyone else, any views of driver time outs?

It is slightly irrelevant since this is a temporary code path before the 
scheduler lands.

But I don't see it makes sense to have made up timeouts. If userspace 
gave us something to wait on, then I think we should wait until it is 
done or it fails. It is not blocking the driver but is running on the 
behalf of the same process which passed in the fd to wait on. So the 
process can only block itself.

Regards,

Tvrtko
Daniel Vetter Nov. 17, 2015, 1:59 p.m. UTC | #4
On Wed, Oct 28, 2015 at 01:01:17PM +0000, John Harrison wrote:
> On 27/07/2015 14:00, Tvrtko Ursulin wrote:
> >On 07/17/2015 03:31 PM, John.C.Harrison@Intel.com wrote:
> >>From: John Harrison <John.C.Harrison@Intel.com>
> >>
> >>Various projects desire a mechanism for managing dependencies between
> >>work items asynchronously. This can also include work items across
> >>completely different and independent systems. For example, an
> >>application wants to retrieve a frame from a video-in device,
> >>using it for rendering on a GPU then send it to the video-out device
> >>for display all without having to stall waiting for completion along
> >>the way. The sync framework allows this. It encapsulates
> >>synchronisation events in file descriptors. The application can
> >>request a sync point for the completion of each piece of work. Drivers
> >>should also take sync points in with each new work request and not
> >>schedule the work to start until the sync has been signalled.
> >>
> >>This patch adds sync framework support to the exec buffer IOCTL. A
> >>sync point can be passed in to stall execution of the batch buffer
> >>until signalled. And a sync point can be returned after each batch
> >>buffer submission which will be signalled upon that batch buffer's
> >>completion.
> >>
> >>At present, the input sync point is simply waited on synchronously
> >>inside the exec buffer IOCTL call. Once the GPU scheduler arrives,
> >>this will be handled asynchronously inside the scheduler and the IOCTL
> >>can return without having to wait.
> >>
> >>Note also that the scheduler will re-order the execution of batch
> >>buffers, e.g. because a batch buffer is stalled on a sync point and
> >>cannot be submitted yet but other, independent, batch buffers are
> >>being presented to the driver. This means that the timeline within the
> >>sync points returned cannot be global to the engine. Instead they must
> >>be kept per context per engine (the scheduler may not re-order batches
> >>within a context). Hence the timeline cannot be based on the existing
> >>seqno values but must be a new implementation.
> >>
> >>This patch is a port of work by several people that has been pulled
> >>across from Android. It has been updated several times across several
> >>patches. Rather than attempt to port each individual patch, this
> >>version is the finished product as a single patch. The various
> >>contributors/authors along the way (in addition to myself) were:
> >>   Satyanantha RamaGopal M <rama.gopal.m.satyanantha@intel.com>
> >>   Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>   Michel Thierry <michel.thierry@intel.com>
> >>   Arun Siluvery <arun.siluvery@linux.intel.com>
> >>
> >>[new patch in series]
> >>
> >>For: VIZ-5190
> >>Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> >>---
> >>  drivers/gpu/drm/i915/i915_drv.h            |  6 ++
> >>  drivers/gpu/drm/i915/i915_gem.c            | 84
> >>++++++++++++++++++++++++++++
> >>  drivers/gpu/drm/i915/i915_gem_execbuffer.c | 90
> >>++++++++++++++++++++++++++++--
> >>  include/uapi/drm/i915_drm.h                | 16 +++++-
> >>  4 files changed, 188 insertions(+), 8 deletions(-)
> >>
> >>diff --git a/drivers/gpu/drm/i915/i915_drv.h
> >>b/drivers/gpu/drm/i915/i915_drv.h
> >>index d7f1aa5..cf6b7cd 100644
> >>--- a/drivers/gpu/drm/i915/i915_drv.h
> >>+++ b/drivers/gpu/drm/i915/i915_drv.h
> >>@@ -2168,6 +2168,7 @@ struct drm_i915_gem_request {
> >>      struct list_head delay_free_list;
> >>      bool cancelled;
> >>      bool irq_enabled;
> >>+    bool fence_external;
> >>
> >>      /** On Which ring this request was generated */
> >>      struct drm_i915_private *i915;
> >>@@ -2252,6 +2253,11 @@ void i915_gem_request_notify(struct
> >>intel_engine_cs *ring);
> >>  int i915_create_fence_timeline(struct drm_device *dev,
> >>                     struct intel_context *ctx,
> >>                     struct intel_engine_cs *ring);
> >>+#ifdef CONFIG_SYNC
> >>+struct sync_fence;
> >>+int i915_create_sync_fence(struct drm_i915_gem_request *req, int
> >>*fence_fd);
> >>+bool i915_safe_to_ignore_fence(struct intel_engine_cs *ring, struct
> >>sync_fence *fence);
> >>+#endif
> >>
> >>  static inline bool i915_gem_request_completed(struct
> >>drm_i915_gem_request *req)
> >>  {
> >>diff --git a/drivers/gpu/drm/i915/i915_gem.c
> >>b/drivers/gpu/drm/i915/i915_gem.c
> >>index 3f20087..de93422 100644
> >>--- a/drivers/gpu/drm/i915/i915_gem.c
> >>+++ b/drivers/gpu/drm/i915/i915_gem.c
> >>@@ -37,6 +37,9 @@
> >>  #include <linux/swap.h>
> >>  #include <linux/pci.h>
> >>  #include <linux/dma-buf.h>
> >>+#ifdef CONFIG_SYNC
> >>+#include <../drivers/staging/android/sync.h>
> >>+#endif
> >>
> >>  #define RQ_BUG_ON(expr)
> >>
> >>@@ -2549,6 +2552,15 @@ void __i915_add_request(struct
> >>drm_i915_gem_request *request,
> >>       */
> >>      i915_gem_request_submit(request);
> >>
> >>+    /*
> >>+     * If an external sync point has been requested for this request
> >>then
> >>+     * it can be waited on without the driver's knowledge, i.e. without
> >>+     * calling __i915_wait_request(). Thus interrupts must be enabled
> >>+     * from the start rather than only on demand.
> >>+     */
> >>+    if (request->fence_external)
> >>+        i915_gem_request_enable_interrupt(request);
> >
> >Maybe then fence_exported would be clearer, fence_external at first sounds
> >like it is coming from another driver or something.
> Turns out it is not necessary anyway as mentioned below.
> 
> >>+
> >>      if (i915.enable_execlists)
> >>          ret = ring->emit_request(request);
> >>      else {
> >>@@ -2857,6 +2869,78 @@ static uint32_t
> >>i915_fence_timeline_get_next_seqno(struct i915_fence_timeline *t
> >>      return seqno;
> >>  }
> >>
> >>+#ifdef CONFIG_SYNC
> >>+int i915_create_sync_fence(struct drm_i915_gem_request *req, int
> >>*fence_fd)
> >>+{
> >>+    char ring_name[] = "i915_ring0";
> >>+    struct sync_fence *sync_fence;
> >>+    int fd;
> >>+
> >>+    fd = get_unused_fd_flags(O_CLOEXEC);
> >>+    if (fd < 0) {
> >>+        DRM_DEBUG("No available file descriptors!\n");
> >>+        *fence_fd = -1;
> >>+        return fd;
> >>+    }
> >>+
> >>+    ring_name[9] += req->ring->id;
> >>+    sync_fence = sync_fence_create_dma(ring_name, &req->fence);
> >
> >This will call ->enable_signalling so perhaps you could enable interrupts
> >in there for exported fences. Maybe it would be a tiny bit more logically
> >grouped. (Rather than have _add_request do it.)
> 
> Yeah, hadn't quite spotted this first time around. It now all happens
> 'magically' without needing any explicit code - just some explicit comments
> to say that the behind the scenes magick is a) happening and b) necessary.
> 
> >
> >>+    if (!sync_fence) {
> >>+        put_unused_fd(fd);
> >>+        *fence_fd = -1;
> >>+        return -ENOMEM;
> >>+    }
> >>+
> >>+    sync_fence_install(sync_fence, fd);
> >>+    *fence_fd = fd;
> >>+
> >>+    // Necessary??? Who does the put???
> >>+    fence_get(&req->fence);
> >
> >sync_fence_release?
> Yes but where? Does the driver need to call this? Is it userland's
> responsibility? Does it happen automatically when the fd is closed? Do we
> even need to do the _get() in the first place? It seems to be working in
> that I don't get any unexpected free of the fence and I don't get huge
> numbers of leaked fences. However, it would be nice to know how it is
> working!
> 
> >
> >>+
> >>+    req->fence_external = true;
> >>+
> >>+    return 0;
> >>+}
> >>+
> >>+bool i915_safe_to_ignore_fence(struct intel_engine_cs *ring, struct
> >>sync_fence *sync_fence)
> >>+{
> >>+    struct fence *dma_fence;
> >>+    struct drm_i915_gem_request *req;
> >>+    bool ignore;
> >>+    int i;
> >>+
> >>+    if (atomic_read(&sync_fence->status) != 0)
> >>+        return true;
> >>+
> >>+    ignore = true;
> >>+    for(i = 0; i < sync_fence->num_fences; i++) {
> >>+        dma_fence = sync_fence->cbs[i].sync_pt;
> >>+
> >>+        /* No need to worry about dead points: */
> >>+        if (fence_is_signaled(dma_fence))
> >>+            continue;
> >>+
> >>+        /* Can't ignore other people's points: */
> >>+        if(dma_fence->ops != &i915_gem_request_fops) {
> >>+            ignore = false;
> >>+            break;
> >
> >The same as return false and then don't need bool ignore at all.
> Yeah, left over from when there was cleanup to be done at function exit
> time. The cleanup code removed but the single exit point did not.
> 
> >
> >>+        }
> >>+
> >>+        req = container_of(dma_fence, typeof(*req), fence);
> >>+
> >>+        /* Can't ignore points on other rings: */
> >>+        if (req->ring != ring) {
> >>+            ignore = false;
> >>+            break;
> >>+        }
> >>+
> >>+        /* Same ring means guaranteed to be in order so ignore it. */
> >>+    }
> >>+
> >>+    return ignore;
> >>+}
> >>+#endif
> >>+
> >>  int i915_gem_request_alloc(struct intel_engine_cs *ring,
> >>                 struct intel_context *ctx,
> >>                 struct drm_i915_gem_request **req_out)
> >>diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> >>b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> >>index 923a3c4..b1a1659 100644
> >>--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> >>+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> >>@@ -26,6 +26,7 @@
> >>   *
> >>   */
> >>
> >>+#include <linux/syscalls.h>
> >>  #include <drm/drmP.h>
> >>  #include <drm/i915_drm.h>
> >>  #include "i915_drv.h"
> >>@@ -33,6 +34,9 @@
> >>  #include "intel_drv.h"
> >>  #include <linux/dma_remapping.h>
> >>  #include <linux/uaccess.h>
> >>+#ifdef CONFIG_SYNC
> >>+#include <../drivers/staging/android/sync.h>
> >>+#endif
> >>
> >>  #define  __EXEC_OBJECT_HAS_PIN (1<<31)
> >>  #define  __EXEC_OBJECT_HAS_FENCE (1<<30)
> >>@@ -1403,6 +1407,35 @@ eb_get_batch(struct eb_vmas *eb)
> >>      return vma->obj;
> >>  }
> >>
> >>+#ifdef CONFIG_SYNC
> >
> >I don't expect you'll be able to get away with ifdef's in the code like
> >this so for non-RFC it will have to be cleaned up.
> 
> Not a lot of choice at the moment. The sync_* code is all #ifdef CONFIG_SYNC
> so any code that references it must be likewise. As to how we get the
> CONFIG_SYNC tag removed, that is another discussion...

Destaging the sync stuff is part of the merge criteria for this feature.
So yeah, all the #ifdefery has to go and be replace by a select FENCE or
whatever in the i915 Kconfig.

> >>+static int i915_early_fence_wait(struct intel_engine_cs *ring, int
> >>fence_fd)
> >>+{
> >>+    struct sync_fence *fence;
> >>+    int ret = 0;
> >>+
> >>+    if (fence_fd < 0) {
> >>+        DRM_ERROR("Invalid wait fence fd %d on ring %d\n", fence_fd,
> >>+              (int) ring->id);
> >>+        return 1;
> >>+    }
> >>+
> >>+    fence = sync_fence_fdget(fence_fd);
> >>+    if (fence == NULL) {
> >>+        DRM_ERROR("Invalid wait fence %d on ring %d\n", fence_fd,
> >>+              (int) ring->id);
> >
> >These two should be DRM_DEBUG to prevent userspace from spamming the logs
> >too easily.
> >
> >>+        return 1;
> >>+    }
> >>+
> >>+    if (atomic_read(&fence->status) == 0) {
> >>+        if (!i915_safe_to_ignore_fence(ring, fence))
> >>+            ret = sync_fence_wait(fence, 1000);
> >
> >I expect you have to wait indefinitely here, not just for one second.
> Causing the driver to wait indefinitely under userland control is surely a
> Bad Thing(tm)? Okay, this is done before acquiring the mutex lock and
> presumably the wait can be interrupted, e.g. if the user land process gets a
> KILL signal. It still seems a bad idea to wait forever. Various bits of
> Android generally use a timeout of either 1s or 3s.
> 
> Daniel or anyone else, any views of driver time outs?

Wait forever, but interruptibly. Have an igt to exercise the deadlock case
and make sure gpu reset can recover.

> >
> >>+    }
> >>+
> >>+    sync_fence_put(fence);
> >>+    return ret;
> >>+}
> >>+#endif
> >>+
> >>  static int
> >>  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
> >>                 struct drm_file *file,
> >>@@ -1422,6 +1455,18 @@ i915_gem_do_execbuffer(struct drm_device *dev,
> >>void *data,
> >>      u32 dispatch_flags;
> >>      int ret;
> >>      bool need_relocs;
> >>+    int fd_fence_complete = -1;
> >>+#ifdef CONFIG_SYNC
> >>+    int fd_fence_wait = lower_32_bits(args->rsvd2);
> >>+#endif
> >>+
> >>+    /*
> >>+     * Make sure an broken fence handle is not returned no matter
> >>+     * how early an error might be hit. Note that rsvd2 has to be
> >>+     * saved away first because it is also an input parameter!
> >>+     */
> >>+    if (args->flags & I915_EXEC_CREATE_FENCE)
> >>+        args->rsvd2 = (__u64) -1;
> >>
> >>      if (!i915_gem_check_execbuffer(args))
> >>          return -EINVAL;
> >>@@ -1505,6 +1550,19 @@ i915_gem_do_execbuffer(struct drm_device *dev,
> >>void *data,
> >>          dispatch_flags |= I915_DISPATCH_RS;
> >>      }
> >>
> >>+#ifdef CONFIG_SYNC
> >>+    /*
> >>+     * Without a GPU scheduler, any fence waits must be done up front.
> >>+     */
> >>+    if (args->flags & I915_EXEC_WAIT_FENCE) {
> >>+        ret = i915_early_fence_wait(ring, fd_fence_wait);
> >>+        if (ret < 0)
> >>+            return ret;
> >>+
> >>+        args->flags &= ~I915_EXEC_WAIT_FENCE;
> >>+    }
> >>+#endif
> >>+
> >>      intel_runtime_pm_get(dev_priv);
> >>
> >>      ret = i915_mutex_lock_interruptible(dev);
> >>@@ -1652,6 +1710,27 @@ i915_gem_do_execbuffer(struct drm_device *dev,
> >>void *data,
> >>      params->batch_obj               = batch_obj;
> >>      params->ctx                     = ctx;
> >>
> >>+#ifdef CONFIG_SYNC
> >>+    if (args->flags & I915_EXEC_CREATE_FENCE) {
> >>+        /*
> >>+         * Caller has requested a sync fence.
> >>+         * User interrupts will be enabled to make sure that
> >>+         * the timeline is signalled on completion.
> >>+         */
> >>+        ret = i915_create_sync_fence(params->request,
> >>+                         &fd_fence_complete);
> >>+        if (ret) {
> >>+            DRM_ERROR("Fence creation failed for ring %d, ctx %p\n",
> >>+                  ring->id, ctx);
> >>+            args->rsvd2 = (__u64) -1;
> >>+            goto err;
> >>+        }
> >>+
> >>+        /* Return the fence through the rsvd2 field */
> >>+        args->rsvd2 = (__u64) fd_fence_complete;
> >>+    }
> >>+#endif
> >>+
> >>      ret = dev_priv->gt.execbuf_submit(params, args, &eb->vmas);
> >>
> >>  err_batch_unpin:
> >>@@ -1683,6 +1762,12 @@ pre_mutex_err:
> >>      /* intel_gpu_busy should also get a ref, so it will free when the
> >>device
> >>       * is really idle. */
> >>      intel_runtime_pm_put(dev_priv);
> >>+
> >>+    if (fd_fence_complete != -1) {
> >>+        sys_close(fd_fence_complete);
> >
> >I am not sure calling system call functions from driver code will be
> >allowed. that's why I was doing fd_install only when sure everything went
> >OK.
> 
> Daniel or others, any thoughts? Is the clean up allowed in the driver? Is
> there an alternative driver friendly option? It makes the sync creating code
> cleaner if we can do everything in one place rather than do some processing
> up front and some at the end.

You don't get to do this ;-) Fix it by only installing the fd if
everything has succeeded.
-Daniel
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index d7f1aa5..cf6b7cd 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2168,6 +2168,7 @@  struct drm_i915_gem_request {
 	struct list_head delay_free_list;
 	bool cancelled;
 	bool irq_enabled;
+	bool fence_external;
 
 	/** On Which ring this request was generated */
 	struct drm_i915_private *i915;
@@ -2252,6 +2253,11 @@  void i915_gem_request_notify(struct intel_engine_cs *ring);
 int i915_create_fence_timeline(struct drm_device *dev,
 			       struct intel_context *ctx,
 			       struct intel_engine_cs *ring);
+#ifdef CONFIG_SYNC
+struct sync_fence;
+int i915_create_sync_fence(struct drm_i915_gem_request *req, int *fence_fd);
+bool i915_safe_to_ignore_fence(struct intel_engine_cs *ring, struct sync_fence *fence);
+#endif
 
 static inline bool i915_gem_request_completed(struct drm_i915_gem_request *req)
 {
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 3f20087..de93422 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -37,6 +37,9 @@ 
 #include <linux/swap.h>
 #include <linux/pci.h>
 #include <linux/dma-buf.h>
+#ifdef CONFIG_SYNC
+#include <../drivers/staging/android/sync.h>
+#endif
 
 #define RQ_BUG_ON(expr)
 
@@ -2549,6 +2552,15 @@  void __i915_add_request(struct drm_i915_gem_request *request,
 	 */
 	i915_gem_request_submit(request);
 
+	/*
+	 * If an external sync point has been requested for this request then
+	 * it can be waited on without the driver's knowledge, i.e. without
+	 * calling __i915_wait_request(). Thus interrupts must be enabled
+	 * from the start rather than only on demand.
+	 */
+	if (request->fence_external)
+		i915_gem_request_enable_interrupt(request);
+
 	if (i915.enable_execlists)
 		ret = ring->emit_request(request);
 	else {
@@ -2857,6 +2869,78 @@  static uint32_t i915_fence_timeline_get_next_seqno(struct i915_fence_timeline *t
 	return seqno;
 }
 
+#ifdef CONFIG_SYNC
+int i915_create_sync_fence(struct drm_i915_gem_request *req, int *fence_fd)
+{
+	char ring_name[] = "i915_ring0";
+	struct sync_fence *sync_fence;
+	int fd;
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0) {
+		DRM_DEBUG("No available file descriptors!\n");
+		*fence_fd = -1;
+		return fd;
+	}
+
+	ring_name[9] += req->ring->id;
+	sync_fence = sync_fence_create_dma(ring_name, &req->fence);
+	if (!sync_fence) {
+		put_unused_fd(fd);
+		*fence_fd = -1;
+		return -ENOMEM;
+	}
+
+	sync_fence_install(sync_fence, fd);
+	*fence_fd = fd;
+
+	// Necessary??? Who does the put???
+	fence_get(&req->fence);
+
+	req->fence_external = true;
+
+	return 0;
+}
+
+bool i915_safe_to_ignore_fence(struct intel_engine_cs *ring, struct sync_fence *sync_fence)
+{
+	struct fence *dma_fence;
+	struct drm_i915_gem_request *req;
+	bool ignore;
+	int i;
+
+	if (atomic_read(&sync_fence->status) != 0)
+		return true;
+
+	ignore = true;
+	for(i = 0; i < sync_fence->num_fences; i++) {
+		dma_fence = sync_fence->cbs[i].sync_pt;
+
+		/* No need to worry about dead points: */
+		if (fence_is_signaled(dma_fence))
+			continue;
+
+		/* Can't ignore other people's points: */
+		if(dma_fence->ops != &i915_gem_request_fops) {
+			ignore = false;
+			break;
+		}
+
+		req = container_of(dma_fence, typeof(*req), fence);
+
+		/* Can't ignore points on other rings: */
+		if (req->ring != ring) {
+			ignore = false;
+			break;
+		}
+
+		/* Same ring means guaranteed to be in order so ignore it. */
+	}
+
+	return ignore;
+}
+#endif
+
 int i915_gem_request_alloc(struct intel_engine_cs *ring,
 			   struct intel_context *ctx,
 			   struct drm_i915_gem_request **req_out)
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 923a3c4..b1a1659 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -26,6 +26,7 @@ 
  *
  */
 
+#include <linux/syscalls.h>
 #include <drm/drmP.h>
 #include <drm/i915_drm.h>
 #include "i915_drv.h"
@@ -33,6 +34,9 @@ 
 #include "intel_drv.h"
 #include <linux/dma_remapping.h>
 #include <linux/uaccess.h>
+#ifdef CONFIG_SYNC
+#include <../drivers/staging/android/sync.h>
+#endif
 
 #define  __EXEC_OBJECT_HAS_PIN (1<<31)
 #define  __EXEC_OBJECT_HAS_FENCE (1<<30)
@@ -1403,6 +1407,35 @@  eb_get_batch(struct eb_vmas *eb)
 	return vma->obj;
 }
 
+#ifdef CONFIG_SYNC
+static int i915_early_fence_wait(struct intel_engine_cs *ring, int fence_fd)
+{
+	struct sync_fence *fence;
+	int ret = 0;
+
+	if (fence_fd < 0) {
+		DRM_ERROR("Invalid wait fence fd %d on ring %d\n", fence_fd,
+			  (int) ring->id);
+		return 1;
+	}
+
+	fence = sync_fence_fdget(fence_fd);
+	if (fence == NULL) {
+		DRM_ERROR("Invalid wait fence %d on ring %d\n", fence_fd,
+			  (int) ring->id);
+		return 1;
+	}
+
+	if (atomic_read(&fence->status) == 0) {
+		if (!i915_safe_to_ignore_fence(ring, fence))
+			ret = sync_fence_wait(fence, 1000);
+	}
+
+	sync_fence_put(fence);
+	return ret;
+}
+#endif
+
 static int
 i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 		       struct drm_file *file,
@@ -1422,6 +1455,18 @@  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	u32 dispatch_flags;
 	int ret;
 	bool need_relocs;
+	int fd_fence_complete = -1;
+#ifdef CONFIG_SYNC
+	int fd_fence_wait = lower_32_bits(args->rsvd2);
+#endif
+
+	/*
+	 * Make sure an broken fence handle is not returned no matter
+	 * how early an error might be hit. Note that rsvd2 has to be
+	 * saved away first because it is also an input parameter!
+	 */
+	if (args->flags & I915_EXEC_CREATE_FENCE)
+		args->rsvd2 = (__u64) -1;
 
 	if (!i915_gem_check_execbuffer(args))
 		return -EINVAL;
@@ -1505,6 +1550,19 @@  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 		dispatch_flags |= I915_DISPATCH_RS;
 	}
 
+#ifdef CONFIG_SYNC
+	/*
+	 * Without a GPU scheduler, any fence waits must be done up front.
+	 */
+	if (args->flags & I915_EXEC_WAIT_FENCE) {
+		ret = i915_early_fence_wait(ring, fd_fence_wait);
+		if (ret < 0)
+			return ret;
+
+		args->flags &= ~I915_EXEC_WAIT_FENCE;
+	}
+#endif
+
 	intel_runtime_pm_get(dev_priv);
 
 	ret = i915_mutex_lock_interruptible(dev);
@@ -1652,6 +1710,27 @@  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	params->batch_obj               = batch_obj;
 	params->ctx                     = ctx;
 
+#ifdef CONFIG_SYNC
+	if (args->flags & I915_EXEC_CREATE_FENCE) {
+		/*
+		 * Caller has requested a sync fence.
+		 * User interrupts will be enabled to make sure that
+		 * the timeline is signalled on completion.
+		 */
+		ret = i915_create_sync_fence(params->request,
+					     &fd_fence_complete);
+		if (ret) {
+			DRM_ERROR("Fence creation failed for ring %d, ctx %p\n",
+				  ring->id, ctx);
+			args->rsvd2 = (__u64) -1;
+			goto err;
+		}
+
+		/* Return the fence through the rsvd2 field */
+		args->rsvd2 = (__u64) fd_fence_complete;
+	}
+#endif
+
 	ret = dev_priv->gt.execbuf_submit(params, args, &eb->vmas);
 
 err_batch_unpin:
@@ -1683,6 +1762,12 @@  pre_mutex_err:
 	/* intel_gpu_busy should also get a ref, so it will free when the device
 	 * is really idle. */
 	intel_runtime_pm_put(dev_priv);
+
+	if (fd_fence_complete != -1) {
+		sys_close(fd_fence_complete);
+		args->rsvd2 = (__u64) -1;
+	}
+
 	return ret;
 }
 
@@ -1788,11 +1873,6 @@  i915_gem_execbuffer2(struct drm_device *dev, void *data,
 		return -EINVAL;
 	}
 
-	if (args->rsvd2 != 0) {
-		DRM_DEBUG("dirty rvsd2 field\n");
-		return -EINVAL;
-	}
-
 	exec2_list = kmalloc(sizeof(*exec2_list)*args->buffer_count,
 			     GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY);
 	if (exec2_list == NULL)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 192027b..9dbf67e 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -250,7 +250,7 @@  typedef struct _drm_i915_sarea {
 #define DRM_IOCTL_I915_HWS_ADDR		DRM_IOW(DRM_COMMAND_BASE + DRM_I915_HWS_ADDR, struct drm_i915_gem_init)
 #define DRM_IOCTL_I915_GEM_INIT		DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_INIT, struct drm_i915_gem_init)
 #define DRM_IOCTL_I915_GEM_EXECBUFFER	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER, struct drm_i915_gem_execbuffer)
-#define DRM_IOCTL_I915_GEM_EXECBUFFER2	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER2, struct drm_i915_gem_execbuffer2)
+#define DRM_IOCTL_I915_GEM_EXECBUFFER2	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER2, struct drm_i915_gem_execbuffer2)
 #define DRM_IOCTL_I915_GEM_PIN		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_PIN, struct drm_i915_gem_pin)
 #define DRM_IOCTL_I915_GEM_UNPIN	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_UNPIN, struct drm_i915_gem_unpin)
 #define DRM_IOCTL_I915_GEM_BUSY		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_BUSY, struct drm_i915_gem_busy)
@@ -694,7 +694,7 @@  struct drm_i915_gem_exec_object2 {
 	__u64 flags;
 
 	__u64 rsvd1;
-	__u64 rsvd2;
+	__u64 rsvd2;	/* Used for fence fd */
 };
 
 struct drm_i915_gem_execbuffer2 {
@@ -775,7 +775,17 @@  struct drm_i915_gem_execbuffer2 {
  */
 #define I915_EXEC_RESOURCE_STREAMER     (1<<15)
 
-#define __I915_EXEC_UNKNOWN_FLAGS -(I915_EXEC_RESOURCE_STREAMER<<1)
+/** Caller supplies a sync fence fd in the rsvd2 field.
+ * Wait for it to be signalled before starting the work
+ */
+#define I915_EXEC_WAIT_FENCE		(1<<16)
+
+/** Caller wants a sync fence fd for this execbuffer.
+ *  It will be returned in rsvd2
+ */
+#define I915_EXEC_CREATE_FENCE		(1<<17)
+
+#define __I915_EXEC_UNKNOWN_FLAGS -(I915_EXEC_CREATE_FENCE<<1)
 
 #define I915_EXEC_CONTEXT_ID_MASK	(0xffffffff)
 #define i915_execbuffer2_set_context_id(eb2, context) \