Message ID | 20190830144726.18291-10-lionel.g.landwerlin@intel.com (mailing list archive)
State      | New, archived
Series     | drm/i915: Vulkan performance query support
Quoting Lionel Landwerlin (2019-08-30 15:47:24) > +static int > +get_execbuf_oa_config(struct i915_execbuffer *eb) > +{ > + int err = 0; > + > + eb->perf_file = NULL; > + eb->oa_config = NULL; > + eb->oa_vma = NULL; > + eb->oa_bo = NULL; > + > + if ((eb->extensions.flags & BIT_ULL(DRM_I915_GEM_EXECBUFFER_EXT_PERF)) == 0) > + return 0; > + > + eb->perf_file = fget(eb->extensions.perf_config.perf_fd); > + if (!eb->perf_file) > + return -EINVAL; > + > + err = i915_mutex_lock_interruptible(&eb->i915->drm); > + if (err) { > + fput(eb->perf_file); > + eb->perf_file = NULL; > + return err; > + } > + > + if (eb->perf_file->private_data != eb->i915->perf.exclusive_stream) > + err = -EINVAL; > + > + mutex_unlock(&eb->i915->drm.struct_mutex); So why can i915->perf.execlusive_stream not change after this point? > + > + if (!err) { > + err = i915_perf_get_oa_config_and_bo( > + eb->i915->perf.exclusive_stream, > + eb->extensions.perf_config.oa_config, > + &eb->oa_config, &eb->oa_bo); > + } > + > + return err; > +} > + > static int __reloc_gpu_alloc(struct i915_execbuffer *eb, > struct i915_vma *vma, > unsigned int len) > @@ -2051,6 +2098,50 @@ add_to_client(struct i915_request *rq, struct drm_file *file) > spin_unlock(&file_priv->mm.lock); > } > > +static int eb_oa_config(struct i915_execbuffer *eb) > +{ > + struct i915_perf_stream *perf_stream; > + int err; > + > + if (!eb->oa_config) > + return 0; > + > + perf_stream = eb->perf_file->private_data; > + > + err = mutex_lock_interruptible(&perf_stream->config_mutex); > + if (err) > + return err; > + > + err = i915_active_request_set(&perf_stream->active_config_rq, > + eb->request); > + if (err) > + goto out; > + > + /* > + * If the config hasn't changed, skip reconfiguring the HW (this is > + * subject to a delay we want to avoid has much as possible). > + */ > + if (eb->oa_config == perf_stream->oa_config) > + goto out; > + We need to wait for any exclusive fences in case we migrate this object in future: i915_vma_lock(eb->oa_vma); err = i915_request_await_object(eb->request, eb->oa_vma->obj, false); /* await_resv already! */ if (err = 0) err = i915_vma_move_to_active(eb->oa_vma, eb->request, 0); i915_vma_unlock(eb->oa_vma); Though this raises an interesting point, we do not actually want to emit a semaphore here (albeit the engine is fixed atm) as we are after the point where we have declared all semaphore waits completed. (Not an issue to worry about right now!) > + if (err) > + goto out; > + > + err = eb->engine->emit_bb_start(eb->request, > + eb->oa_vma->node.start, > + 0, I915_DISPATCH_SECURE); > + if (err) > + goto out; > + > + swap(eb->oa_config, perf_stream->oa_config); > + > +out: > + mutex_unlock(&perf_stream->config_mutex); > + > + return err; > +} > + > static int eb_submit(struct i915_execbuffer *eb) > { > int err; > @@ -2077,6 +2168,10 @@ static int eb_submit(struct i915_execbuffer *eb) > return err; > } > > + err = eb_oa_config(eb); > + if (err) > + return err; Ok, definitely needs to be after the waits! 
> + > err = eb->engine->emit_bb_start(eb->request, > eb->batch->node.start + > eb->batch_start_offset, > @@ -2643,8 +2738,25 @@ static int parse_timeline_fences(struct i915_user_extension __user *ext, void *d > return 0; > } > > +static int parse_perf_config(struct i915_user_extension __user *ext, void *data) > +{ > + struct i915_execbuffer *eb = data; > + > + if (eb->extensions.flags & BIT_ULL(DRM_I915_GEM_EXECBUFFER_EXT_PERF)) > + return -EINVAL; > + > + if (copy_from_user(&eb->extensions.perf_config, ext, > + sizeof(eb->extensions.perf_config))) > + return -EFAULT; > + > + eb->extensions.flags |= BIT_ULL(DRM_I915_GEM_EXECBUFFER_EXT_PERF); > + > + return 0; > +} > + > static const i915_user_extension_fn execbuf_extensions[] = { > [DRM_I915_GEM_EXECBUFFER_EXT_TIMELINE_FENCES] = parse_timeline_fences, > + [DRM_I915_GEM_EXECBUFFER_EXT_PERF] = parse_perf_config, > }; > > static int > @@ -2755,10 +2867,14 @@ i915_gem_do_execbuffer(struct drm_device *dev, > } > } > > - err = eb_create(&eb); > + err = get_execbuf_oa_config(&eb); > if (err) > goto err_out_fence; > > + err = eb_create(&eb); > + if (err) > + goto err_oa_config; > + > GEM_BUG_ON(!eb.lut_size); > > err = eb_select_context(&eb); > @@ -2769,6 +2885,16 @@ i915_gem_do_execbuffer(struct drm_device *dev, > if (unlikely(err)) > goto err_context; > > + if (eb.extensions.flags & BIT_ULL(DRM_I915_GEM_EXECBUFFER_EXT_PERF)) { > + struct i915_perf_stream *perf_stream = > + eb.perf_file->private_data; > + > + if (perf_stream->engine != eb.engine) { > + err = -EINVAL; > + goto err_engine; > + } > + } > + > err = i915_mutex_lock_interruptible(dev); > if (err) > goto err_engine; > @@ -2889,6 +3015,20 @@ i915_gem_do_execbuffer(struct drm_device *dev, > } > } > > + if (eb.extensions.flags & BIT_ULL(DRM_I915_GEM_EXECBUFFER_EXT_PERF)) { > + eb.oa_vma = i915_vma_instance(eb.oa_bo, > + &eb.engine->i915->ggtt.vm, NULL); eb.engine->gt->ggtt.vm The i915_vma_instance() can be created outside of the struct_mutex, and once we have the oa_vma, we don't need to keep a oa_bo pointer. Later we can do the i915_vma_pin() before struct_mutex as well. Thanks, that's looking a better with regard the plan to eliminate struct_mutex. -Chris
On 30/08/2019 18:32, Chris Wilson wrote: > Quoting Lionel Landwerlin (2019-08-30 15:47:24) >> +static int >> +get_execbuf_oa_config(struct i915_execbuffer *eb) >> +{ >> + int err = 0; >> + >> + eb->perf_file = NULL; >> + eb->oa_config = NULL; >> + eb->oa_vma = NULL; >> + eb->oa_bo = NULL; >> + >> + if ((eb->extensions.flags & BIT_ULL(DRM_I915_GEM_EXECBUFFER_EXT_PERF)) == 0) >> + return 0; >> + >> + eb->perf_file = fget(eb->extensions.perf_config.perf_fd); >> + if (!eb->perf_file) >> + return -EINVAL; >> + >> + err = i915_mutex_lock_interruptible(&eb->i915->drm); >> + if (err) { >> + fput(eb->perf_file); >> + eb->perf_file = NULL; >> + return err; >> + } >> + >> + if (eb->perf_file->private_data != eb->i915->perf.exclusive_stream) >> + err = -EINVAL; >> + >> + mutex_unlock(&eb->i915->drm.struct_mutex); > So why can i915->perf.execlusive_stream not change after this point? It changes only when the last FD reference is dropped, so if we got it here there is at least one ref from the app doing the ioctl and we grab another ref on it which hold onto until we return. > >> + >> + if (!err) { >> + err = i915_perf_get_oa_config_and_bo( >> + eb->i915->perf.exclusive_stream, >> + eb->extensions.perf_config.oa_config, >> + &eb->oa_config, &eb->oa_bo); >> + } >> + >> + return err; >> +} >> + >> static int __reloc_gpu_alloc(struct i915_execbuffer *eb, >> struct i915_vma *vma, >> unsigned int len) >> @@ -2051,6 +2098,50 @@ add_to_client(struct i915_request *rq, struct drm_file *file) >> spin_unlock(&file_priv->mm.lock); >> } >> >> +static int eb_oa_config(struct i915_execbuffer *eb) >> +{ >> + struct i915_perf_stream *perf_stream; >> + int err; >> + >> + if (!eb->oa_config) >> + return 0; >> + >> + perf_stream = eb->perf_file->private_data; >> + >> + err = mutex_lock_interruptible(&perf_stream->config_mutex); >> + if (err) >> + return err; >> + >> + err = i915_active_request_set(&perf_stream->active_config_rq, >> + eb->request); >> + if (err) >> + goto out; >> + >> + /* >> + * If the config hasn't changed, skip reconfiguring the HW (this is >> + * subject to a delay we want to avoid has much as possible). >> + */ >> + if (eb->oa_config == perf_stream->oa_config) >> + goto out; >> + > We need to wait for any exclusive fences in case we migrate this object > in future: > > i915_vma_lock(eb->oa_vma); > err = i915_request_await_object(eb->request, eb->oa_vma->obj, false); /* await_resv already! */ > if (err = 0) > err = i915_vma_move_to_active(eb->oa_vma, eb->request, 0); > i915_vma_unlock(eb->oa_vma); > > Though this raises an interesting point, we do not actually want to emit > a semaphore here (albeit the engine is fixed atm) as we are after the > point where we have declared all semaphore waits completed. (Not an > issue to worry about right now!) If we do that for this VMA, don't we have to do it for any? If that's happening in a different function, I could put it there. > >> + if (err) >> + goto out; >> + >> + err = eb->engine->emit_bb_start(eb->request, >> + eb->oa_vma->node.start, >> + 0, I915_DISPATCH_SECURE); >> + if (err) >> + goto out; >> + >> + swap(eb->oa_config, perf_stream->oa_config); >> + >> +out: >> + mutex_unlock(&perf_stream->config_mutex); >> + >> + return err; >> +} >> + >> static int eb_submit(struct i915_execbuffer *eb) >> { >> int err; >> @@ -2077,6 +2168,10 @@ static int eb_submit(struct i915_execbuffer *eb) >> return err; >> } >> >> + err = eb_oa_config(eb); >> + if (err) >> + return err; > Ok, definitely needs to be after the waits! 
> >> + >> err = eb->engine->emit_bb_start(eb->request, >> eb->batch->node.start + >> eb->batch_start_offset, >> @@ -2643,8 +2738,25 @@ static int parse_timeline_fences(struct i915_user_extension __user *ext, void *d >> return 0; >> } >> >> +static int parse_perf_config(struct i915_user_extension __user *ext, void *data) >> +{ >> + struct i915_execbuffer *eb = data; >> + >> + if (eb->extensions.flags & BIT_ULL(DRM_I915_GEM_EXECBUFFER_EXT_PERF)) >> + return -EINVAL; >> + >> + if (copy_from_user(&eb->extensions.perf_config, ext, >> + sizeof(eb->extensions.perf_config))) >> + return -EFAULT; >> + >> + eb->extensions.flags |= BIT_ULL(DRM_I915_GEM_EXECBUFFER_EXT_PERF); >> + >> + return 0; >> +} >> + >> static const i915_user_extension_fn execbuf_extensions[] = { >> [DRM_I915_GEM_EXECBUFFER_EXT_TIMELINE_FENCES] = parse_timeline_fences, >> + [DRM_I915_GEM_EXECBUFFER_EXT_PERF] = parse_perf_config, >> }; >> >> static int >> @@ -2755,10 +2867,14 @@ i915_gem_do_execbuffer(struct drm_device *dev, >> } >> } >> >> - err = eb_create(&eb); >> + err = get_execbuf_oa_config(&eb); >> if (err) >> goto err_out_fence; >> >> + err = eb_create(&eb); >> + if (err) >> + goto err_oa_config; >> + >> GEM_BUG_ON(!eb.lut_size); >> >> err = eb_select_context(&eb); >> @@ -2769,6 +2885,16 @@ i915_gem_do_execbuffer(struct drm_device *dev, >> if (unlikely(err)) >> goto err_context; >> >> + if (eb.extensions.flags & BIT_ULL(DRM_I915_GEM_EXECBUFFER_EXT_PERF)) { >> + struct i915_perf_stream *perf_stream = >> + eb.perf_file->private_data; >> + >> + if (perf_stream->engine != eb.engine) { >> + err = -EINVAL; >> + goto err_engine; >> + } >> + } >> + >> err = i915_mutex_lock_interruptible(dev); >> if (err) >> goto err_engine; >> @@ -2889,6 +3015,20 @@ i915_gem_do_execbuffer(struct drm_device *dev, >> } >> } >> >> + if (eb.extensions.flags & BIT_ULL(DRM_I915_GEM_EXECBUFFER_EXT_PERF)) { >> + eb.oa_vma = i915_vma_instance(eb.oa_bo, >> + &eb.engine->i915->ggtt.vm, NULL); > eb.engine->gt->ggtt.vm Thanks! > > The i915_vma_instance() can be created outside of the struct_mutex, and > once we have the oa_vma, we don't need to keep a oa_bo pointer. > > Later we can do the i915_vma_pin() before struct_mutex as well. > > Thanks, that's looking a better with regard the plan to eliminate > struct_mutex. > -Chris >
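[Editorial note] For readers following the exchange above: the ordering Chris suggests takes the vma lock, waits on the OA config object's reservation, then marks the vma active for this request. His snippet as quoted reads `if (err = 0)`, an assignment where a comparison was clearly intended; a corrected sketch of the same idea (using the helpers named in the review, assuming eb->oa_vma is already pinned at this point) would be:

	/*
	 * Sketch of the suggested serialization, not part of the posted patch:
	 * wait on the object's exclusive fences, then mark the OA config vma
	 * active for this request.
	 */
	i915_vma_lock(eb->oa_vma);
	err = i915_request_await_object(eb->request, eb->oa_vma->obj, false);
	if (err == 0)
		err = i915_vma_move_to_active(eb->oa_vma, eb->request, 0);
	i915_vma_unlock(eb->oa_vma);
	if (err)
		goto out;

The v8 patch as posted (see the diff below) keeps only the i915_vma_move_to_active() call; the explicit await on the object is the part being requested in the review.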
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 46ad8d9642d1..0037bf8c2e6f 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -24,6 +24,7 @@
 #include "i915_gem_clflush.h"
 #include "i915_gem_context.h"
 #include "i915_gem_ioctls.h"
+#include "i915_perf.h"
 #include "i915_trace.h"
 #include "i915_user_extensions.h"
@@ -284,7 +285,13 @@ struct i915_execbuffer {
 	struct {
 		u64 flags; /** Available extensions parameters */
 		struct drm_i915_gem_execbuffer_ext_timeline_fences timeline_fences;
+		struct drm_i915_gem_execbuffer_ext_perf perf_config;
 	} extensions;
+
+	struct file *perf_file;
+	struct i915_oa_config *oa_config; /** HW configuration for OA, NULL if not needed. */
+	struct drm_i915_gem_object *oa_bo;
+	struct i915_vma *oa_vma;
 };

 #define exec_entry(EB, VMA) (&(EB)->exec[(VMA)->exec_flags - (EB)->flags])
@@ -1152,6 +1159,46 @@ static int reloc_move_to_gpu(struct i915_request *rq, struct i915_vma *vma)
 	return err;
 }
+
+static int
+get_execbuf_oa_config(struct i915_execbuffer *eb)
+{
+	int err = 0;
+
+	eb->perf_file = NULL;
+	eb->oa_config = NULL;
+	eb->oa_vma = NULL;
+	eb->oa_bo = NULL;
+
+	if ((eb->extensions.flags & BIT_ULL(DRM_I915_GEM_EXECBUFFER_EXT_PERF)) == 0)
+		return 0;
+
+	eb->perf_file = fget(eb->extensions.perf_config.perf_fd);
+	if (!eb->perf_file)
+		return -EINVAL;
+
+	err = i915_mutex_lock_interruptible(&eb->i915->drm);
+	if (err) {
+		fput(eb->perf_file);
+		eb->perf_file = NULL;
+		return err;
+	}
+
+	if (eb->perf_file->private_data != eb->i915->perf.exclusive_stream)
+		err = -EINVAL;
+
+	mutex_unlock(&eb->i915->drm.struct_mutex);
+
+	if (!err) {
+		err = i915_perf_get_oa_config_and_bo(
+			eb->i915->perf.exclusive_stream,
+			eb->extensions.perf_config.oa_config,
+			&eb->oa_config, &eb->oa_bo);
+	}
+
+	return err;
+}
+
 static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
 			     struct i915_vma *vma,
 			     unsigned int len)
@@ -2051,6 +2098,50 @@ add_to_client(struct i915_request *rq, struct drm_file *file)
 	spin_unlock(&file_priv->mm.lock);
 }
+static int eb_oa_config(struct i915_execbuffer *eb)
+{
+	struct i915_perf_stream *perf_stream;
+	int err;
+
+	if (!eb->oa_config)
+		return 0;
+
+	perf_stream = eb->perf_file->private_data;
+
+	err = mutex_lock_interruptible(&perf_stream->config_mutex);
+	if (err)
+		return err;
+
+	err = i915_active_request_set(&perf_stream->active_config_rq,
+				      eb->request);
+	if (err)
+		goto out;
+
+	/*
+	 * If the config hasn't changed, skip reconfiguring the HW (this is
+	 * subject to a delay we want to avoid as much as possible).
+	 */
+	if (eb->oa_config == perf_stream->oa_config)
+		goto out;
+
+	err = i915_vma_move_to_active(eb->oa_vma, eb->request, 0);
+	if (err)
+		goto out;
+
+	err = eb->engine->emit_bb_start(eb->request,
+					eb->oa_vma->node.start,
+					0, I915_DISPATCH_SECURE);
+	if (err)
+		goto out;
+
+	swap(eb->oa_config, perf_stream->oa_config);
+
+out:
+	mutex_unlock(&perf_stream->config_mutex);
+
+	return err;
+}
+
 static int eb_submit(struct i915_execbuffer *eb)
 {
 	int err;
@@ -2077,6 +2168,10 @@ static int eb_submit(struct i915_execbuffer *eb)
 		return err;
 	}

+	err = eb_oa_config(eb);
+	if (err)
+		return err;
+
 	err = eb->engine->emit_bb_start(eb->request,
 					eb->batch->node.start +
 					eb->batch_start_offset,
@@ -2643,8 +2738,25 @@ static int parse_timeline_fences(struct i915_user_extension __user *ext, void *d
 	return 0;
 }
+static int parse_perf_config(struct i915_user_extension __user *ext, void *data)
+{
+	struct i915_execbuffer *eb = data;
+
+	if (eb->extensions.flags & BIT_ULL(DRM_I915_GEM_EXECBUFFER_EXT_PERF))
+		return -EINVAL;
+
+	if (copy_from_user(&eb->extensions.perf_config, ext,
+			   sizeof(eb->extensions.perf_config)))
+		return -EFAULT;
+
+	eb->extensions.flags |= BIT_ULL(DRM_I915_GEM_EXECBUFFER_EXT_PERF);
+
+	return 0;
+}
+
 static const i915_user_extension_fn execbuf_extensions[] = {
 	[DRM_I915_GEM_EXECBUFFER_EXT_TIMELINE_FENCES] = parse_timeline_fences,
+	[DRM_I915_GEM_EXECBUFFER_EXT_PERF] = parse_perf_config,
 };

 static int
@@ -2755,10 +2867,14 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 		}
 	}

-	err = eb_create(&eb);
+	err = get_execbuf_oa_config(&eb);
 	if (err)
 		goto err_out_fence;

+	err = eb_create(&eb);
+	if (err)
+		goto err_oa_config;
+
 	GEM_BUG_ON(!eb.lut_size);

 	err = eb_select_context(&eb);
@@ -2769,6 +2885,16 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	if (unlikely(err))
 		goto err_context;

+	if (eb.extensions.flags & BIT_ULL(DRM_I915_GEM_EXECBUFFER_EXT_PERF)) {
+		struct i915_perf_stream *perf_stream =
+			eb.perf_file->private_data;
+
+		if (perf_stream->engine != eb.engine) {
+			err = -EINVAL;
+			goto err_engine;
+		}
+	}
+
 	err = i915_mutex_lock_interruptible(dev);
 	if (err)
 		goto err_engine;
@@ -2889,6 +3015,20 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 		}
 	}

+	if (eb.extensions.flags & BIT_ULL(DRM_I915_GEM_EXECBUFFER_EXT_PERF)) {
+		eb.oa_vma = i915_vma_instance(eb.oa_bo,
+					      &eb.engine->i915->ggtt.vm, NULL);
+		if (unlikely(IS_ERR(eb.oa_vma))) {
+			err = PTR_ERR(eb.oa_vma);
+			eb.oa_vma = NULL;
+			goto err_request;
+		}
+
+		err = i915_vma_pin(eb.oa_vma, 0, 0, PIN_GLOBAL);
+		if (err)
+			goto err_request;
+	}
+
 	/*
 	 * Whilst this request exists, batch_obj will be on the
 	 * active_list, and so will hold the active reference. Only when this
@@ -2935,6 +3075,15 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	i915_gem_context_put(eb.gem_context);
 err_destroy:
 	eb_destroy(&eb);
+err_oa_config:
+	if (eb.perf_file)
+		fput(eb.perf_file);
+	if (eb.oa_config) {
+		i915_gem_object_put(eb.oa_bo);
+		i915_oa_config_put(eb.oa_config);
+	}
+	if (eb.oa_vma)
+		i915_vma_unpin(eb.oa_vma);
 err_out_fence:
 	if (out_fence_fd != -1)
 		put_unused_fd(out_fence_fd);
diff --git a/drivers/gpu/drm/i915/i915_getparam.c b/drivers/gpu/drm/i915/i915_getparam.c
index bd41cc5ce906..39d4c2c2e0f4 100644
--- a/drivers/gpu/drm/i915/i915_getparam.c
+++ b/drivers/gpu/drm/i915/i915_getparam.c
@@ -161,6 +161,10 @@ int i915_getparam_ioctl(struct drm_device *dev, void *data,
 	case I915_PARAM_PERF_REVISION:
 		value = i915_perf_ioctl_version();
 		break;
+	case I915_PARAM_HAS_EXEC_PERF_CONFIG:
+		/* Obviously requires perf support. */
+		value = i915->perf.initialized;
+		break;
 	default:
 		DRM_DEBUG("Unknown parameter %d\n", param->param);
 		return -EINVAL;
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index e98c9a7baa91..3166c9ca85f3 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -624,6 +624,16 @@ typedef struct drm_i915_irq_wait {
  */
 #define I915_PARAM_PERF_REVISION	55

+/*
+ * Request an i915/perf performance configuration change before running the
+ * commands given in an execbuf.
+ *
+ * Performance configuration ID and the file descriptor of the i915 perf
+ * stream are given through drm_i915_gem_execbuffer_ext_perf. See
+ * I915_EXEC_EXT.
+ */
+#define I915_PARAM_HAS_EXEC_PERF_CONFIG 56
+
 /* Must be kept compact -- no holes and well documented */

 typedef struct drm_i915_getparam {
@@ -1026,6 +1036,12 @@ enum drm_i915_gem_execbuffer_ext {
	 */
	DRM_I915_GEM_EXECBUFFER_EXT_TIMELINE_FENCES = 1,

+	/**
+	 * This identifier is associated with
+	 * drm_i915_gem_execbuffer_perf_ext.
+	 */
+	DRM_I915_GEM_EXECBUFFER_EXT_PERF,
+
	DRM_I915_GEM_EXECBUFFER_EXT_MAX /* non-ABI */
 };

@@ -1056,6 +1072,29 @@ struct drm_i915_gem_execbuffer_ext_timeline_fences {
	__u64 values_ptr;
 };

+struct drm_i915_gem_execbuffer_ext_perf {
+	struct i915_user_extension base;
+
+	/**
+	 * Performance file descriptor returned by DRM_IOCTL_I915_PERF_OPEN.
+	 * This is used to identify that the application requesting a HW
+	 * performance configuration change actually has a right to do so
+	 * because it also has access to the i915-perf stream.
+	 */
+	__s32 perf_fd;
+
+	/**
+	 * Unused for now. Must be cleared to zero.
+	 */
+	__u32 pad;
+
+	/**
+	 * OA configuration ID to switch to before executing the commands
+	 * associated with the execbuf.
+	 */
+	__u64 oa_config;
+};
+
 struct drm_i915_gem_execbuffer2 {
	/**
	 * List of gem_exec_object2 structs
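[Editorial note] To illustrate the uAPI additions above, here is a rough userspace sketch. It assumes the extension chain is passed the same way as the timeline-fences extension earlier in this series (first i915_user_extension pointer in cliprects_ptr, gated by an I915_EXEC_EXT flag); exec_with_oa_config(), perf_fd and oa_config_id are hypothetical names, and error handling is minimal.

	#include <stdint.h>
	#include <errno.h>
	#include <xf86drm.h>
	#include <drm/i915_drm.h>

	/* Submit one batch with an OA configuration switch chained in.
	 * Assumes the proposed extension is passed via cliprects_ptr + I915_EXEC_EXT. */
	static int exec_with_oa_config(int drm_fd, int perf_fd, uint64_t oa_config_id,
				       struct drm_i915_gem_exec_object2 *objects,
				       uint32_t nr_objects, uint64_t engine_flags)
	{
		struct drm_i915_gem_execbuffer_ext_perf perf_ext = {
			.base.name = DRM_I915_GEM_EXECBUFFER_EXT_PERF,
			.base.next_extension = 0,   /* end of chain */
			.perf_fd = perf_fd,         /* fd from DRM_IOCTL_I915_PERF_OPEN */
			.oa_config = oa_config_id,  /* OA configuration (metric set) ID */
		};
		struct drm_i915_gem_execbuffer2 execbuf = {
			.buffers_ptr = (uintptr_t)objects,
			.buffer_count = nr_objects,
			.cliprects_ptr = (uintptr_t)&perf_ext,  /* extension chain */
			.flags = engine_flags | I915_EXEC_EXT,
		};

		if (drmIoctl(drm_fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf))
			return -errno;  /* -EINVAL if perf_fd is not the exclusive stream */
		return 0;
	}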
We want the ability to dispatch a set of command buffers to the hardware,
each with a different OA configuration. To achieve this, we reuse a couple
of fields from the execbuf2 struct (I CAN HAZ execbuf3?) to specify which
OA configuration should be used for a batch buffer. This requires the
process making the execbuf with this flag to also own the perf fd at the
time of execbuf.

v2: Add an emit_oa_config() vfunc in the intel_engine_cs (Chris)
    Move oa_config vma to active (Chris)

v3: Don't drop the lock for engine lookup (Chris)
    Move OA config vma to active before writing the ringbuffer (Chris)

v4: Reuse i915_user_extension_fn
    Serialize requests with OA config updates

v5: Check that the chained extension is only present once (Chris)
    Unpin oa_vma in main path (Chris)

v6: Use BIT_ULL (Chris)

v7: Hold drm.struct_mutex when serializing the request with OA config (Chris)

v8: Remove active request from engine (Lionel)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
---
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c | 151 +++++++++++++++++-
 drivers/gpu/drm/i915/i915_getparam.c       |   4 +
 include/uapi/drm/i915_drm.h                |  39 +++++
 3 files changed, 193 insertions(+), 1 deletion(-)
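[Editorial note] Before using the extension, userspace would presumably gate on the I915_PARAM_HAS_EXEC_PERF_CONFIG parameter added by this patch. A minimal probe sketch follows; it is standard GETPARAM usage, with only the parameter itself coming from this series.

	#include <xf86drm.h>
	#include <drm/i915_drm.h>

	/* Returns non-zero when the kernel advertises execbuf perf-config support. */
	static int has_exec_perf_config(int drm_fd)
	{
		int value = 0;
		struct drm_i915_getparam gp = {
			.param = I915_PARAM_HAS_EXEC_PERF_CONFIG,
			.value = &value,
		};

		if (drmIoctl(drm_fd, DRM_IOCTL_I915_GETPARAM, &gp))
			return 0;  /* older kernel: parameter unknown */
		return value;
	}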