Message ID | 1435742841-28454-1-git-send-email-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Wed, Jul 01, 2015 at 10:27:21AM +0100, Chris Wilson wrote: > Dave Gordon made the good suggestion that once the ringbuffers were > setup, the actual queuing of commands to program the initial GPU state > could be deferred. Since that initial state contains instructions for > setting up the first power context, we want to execute that as earlier > as possible, preferrably in the background to userspace. Then when > userspace does wake up, the first time it opens the device we just need > to flush the work to be sure that our commands are queued before any of > userspace's. (Hooking into the device open should mean we have to check > less often than say hooking into execbuffer.) > > Suggested-by: Dave Gordon <david.s.gordon@intel.com> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > Cc: Dave Gordon <david.s.gordon@intel.com> Just before this gets a bit out of hand with various patches floating around ... I really meant it when I said that we should have a proper design discussion about this in Jesse's meeting first. Looking at all the ideas between you, Dave & me I count about 3-4 approaches to async gem init, and all have upsides and downsides. Aside from that I concur that if we do async gem init then it better be a worker and not relying on some abitrary userspace ioctl/syscall. Of course we'd still need to place proper synchronization points at a good place (flush_work in gem_open for Dave's design), but that's really orthogonal to running it in a worker imo. 
-Daniel > --- > drivers/gpu/drm/i915/i915_drv.h | 2 + > drivers/gpu/drm/i915/i915_gem.c | 113 +++++++++++++++++++++++++++------------- > 2 files changed, 79 insertions(+), 36 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index 3ea1fe8db63e..d4003dea97eb 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -1938,6 +1938,8 @@ struct drm_i915_private { > > bool edp_low_vswing; > > + struct work_struct init_hw_late; > + > /* > * NOTE: This is the dri1/ums dungeon, don't add stuff here. Your patch > * will be rejected. Instead look for a better place. > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c > index 2f0fed1b9dd7..7efa71f8edd7 100644 > --- a/drivers/gpu/drm/i915/i915_gem.c > +++ b/drivers/gpu/drm/i915/i915_gem.c > @@ -5140,12 +5140,76 @@ cleanup_render_ring: > return ret; > } > > +static int > +i915_gem_init_hw_late(struct drm_i915_private *dev_priv) > +{ > + struct intel_engine_cs *ring; > + int i, j; > + > + for_each_ring(ring, dev_priv, i) { > + struct drm_i915_gem_request *req; > + int ret; > + > + if (WARN_ON(!ring->default_context)) { > + ret = -ENODEV; > + goto err; > + } > + > + req = i915_gem_request_alloc(ring, ring->default_context); > + if (IS_ERR(req)) { > + ret = PTR_ERR(req); > + goto err; > + } > + > + if (ring->id == RCS) { > + for (j = 0; j < NUM_L3_SLICES(dev_priv); j++) > + i915_gem_l3_remap(req, j); > + } > + > + ret = i915_ppgtt_init_ring(req); > + if (ret) { > + DRM_ERROR("PPGTT enable ring #%d failed %d\n", i, ret); > + goto err_req; > + } > + > + ret = i915_gem_context_enable(req); > + if (ret) { > + DRM_ERROR("Context enable ring #%d failed %d\n", i, ret); > + goto err_req; > + } > + > + i915_add_request_no_flush(req); > + continue; > + > +err_req: > + i915_gem_request_cancel(req); > +err: > + return ret; > + } > + > + return 0; > +} > + > +static void > +i915_gem_init_hw_worker(struct work_struct *work) > +{ > + 
struct drm_i915_private *dev_priv = > + container_of(work, typeof(*dev_priv), init_hw_late); > + mutex_lock(&dev_priv->dev->struct_mutex); > + if (i915_gem_init_hw_late(dev_priv)) { > + DRM_ERROR("Failed to initialize GPU, declaring it wedged\n"); > + atomic_set_mask(I915_WEDGED, > + &dev_priv->gpu_error.reset_counter); > + } > + mutex_unlock(&dev_priv->dev->struct_mutex); > +} > + > int > i915_gem_init_hw(struct drm_device *dev) > { > struct drm_i915_private *dev_priv = dev->dev_private; > struct intel_engine_cs *ring; > - int ret, i, j; > + int ret, i; > > if (INTEL_INFO(dev)->gen < 6 && !intel_enable_gtt()) > return -EIO; > @@ -5198,41 +5262,10 @@ i915_gem_init_hw(struct drm_device *dev) > } > > /* Now it is safe to go back round and do everything else: */ > - for_each_ring(ring, dev_priv, i) { > - struct drm_i915_gem_request *req; > - > - WARN_ON(!ring->default_context); > - > - req = i915_gem_request_alloc(ring, ring->default_context); > - if (IS_ERR(req)) { > - ret = PTR_ERR(req); > - i915_gem_cleanup_ringbuffer(dev); > - goto out; > - } > - > - if (ring->id == RCS) { > - for (j = 0; j < NUM_L3_SLICES(dev); j++) > - i915_gem_l3_remap(req, j); > - } > - > - ret = i915_ppgtt_init_ring(req); > - if (ret && ret != -EIO) { > - DRM_ERROR("PPGTT enable ring #%d failed %d\n", i, ret); > - i915_gem_request_cancel(req); > - i915_gem_cleanup_ringbuffer(dev); > - goto out; > - } > - > - ret = i915_gem_context_enable(req); > - if (ret && ret != -EIO) { > - DRM_ERROR("Context enable ring #%d failed %d\n", i, ret); > - i915_gem_request_cancel(req); > - i915_gem_cleanup_ringbuffer(dev); > - goto out; > - } > - > - i915_add_request_no_flush(req); > - } > + if (dev->open_count == 0) /* uncontested with userspace, i.e. 
boot */ > + queue_work(dev_priv->wq, &dev_priv->init_hw_late); > + else > + ret = i915_gem_init_hw_late(dev_priv); > > out: > intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL); > @@ -5379,6 +5412,7 @@ i915_gem_load(struct drm_device *dev) > init_ring_lists(&dev_priv->ring[i]); > for (i = 0; i < I915_MAX_NUM_FENCES; i++) > INIT_LIST_HEAD(&dev_priv->fence_regs[i].lru_list); > + INIT_WORK(&dev_priv->init_hw_late, i915_gem_init_hw_worker); > INIT_DELAYED_WORK(&dev_priv->mm.retire_work, > i915_gem_retire_work_handler); > INIT_DELAYED_WORK(&dev_priv->mm.idle_work, > @@ -5442,11 +5476,18 @@ void i915_gem_release(struct drm_device *dev, struct drm_file *file) > > int i915_gem_open(struct drm_device *dev, struct drm_file *file) > { > + struct drm_i915_private *dev_priv = to_i915(dev); > struct drm_i915_file_private *file_priv; > int ret; > > DRM_DEBUG_DRIVER("\n"); > > + /* Flush ring initialisation before userspace can submit its own > + * batches, so the hardware initialisation commands are queued > + * first. > + */ > + flush_work(&dev_priv->init_hw_late); > + > file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL); > if (!file_priv) > return -ENOMEM; > -- > 2.1.4 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
On Wed, Jul 01, 2015 at 03:07:18PM +0200, Daniel Vetter wrote: > On Wed, Jul 01, 2015 at 10:27:21AM +0100, Chris Wilson wrote: > > Dave Gordon made the good suggestion that once the ringbuffers were > > setup, the actual queuing of commands to program the initial GPU state > > could be deferred. Since that initial state contains instructions for > > setting up the first power context, we want to execute that as earlier > > as possible, preferrably in the background to userspace. Then when > > userspace does wake up, the first time it opens the device we just need > > to flush the work to be sure that our commands are queued before any of > > userspace's. (Hooking into the device open should mean we have to check > > less often than say hooking into execbuffer.) > > > > Suggested-by: Dave Gordon <david.s.gordon@intel.com> > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > > Cc: Dave Gordon <david.s.gordon@intel.com> > > Just before this gets a bit out of hand with various patches floating > around ... I really meant it when I said that we should have a proper > design discussion about this in Jesse's meeting first. What more is there to design? Asynchronously loading the submission port is orthogonal to the task of queuing requests for it, and need not block request construction (be it kernel or userspace). Dave just identified some work that we didn't need to do during module load. I don't think he would propose using it for loading guc firmware, that would just be silly... -Chris
On Wed, Jul 01, 2015 at 02:17:28PM +0100, Chris Wilson wrote: > On Wed, Jul 01, 2015 at 03:07:18PM +0200, Daniel Vetter wrote: > > On Wed, Jul 01, 2015 at 10:27:21AM +0100, Chris Wilson wrote: > > > Dave Gordon made the good suggestion that once the ringbuffers were > > > setup, the actual queuing of commands to program the initial GPU state > > > could be deferred. Since that initial state contains instructions for > > > setting up the first power context, we want to execute that as earlier > > > as possible, preferrably in the background to userspace. Then when > > > userspace does wake up, the first time it opens the device we just need > > > to flush the work to be sure that our commands are queued before any of > > > userspace's. (Hooking into the device open should mean we have to check > > > less often than say hooking into execbuffer.) > > > > > > Suggested-by: Dave Gordon <david.s.gordon@intel.com> > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > > > Cc: Dave Gordon <david.s.gordon@intel.com> > > > > Just before this gets a bit out of hand with various patches floating > > around ... I really meant it when I said that we should have a proper > > design discussion about this in Jesse's meeting first. > > What more is there to design? Asynchronously loading the submission port > is orthogonal to the task of queuing requests for it, and need not block > request construction (be it kernel or userspace). Dave just identified > some work that we didn't need to do during module load. I don't think he > would propose using it for loading guc firmware, that would just be > silly... set_wedged in your patch doesn't have the wakeup to kick waiters. And maybe we want to be somewhat more synchronous with with init fail than gpu hangs, for userspace to make better decisions. Also we still have that issue that sometimes an -EIO escapes into modeset code. And yes this is mean to provide the async init for the request firmware. -Daniel
On Wed, Jul 01, 2015 at 04:07:08PM +0200, Daniel Vetter wrote: > On Wed, Jul 01, 2015 at 02:17:28PM +0100, Chris Wilson wrote: > > On Wed, Jul 01, 2015 at 03:07:18PM +0200, Daniel Vetter wrote: > > > On Wed, Jul 01, 2015 at 10:27:21AM +0100, Chris Wilson wrote: > > > > Dave Gordon made the good suggestion that once the ringbuffers were > > > > setup, the actual queuing of commands to program the initial GPU state > > > > could be deferred. Since that initial state contains instructions for > > > > setting up the first power context, we want to execute that as earlier > > > > as possible, preferrably in the background to userspace. Then when > > > > userspace does wake up, the first time it opens the device we just need > > > > to flush the work to be sure that our commands are queued before any of > > > > userspace's. (Hooking into the device open should mean we have to check > > > > less often than say hooking into execbuffer.) > > > > > > > > Suggested-by: Dave Gordon <david.s.gordon@intel.com> > > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > > > > Cc: Dave Gordon <david.s.gordon@intel.com> > > > > > > Just before this gets a bit out of hand with various patches floating > > > around ... I really meant it when I said that we should have a proper > > > design discussion about this in Jesse's meeting first. > > > > What more is there to design? Asynchronously loading the submission port > > is orthogonal to the task of queuing requests for it, and need not block > > request construction (be it kernel or userspace). Dave just identified > > some work that we didn't need to do during module load. I don't think he > > would propose using it for loading guc firmware, that would just be > > silly... > > set_wedged in your patch doesn't have the wakeup to kick waiters. True. But is has to be impossible for a waiter to exist at this point, or else the entire async GPU init is broken. These commands have to be the first requests we send to the GPU. 
Everything else must wait before it is allowed to start queuing. > And > maybe we want to be somewhat more synchronous with with init fail than gpu > hangs, for userspace to make better decisions. The init is still synchronous with userspace using the device, just (and this is no change) the only communication with userspace that GEM initialisation failed is the wedged GPU. > Also we still have that > issue that sometimes an -EIO escapes into modeset code. But there are no new wait requests running concurrent with GEM init, so this patch doesn't alter that. > And yes this is > mean to provide the async init for the request firmware. It is an inappropriate juncture for async request firmware. I can keep repeating that enabling the submission ports is orthogonal to setting up the CS engines and allowing requests to be queued, because it is... There is no need to modify the higher levels for async GuC initialisation. The serialisation there is when to start feeding requests into the submission port. At the moment we do that immediately when it is idle - but the GuC is not idle until it is loaded, as soon as it is loaded it can simply feed in the first set of requests and start on its merry way. This also allows GuC failure to always transparently fall back to execlists. -Chris
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 3ea1fe8db63e..d4003dea97eb 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1938,6 +1938,8 @@ struct drm_i915_private { bool edp_low_vswing; + struct work_struct init_hw_late; + /* * NOTE: This is the dri1/ums dungeon, don't add stuff here. Your patch * will be rejected. Instead look for a better place. diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 2f0fed1b9dd7..7efa71f8edd7 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -5140,12 +5140,76 @@ cleanup_render_ring: return ret; } +static int +i915_gem_init_hw_late(struct drm_i915_private *dev_priv) +{ + struct intel_engine_cs *ring; + int i, j; + + for_each_ring(ring, dev_priv, i) { + struct drm_i915_gem_request *req; + int ret; + + if (WARN_ON(!ring->default_context)) { + ret = -ENODEV; + goto err; + } + + req = i915_gem_request_alloc(ring, ring->default_context); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto err; + } + + if (ring->id == RCS) { + for (j = 0; j < NUM_L3_SLICES(dev_priv); j++) + i915_gem_l3_remap(req, j); + } + + ret = i915_ppgtt_init_ring(req); + if (ret) { + DRM_ERROR("PPGTT enable ring #%d failed %d\n", i, ret); + goto err_req; + } + + ret = i915_gem_context_enable(req); + if (ret) { + DRM_ERROR("Context enable ring #%d failed %d\n", i, ret); + goto err_req; + } + + i915_add_request_no_flush(req); + continue; + +err_req: + i915_gem_request_cancel(req); +err: + return ret; + } + + return 0; +} + +static void +i915_gem_init_hw_worker(struct work_struct *work) +{ + struct drm_i915_private *dev_priv = + container_of(work, typeof(*dev_priv), init_hw_late); + mutex_lock(&dev_priv->dev->struct_mutex); + if (i915_gem_init_hw_late(dev_priv)) { + DRM_ERROR("Failed to initialize GPU, declaring it wedged\n"); + atomic_set_mask(I915_WEDGED, + &dev_priv->gpu_error.reset_counter); + } + 
mutex_unlock(&dev_priv->dev->struct_mutex); +} + int i915_gem_init_hw(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; struct intel_engine_cs *ring; - int ret, i, j; + int ret, i; if (INTEL_INFO(dev)->gen < 6 && !intel_enable_gtt()) return -EIO; @@ -5198,41 +5262,10 @@ i915_gem_init_hw(struct drm_device *dev) } /* Now it is safe to go back round and do everything else: */ - for_each_ring(ring, dev_priv, i) { - struct drm_i915_gem_request *req; - - WARN_ON(!ring->default_context); - - req = i915_gem_request_alloc(ring, ring->default_context); - if (IS_ERR(req)) { - ret = PTR_ERR(req); - i915_gem_cleanup_ringbuffer(dev); - goto out; - } - - if (ring->id == RCS) { - for (j = 0; j < NUM_L3_SLICES(dev); j++) - i915_gem_l3_remap(req, j); - } - - ret = i915_ppgtt_init_ring(req); - if (ret && ret != -EIO) { - DRM_ERROR("PPGTT enable ring #%d failed %d\n", i, ret); - i915_gem_request_cancel(req); - i915_gem_cleanup_ringbuffer(dev); - goto out; - } - - ret = i915_gem_context_enable(req); - if (ret && ret != -EIO) { - DRM_ERROR("Context enable ring #%d failed %d\n", i, ret); - i915_gem_request_cancel(req); - i915_gem_cleanup_ringbuffer(dev); - goto out; - } - - i915_add_request_no_flush(req); - } + if (dev->open_count == 0) /* uncontested with userspace, i.e. 
boot */ + queue_work(dev_priv->wq, &dev_priv->init_hw_late); + else + ret = i915_gem_init_hw_late(dev_priv); out: intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL); @@ -5379,6 +5412,7 @@ i915_gem_load(struct drm_device *dev) init_ring_lists(&dev_priv->ring[i]); for (i = 0; i < I915_MAX_NUM_FENCES; i++) INIT_LIST_HEAD(&dev_priv->fence_regs[i].lru_list); + INIT_WORK(&dev_priv->init_hw_late, i915_gem_init_hw_worker); INIT_DELAYED_WORK(&dev_priv->mm.retire_work, i915_gem_retire_work_handler); INIT_DELAYED_WORK(&dev_priv->mm.idle_work, @@ -5442,11 +5476,18 @@ void i915_gem_release(struct drm_device *dev, struct drm_file *file) int i915_gem_open(struct drm_device *dev, struct drm_file *file) { + struct drm_i915_private *dev_priv = to_i915(dev); struct drm_i915_file_private *file_priv; int ret; DRM_DEBUG_DRIVER("\n"); + /* Flush ring initialisation before userspace can submit its own + * batches, so the hardware initialisation commands are queued + * first. + */ + flush_work(&dev_priv->init_hw_late); + file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL); if (!file_priv) return -ENOMEM;
Dave Gordon made the good suggestion that once the ringbuffers were set up, the actual queuing of commands to program the initial GPU state could be deferred. Since that initial state contains instructions for setting up the first power context, we want to execute that as early as possible, preferably in the background to userspace. Then when userspace does wake up, the first time it opens the device we just need to flush the work to be sure that our commands are queued before any of userspace's. (Hooking into the device open should mean we have to check less often than say hooking into execbuffer.) Suggested-by: Dave Gordon <david.s.gordon@intel.com> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Dave Gordon <david.s.gordon@intel.com> --- drivers/gpu/drm/i915/i915_drv.h | 2 + drivers/gpu/drm/i915/i915_gem.c | 113 +++++++++++++++++++++++++++------------- 2 files changed, 79 insertions(+), 36 deletions(-)