[9/9] drm/i915/cmdparser: Accelerate copies from WC memory

Message ID 1471014450-21020-10-git-send-email-chris@chris-wilson.co.uk
State New, archived

Commit Message

Chris Wilson Aug. 12, 2016, 3:07 p.m. UTC
If we need to use clflush to prepare our batch for reads from memory, we
can bypass the cache instead by using non-temporal copies.
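
For illustration, the heart of such a copy is an SSE4.1 MOVNTDQA
streaming load, which reads WC memory at full speed without going
through (or dirtying) the CPU caches, so no clflush is needed first.
A minimal userspace sketch, assuming SSE4.1 support, 16-byte aligned
pointers and a length that is a multiple of 16 (the name memcpy_from_wc
here is illustrative; the driver's real helper is i915_memcpy_from_wc()):

#include <smmintrin.h>	/* SSE4.1: _mm_stream_load_si128() (MOVNTDQA) */
#include <stddef.h>

/* Sketch only: copy from a WC mapping using non-temporal loads. */
static void memcpy_from_wc(void *dst, const void *src, size_t len)
{
	__m128i *d = dst;
	__m128i *s = (__m128i *)src;

	while (len) {
		*d++ = _mm_stream_load_si128(s++);
		len -= 16;
	}
}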

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_cmd_parser.c | 58 ++++++++++++++++++++++------------
 drivers/gpu/drm/i915/i915_debugfs.c    | 24 --------------
 drivers/gpu/drm/i915/i915_drv.c        | 19 -----------
 drivers/gpu/drm/i915/i915_gem.c        | 48 ++++++++++++++++------------
 drivers/gpu/drm/i915/i915_gem_gtt.c    | 17 +++++++---
 drivers/gpu/drm/i915/i915_gem_tiling.c |  4 ---
 drivers/gpu/drm/i915/i915_irq.c        |  2 --
 drivers/gpu/drm/i915/intel_uncore.c    |  6 ++--
 8 files changed, 81 insertions(+), 97 deletions(-)

Comments

Chris Wilson Aug. 12, 2016, 3:14 p.m. UTC | #1
On Fri, Aug 12, 2016 at 04:07:30PM +0100, Chris Wilson wrote:
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 2fe88d930ca7..8dcdc27afe80 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -715,18 +715,13 @@ static int i915_gem_seqno_info(struct seq_file *m, void *data)
>  	struct drm_device *dev = node->minor->dev;
>  	struct drm_i915_private *dev_priv = to_i915(dev);
>  	struct intel_engine_cs *engine;
> -	int ret;
>  
> -	ret = mutex_lock_interruptible(&dev->struct_mutex);
> -	if (ret)
> -		return ret;
>  	intel_runtime_pm_get(dev_priv);
>  
>  	for_each_engine(engine, dev_priv)
>  		i915_ring_seqno_info(m, engine);
>  
>  	intel_runtime_pm_put(dev_priv);
> -	mutex_unlock(&dev->struct_mutex);
....
Oh noes, rebase damage. /o\
-Chris
Matthew Auld Aug. 17, 2016, 4:33 p.m. UTC | #2
On 12 August 2016 at 16:07, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> If we need to use clflush to prepare our batch for reads from memory, we
> can bypass the cache instead by using non-temporal copies.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_cmd_parser.c | 58 ++++++++++++++++++++++------------
>  drivers/gpu/drm/i915/i915_debugfs.c    | 24 --------------
>  drivers/gpu/drm/i915/i915_drv.c        | 19 -----------
>  drivers/gpu/drm/i915/i915_gem.c        | 48 ++++++++++++++++------------
>  drivers/gpu/drm/i915/i915_gem_gtt.c    | 17 +++++++---
>  drivers/gpu/drm/i915/i915_gem_tiling.c |  4 ---
>  drivers/gpu/drm/i915/i915_irq.c        |  2 --
>  drivers/gpu/drm/i915/intel_uncore.c    |  6 ++--
>  8 files changed, 81 insertions(+), 97 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
> index cea3ef7299cc..3244ef1401ad 100644
> --- a/drivers/gpu/drm/i915/i915_cmd_parser.c
> +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
> @@ -969,8 +969,7 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj,
>  {
>         unsigned int src_needs_clflush;
>         unsigned int dst_needs_clflush;
> -       void *dst, *ptr;
> -       int offset, n;
> +       void *dst;
>         int ret;
>
>         ret = i915_gem_obj_prepare_shmem_read(src_obj, &src_needs_clflush);
> @@ -987,24 +986,43 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj,
>         if (IS_ERR(dst))
>                 goto unpin_dst;
>
> -       ptr = dst;
> -       offset = offset_in_page(batch_start_offset);
> -       if (dst_needs_clflush & CLFLUSH_BEFORE)
> -               batch_len = roundup(batch_len, boot_cpu_data.x86_clflush_size);
> -
> -       for (n = batch_start_offset >> PAGE_SHIFT; batch_len; n++) {
> -               int len = min_t(int, batch_len, PAGE_SIZE - offset);
> -               void *vaddr;
> -
> -               vaddr = kmap_atomic(i915_gem_object_get_page(src_obj, n));
> -               if (src_needs_clflush)
> -                       drm_clflush_virt_range(vaddr + offset, len);
> -               memcpy(ptr, vaddr + offset, len);
> -               kunmap_atomic(vaddr);
> -
> -               ptr += len;
> -               batch_len -= len;
> -               offset = 0;
> +       if (src_needs_clflush &&
> +           i915_memcpy_from_wc((void *)(uintptr_t)batch_start_offset, 0, 0)) {
> +               void *src;
> +
> +               src = i915_gem_object_pin_map(src_obj, I915_MAP_WC);
> +               if (IS_ERR(src))
> +                       goto shmem_copy;
> +
> +               i915_memcpy_from_wc(dst,
> +                                   src + batch_start_offset,
> +                                   ALIGN(batch_len, 16));
> +               i915_gem_object_unpin_map(src_obj);
> +       } else {
> +               void *ptr;
> +               int offset, n;
> +
> +shmem_copy:
I think Joonas may shed another tear at the sight of this :)
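
For readers following along: the zero-length i915_memcpy_from_wc() call
above is a pure capability probe, reporting whether the accelerated
path can handle the given alignment without copying anything, and the
goto then drops us into the ordinary page-by-page shmem copy when the
WC pin fails. A rough self-contained sketch of the idiom, with
hypothetical helper names standing in for the driver's own:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical stand-in for i915_memcpy_from_wc(): with len == 0 it is
 * a pure probe, reporting whether the accelerated path could handle
 * this alignment; nothing is copied. */
static bool copy_from_wc(void *dst, const void *src, size_t len)
{
	if (((uintptr_t)dst | (uintptr_t)src | len) & 15)
		return false;	/* needs 16-byte alignment; caller falls back */
	if (len)
		memcpy(dst, src, len);	/* stand-in for the MOVNTDQA loop */
	return true;
}

static void copy_batch_sketch(void *dst, const char *src,
			      size_t start, size_t len)
{
	/* Probe the offset's alignment first, mirroring the
	 * i915_memcpy_from_wc((void *)batch_start_offset, 0, 0) call. */
	if (copy_from_wc((void *)(uintptr_t)start, NULL, 0) &&
	    copy_from_wc(dst, src + start, len))
		return;

	memcpy(dst, src + start, len);	/* page-by-page shmem fallback */
}

The probe keeps the decision cheap: the WC mapping is only pinned once
we know the streaming copy can actually be used.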

> +               offset = offset_in_page(batch_start_offset);
> +               if (dst_needs_clflush & CLFLUSH_BEFORE)
> +                       batch_len = roundup(batch_len,
> +                                           boot_cpu_data.x86_clflush_size);
> +
> +               ptr = dst;
> +               for (n = batch_start_offset >> PAGE_SHIFT; batch_len; n++) {
> +                       int len = min_t(int, batch_len, PAGE_SIZE - offset);
> +                       void *vaddr;
> +
> +                       vaddr = kmap_atomic(i915_gem_object_get_page(src_obj, n));
> +                       if (src_needs_clflush)
> +                               drm_clflush_virt_range(vaddr + offset, len);
> +                       memcpy(ptr, vaddr + offset, len);
> +                       kunmap_atomic(vaddr);
> +
> +                       ptr += len;
> +                       batch_len -= len;
> +                       offset = 0;
> +               }
>         }
>

Disregarding the rest, which seems unrelated to this patch.

Reviewed-by: Matthew Auld <matthew.auld@intel.com>

>         /* dst_obj is returned with vmap pinned */
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 2fe88d930ca7..8dcdc27afe80 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -715,18 +715,13 @@ static int i915_gem_seqno_info(struct seq_file *m, void *data)
>         struct drm_device *dev = node->minor->dev;
>         struct drm_i915_private *dev_priv = to_i915(dev);
>         struct intel_engine_cs *engine;
> -       int ret;
>
> -       ret = mutex_lock_interruptible(&dev->struct_mutex);
> -       if (ret)
> -               return ret;
>         intel_runtime_pm_get(dev_priv);
>
>         for_each_engine(engine, dev_priv)
>                 i915_ring_seqno_info(m, engine);
>
>         intel_runtime_pm_put(dev_priv);
> -       mutex_unlock(&dev->struct_mutex);
>
>         return 0;
>  }
> @@ -1379,11 +1374,7 @@ static int ironlake_drpc_info(struct seq_file *m)
>         struct drm_i915_private *dev_priv = to_i915(dev);
>         u32 rgvmodectl, rstdbyctl;
>         u16 crstandvid;
> -       int ret;
>
> -       ret = mutex_lock_interruptible(&dev->struct_mutex);
> -       if (ret)
> -               return ret;
>         intel_runtime_pm_get(dev_priv);
>
>         rgvmodectl = I915_READ(MEMMODECTL);
> @@ -1391,7 +1382,6 @@ static int ironlake_drpc_info(struct seq_file *m)
>         crstandvid = I915_READ16(CRSTANDVID);
>
>         intel_runtime_pm_put(dev_priv);
> -       mutex_unlock(&dev->struct_mutex);
>
>         seq_printf(m, "HD boost: %s\n", yesno(rgvmodectl & MEMMODE_BOOST_EN));
>         seq_printf(m, "Boost freq: %d\n",
> @@ -2179,11 +2169,7 @@ static int i915_swizzle_info(struct seq_file *m, void *data)
>         struct drm_info_node *node = m->private;
>         struct drm_device *dev = node->minor->dev;
>         struct drm_i915_private *dev_priv = to_i915(dev);
> -       int ret;
>
> -       ret = mutex_lock_interruptible(&dev->struct_mutex);
> -       if (ret)
> -               return ret;
>         intel_runtime_pm_get(dev_priv);
>
>         seq_printf(m, "bit6 swizzle for X-tiling = %s\n",
> @@ -2223,7 +2209,6 @@ static int i915_swizzle_info(struct seq_file *m, void *data)
>                 seq_puts(m, "L-shaped memory detected\n");
>
>         intel_runtime_pm_put(dev_priv);
> -       mutex_unlock(&dev->struct_mutex);
>
>         return 0;
>  }
> @@ -4729,13 +4714,9 @@ i915_wedged_set(void *data, u64 val)
>         if (i915_reset_in_progress(&dev_priv->gpu_error))
>                 return -EAGAIN;
>
> -       intel_runtime_pm_get(dev_priv);
> -
>         i915_handle_error(dev_priv, val,
>                           "Manually setting wedged to %llu", val);
>
> -       intel_runtime_pm_put(dev_priv);
> -
>         return 0;
>  }
>
> @@ -4976,20 +4957,15 @@ i915_cache_sharing_get(void *data, u64 *val)
>         struct drm_device *dev = data;
>         struct drm_i915_private *dev_priv = to_i915(dev);
>         u32 snpcr;
> -       int ret;
>
>         if (!(IS_GEN6(dev) || IS_GEN7(dev)))
>                 return -ENODEV;
>
> -       ret = mutex_lock_interruptible(&dev->struct_mutex);
> -       if (ret)
> -               return ret;
>         intel_runtime_pm_get(dev_priv);
>
>         snpcr = I915_READ(GEN6_MBCUNIT_SNPCR);
>
>         intel_runtime_pm_put(dev_priv);
> -       mutex_unlock(&dev_priv->drm.struct_mutex);
>
>         *val = (snpcr & GEN6_MBC_SNPCR_MASK) >> GEN6_MBC_SNPCR_SHIFT;
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index c040c6329804..b458faa0d349 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -2293,24 +2293,6 @@ static int intel_runtime_suspend(struct device *device)
>
>         DRM_DEBUG_KMS("Suspending device\n");
>
> -       /*
> -        * We could deadlock here in case another thread holding struct_mutex
> -        * calls RPM suspend concurrently, since the RPM suspend will wait
> -        * first for this RPM suspend to finish. In this case the concurrent
> -        * RPM resume will be followed by its RPM suspend counterpart. Still
> -        * for consistency return -EAGAIN, which will reschedule this suspend.
> -        */
> -       if (!mutex_trylock(&dev->struct_mutex)) {
> -               DRM_DEBUG_KMS("device lock contention, deffering suspend\n");
> -               /*
> -                * Bump the expiration timestamp, otherwise the suspend won't
> -                * be rescheduled.
> -                */
> -               pm_runtime_mark_last_busy(device);
> -
> -               return -EAGAIN;
> -       }
> -
>         disable_rpm_wakeref_asserts(dev_priv);
>
>         /*
> @@ -2318,7 +2300,6 @@ static int intel_runtime_suspend(struct device *device)
>          * an RPM reference.
>          */
>         i915_gem_release_all_mmaps(dev_priv);
> -       mutex_unlock(&dev->struct_mutex);
>
>         intel_guc_suspend(dev);
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 5c1acfc10bc4..a26bfd7d6aab 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1434,11 +1434,9 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
>         if (ret)
>                 goto err;
>
> -       intel_runtime_pm_get(dev_priv);
> -
>         ret = i915_mutex_lock_interruptible(dev);
>         if (ret)
> -               goto err_rpm;
> +               goto err;
>
>         ret = -EFAULT;
>         /* We can only do the GTT pwrite on untiled buffers, as otherwise
> @@ -1449,7 +1447,9 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
>          */
>         if (!i915_gem_object_has_struct_page(obj) ||
>             cpu_write_needs_clflush(obj)) {
> +               intel_runtime_pm_get(dev_priv);
>                 ret = i915_gem_gtt_pwrite_fast(dev_priv, obj, args, file);
> +               intel_runtime_pm_put(dev_priv);
>                 /* Note that the gtt paths might fail with non-page-backed user
>                  * pointers (e.g. gtt mappings when moving data between
>                  * textures). Fallback to the shmem path in that case. */
> @@ -1464,12 +1464,8 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
>
>         i915_gem_object_put(obj);
>         mutex_unlock(&dev->struct_mutex);
> -       intel_runtime_pm_put(dev_priv);
> -
>         return ret;
>
> -err_rpm:
> -       intel_runtime_pm_put(dev_priv);
>  err:
>         i915_gem_object_put_unlocked(obj);
>         return ret;
> @@ -1833,9 +1829,13 @@ i915_gem_release_mmap(struct drm_i915_gem_object *obj)
>         /* Serialisation between user GTT access and our code depends upon
>          * revoking the CPU's PTE whilst the mutex is held. The next user
>          * pagefault then has to wait until we release the mutex.
> +        *
> +        * Note that RPM complicates somewhat by adding an additional
> +        * requirement that operations to the GGTT be made holding the RPM
> +        * wakeref. This in turn allows us to release the mmap from within
> +        * the RPM suspend code ignoring the struct_mutex serialisation in
> +        * lieu of the RPM barriers.
>          */
> -       lockdep_assert_held(&obj->base.dev->struct_mutex);
> -
>         if (!obj->fault_mappable)
>                 return;
>
> @@ -1854,11 +1854,21 @@ i915_gem_release_mmap(struct drm_i915_gem_object *obj)
>         obj->fault_mappable = false;
>  }
>
> +static void assert_rpm_release_all_mmaps(struct drm_i915_private *dev_priv)
> +{
> +       assert_rpm_wakelock_held(dev_priv);
> +}
> +
>  void
>  i915_gem_release_all_mmaps(struct drm_i915_private *dev_priv)
>  {
>         struct drm_i915_gem_object *obj;
>
> +       /* This should only be called by RPM as we require the bound_list
> +        * to be protected by the RPM barriers and not struct_mutex.
> +        * We check that we are holding the wakeref whenever we manipulate
> +        * the dev_priv->mm.bound_list (via assert_rpm_release_all_mmaps).
> +        */
>         list_for_each_entry(obj, &dev_priv->mm.bound_list, global_list)
>                 i915_gem_release_mmap(obj);
>  }
> @@ -2402,9 +2412,11 @@ i915_gem_object_retire__read(struct i915_gem_active *active,
>          * so that we don't steal from recently used but inactive objects
>          * (unless we are forced to ofc!)
>          */
> -       if (obj->bind_count)
> +       if (obj->bind_count) {
> +               assert_rpm_release_all_mmaps(request->i915);
>                 list_move_tail(&obj->global_list,
>                                &request->i915->mm.bound_list);
> +       }
>
>         if (i915_gem_object_has_active_reference(obj)) {
>                 i915_gem_object_clear_active_reference(obj);
> @@ -2881,9 +2893,11 @@ int i915_vma_unbind(struct i915_vma *vma)
>
>         /* Since the unbound list is global, only move to that list if
>          * no more VMAs exist. */
> -       if (--obj->bind_count == 0)
> +       if (--obj->bind_count == 0) {
> +               assert_rpm_release_all_mmaps(to_i915(obj->base.dev));
>                 list_move_tail(&obj->global_list,
>                                &to_i915(obj->base.dev)->mm.unbound_list);
> +       }
>
>         /* And finally now the object is completely decoupled from this vma,
>          * we can drop its hold on the backing storage and allow it to be
> @@ -3071,6 +3085,7 @@ search_free:
>         }
>         GEM_BUG_ON(!i915_gem_valid_gtt_space(vma, obj->cache_level));
>
> +       assert_rpm_release_all_mmaps(dev_priv);
>         list_move_tail(&obj->global_list, &dev_priv->mm.bound_list);
>         list_move_tail(&vma->vm_link, &vma->vm->inactive_list);
>         obj->bind_count++;
> @@ -3420,7 +3435,6 @@ int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data,
>  int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
>                                struct drm_file *file)
>  {
> -       struct drm_i915_private *dev_priv = to_i915(dev);
>         struct drm_i915_gem_caching *args = data;
>         struct drm_i915_gem_object *obj;
>         enum i915_cache_level level;
> @@ -3449,11 +3463,9 @@ int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
>                 return -EINVAL;
>         }
>
> -       intel_runtime_pm_get(dev_priv);
> -
>         ret = i915_mutex_lock_interruptible(dev);
>         if (ret)
> -               goto rpm_put;
> +               return ret;
>
>         obj = i915_gem_object_lookup(file, args->handle);
>         if (!obj) {
> @@ -3462,13 +3474,9 @@ int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
>         }
>
>         ret = i915_gem_object_set_cache_level(obj, level);
> -
>         i915_gem_object_put(obj);
>  unlock:
>         mutex_unlock(&dev->struct_mutex);
> -rpm_put:
> -       intel_runtime_pm_put(dev_priv);
> -
>         return ret;
>  }
>
> @@ -4174,8 +4182,6 @@ void i915_gem_free_object(struct drm_gem_object *gem_obj)
>
>         kfree(obj->bit_17);
>         i915_gem_object_free(obj);
> -
> -       intel_runtime_pm_put(dev_priv);
>  }
>
>  void __i915_gem_object_release_unless_active(struct drm_i915_gem_object *obj)
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index fe7f9887ee67..67a3ff960b0d 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -2594,6 +2594,7 @@ static int ggtt_bind_vma(struct i915_vma *vma,
>                          enum i915_cache_level cache_level,
>                          u32 flags)
>  {
> +       struct drm_i915_private *i915 = to_i915(vma->vm->dev);
>         struct drm_i915_gem_object *obj = vma->obj;
>         u32 pte_flags = 0;
>         int ret;
> @@ -2606,8 +2607,10 @@ static int ggtt_bind_vma(struct i915_vma *vma,
>         if (obj->gt_ro)
>                 pte_flags |= PTE_READ_ONLY;
>
> +       intel_runtime_pm_get(i915);
>         vma->vm->insert_entries(vma->vm, vma->pages, vma->node.start,
>                                 cache_level, pte_flags);
> +	intel_runtime_pm_put(i915);
>
>         /*
>          * Without aliasing PPGTT there's no difference between
> @@ -2623,6 +2626,7 @@ static int aliasing_gtt_bind_vma(struct i915_vma *vma,
>                                  enum i915_cache_level cache_level,
>                                  u32 flags)
>  {
> +       struct drm_i915_private *i915 = to_i915(vma->vm->dev);
>         u32 pte_flags;
>         int ret;
>
> @@ -2637,14 +2641,15 @@ static int aliasing_gtt_bind_vma(struct i915_vma *vma,
>
>
>         if (flags & I915_VMA_GLOBAL_BIND) {
> +               intel_runtime_pm_get(i915);
>                 vma->vm->insert_entries(vma->vm,
>                                         vma->pages, vma->node.start,
>                                         cache_level, pte_flags);
> +               intel_runtime_pm_put(i915);
>         }
>
>         if (flags & I915_VMA_LOCAL_BIND) {
> -               struct i915_hw_ppgtt *appgtt =
> -                       to_i915(vma->vm->dev)->mm.aliasing_ppgtt;
> +               struct i915_hw_ppgtt *appgtt = i915->mm.aliasing_ppgtt;
>                 appgtt->base.insert_entries(&appgtt->base,
>                                             vma->pages, vma->node.start,
>                                             cache_level, pte_flags);
> @@ -2655,13 +2660,17 @@ static int aliasing_gtt_bind_vma(struct i915_vma *vma,
>
>  static void ggtt_unbind_vma(struct i915_vma *vma)
>  {
> -       struct i915_hw_ppgtt *appgtt = to_i915(vma->vm->dev)->mm.aliasing_ppgtt;
> +       struct drm_i915_private *i915 = to_i915(vma->vm->dev);
> +       struct i915_hw_ppgtt *appgtt = i915->mm.aliasing_ppgtt;
>         const u64 size = min(vma->size, vma->node.size);
>
> -       if (vma->flags & I915_VMA_GLOBAL_BIND)
> +       if (vma->flags & I915_VMA_GLOBAL_BIND) {
> +               intel_runtime_pm_get(i915);
>                 vma->vm->clear_range(vma->vm,
>                                      vma->node.start, size,
>                                      true);
> +               intel_runtime_pm_put(i915);
> +       }
>
>         if (vma->flags & I915_VMA_LOCAL_BIND && appgtt)
>                 appgtt->base.clear_range(&appgtt->base,
> diff --git a/drivers/gpu/drm/i915/i915_gem_tiling.c b/drivers/gpu/drm/i915/i915_gem_tiling.c
> index a14b1e3d4c78..08f796a4f5f6 100644
> --- a/drivers/gpu/drm/i915/i915_gem_tiling.c
> +++ b/drivers/gpu/drm/i915/i915_gem_tiling.c
> @@ -204,8 +204,6 @@ i915_gem_set_tiling(struct drm_device *dev, void *data,
>                 return -EINVAL;
>         }
>
> -       intel_runtime_pm_get(dev_priv);
> -
>         mutex_lock(&dev->struct_mutex);
>         if (obj->pin_display || obj->framebuffer_references) {
>                 err = -EBUSY;
> @@ -301,8 +299,6 @@ err:
>         i915_gem_object_put(obj);
>         mutex_unlock(&dev->struct_mutex);
>
> -       intel_runtime_pm_put(dev_priv);
> -
>         return err;
>  }
>
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index ebb83d5a448b..3d9c2a21dfbd 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2523,7 +2523,6 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
>                  * simulated reset via debugfs, so get an RPM reference.
>                  */
>                 intel_runtime_pm_get(dev_priv);
> -
>                 intel_prepare_reset(dev_priv);
>
>                 /*
> @@ -2535,7 +2534,6 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
>                 ret = i915_reset(dev_priv);
>
>                 intel_finish_reset(dev_priv);
> -
>                 intel_runtime_pm_put(dev_priv);
>
>                 if (ret == 0)
> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> index 43f833901b8e..a6b04da4bf21 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> @@ -1414,7 +1414,7 @@ int i915_reg_read_ioctl(struct drm_device *dev,
>         struct register_whitelist const *entry = whitelist;
>         unsigned size;
>         i915_reg_t offset_ldw, offset_udw;
> -       int i, ret = 0;
> +       int i, ret;
>
>         for (i = 0; i < ARRAY_SIZE(whitelist); i++, entry++) {
>                 if (i915_mmio_reg_offset(entry->offset_ldw) == (reg->offset & -entry->size) &&
> @@ -1436,6 +1436,7 @@ int i915_reg_read_ioctl(struct drm_device *dev,
>
>         intel_runtime_pm_get(dev_priv);
>
> +       ret = 0;
>         switch (size) {
>         case 8 | 1:
>                 reg->val = I915_READ64_2x32(offset_ldw, offset_udw);
> @@ -1454,10 +1455,9 @@ int i915_reg_read_ioctl(struct drm_device *dev,
>                 break;
>         default:
>                 ret = -EINVAL;
> -               goto out;
> +               break;
>         }
>
> -out:
>         intel_runtime_pm_put(dev_priv);
>         return ret;
>  }
> --
> 2.8.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Patch

diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index cea3ef7299cc..3244ef1401ad 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -969,8 +969,7 @@  static u32 *copy_batch(struct drm_i915_gem_object *dst_obj,
 {
 	unsigned int src_needs_clflush;
 	unsigned int dst_needs_clflush;
-	void *dst, *ptr;
-	int offset, n;
+	void *dst;
 	int ret;
 
 	ret = i915_gem_obj_prepare_shmem_read(src_obj, &src_needs_clflush);
@@ -987,24 +986,43 @@  static u32 *copy_batch(struct drm_i915_gem_object *dst_obj,
 	if (IS_ERR(dst))
 		goto unpin_dst;
 
-	ptr = dst;
-	offset = offset_in_page(batch_start_offset);
-	if (dst_needs_clflush & CLFLUSH_BEFORE)
-		batch_len = roundup(batch_len, boot_cpu_data.x86_clflush_size);
-
-	for (n = batch_start_offset >> PAGE_SHIFT; batch_len; n++) {
-		int len = min_t(int, batch_len, PAGE_SIZE - offset);
-		void *vaddr;
-
-		vaddr = kmap_atomic(i915_gem_object_get_page(src_obj, n));
-		if (src_needs_clflush)
-			drm_clflush_virt_range(vaddr + offset, len);
-		memcpy(ptr, vaddr + offset, len);
-		kunmap_atomic(vaddr);
-
-		ptr += len;
-		batch_len -= len;
-		offset = 0;
+	if (src_needs_clflush &&
+	    i915_memcpy_from_wc((void *)(uintptr_t)batch_start_offset, 0, 0)) {
+		void *src;
+
+		src = i915_gem_object_pin_map(src_obj, I915_MAP_WC);
+		if (IS_ERR(src))
+			goto shmem_copy;
+
+		i915_memcpy_from_wc(dst,
+				    src + batch_start_offset,
+				    ALIGN(batch_len, 16));
+		i915_gem_object_unpin_map(src_obj);
+	} else {
+		void *ptr;
+		int offset, n;
+
+shmem_copy:
+		offset = offset_in_page(batch_start_offset);
+		if (dst_needs_clflush & CLFLUSH_BEFORE)
+			batch_len = roundup(batch_len,
+					    boot_cpu_data.x86_clflush_size);
+
+		ptr = dst;
+		for (n = batch_start_offset >> PAGE_SHIFT; batch_len; n++) {
+			int len = min_t(int, batch_len, PAGE_SIZE - offset);
+			void *vaddr;
+
+			vaddr = kmap_atomic(i915_gem_object_get_page(src_obj, n));
+			if (src_needs_clflush)
+				drm_clflush_virt_range(vaddr + offset, len);
+			memcpy(ptr, vaddr + offset, len);
+			kunmap_atomic(vaddr);
+
+			ptr += len;
+			batch_len -= len;
+			offset = 0;
+		}
 	}
 
 	/* dst_obj is returned with vmap pinned */
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 2fe88d930ca7..8dcdc27afe80 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -715,18 +715,13 @@  static int i915_gem_seqno_info(struct seq_file *m, void *data)
 	struct drm_device *dev = node->minor->dev;
 	struct drm_i915_private *dev_priv = to_i915(dev);
 	struct intel_engine_cs *engine;
-	int ret;
 
-	ret = mutex_lock_interruptible(&dev->struct_mutex);
-	if (ret)
-		return ret;
 	intel_runtime_pm_get(dev_priv);
 
 	for_each_engine(engine, dev_priv)
 		i915_ring_seqno_info(m, engine);
 
 	intel_runtime_pm_put(dev_priv);
-	mutex_unlock(&dev->struct_mutex);
 
 	return 0;
 }
@@ -1379,11 +1374,7 @@  static int ironlake_drpc_info(struct seq_file *m)
 	struct drm_i915_private *dev_priv = to_i915(dev);
 	u32 rgvmodectl, rstdbyctl;
 	u16 crstandvid;
-	int ret;
 
-	ret = mutex_lock_interruptible(&dev->struct_mutex);
-	if (ret)
-		return ret;
 	intel_runtime_pm_get(dev_priv);
 
 	rgvmodectl = I915_READ(MEMMODECTL);
@@ -1391,7 +1382,6 @@  static int ironlake_drpc_info(struct seq_file *m)
 	crstandvid = I915_READ16(CRSTANDVID);
 
 	intel_runtime_pm_put(dev_priv);
-	mutex_unlock(&dev->struct_mutex);
 
 	seq_printf(m, "HD boost: %s\n", yesno(rgvmodectl & MEMMODE_BOOST_EN));
 	seq_printf(m, "Boost freq: %d\n",
@@ -2179,11 +2169,7 @@  static int i915_swizzle_info(struct seq_file *m, void *data)
 	struct drm_info_node *node = m->private;
 	struct drm_device *dev = node->minor->dev;
 	struct drm_i915_private *dev_priv = to_i915(dev);
-	int ret;
 
-	ret = mutex_lock_interruptible(&dev->struct_mutex);
-	if (ret)
-		return ret;
 	intel_runtime_pm_get(dev_priv);
 
 	seq_printf(m, "bit6 swizzle for X-tiling = %s\n",
@@ -2223,7 +2209,6 @@  static int i915_swizzle_info(struct seq_file *m, void *data)
 		seq_puts(m, "L-shaped memory detected\n");
 
 	intel_runtime_pm_put(dev_priv);
-	mutex_unlock(&dev->struct_mutex);
 
 	return 0;
 }
@@ -4729,13 +4714,9 @@  i915_wedged_set(void *data, u64 val)
 	if (i915_reset_in_progress(&dev_priv->gpu_error))
 		return -EAGAIN;
 
-	intel_runtime_pm_get(dev_priv);
-
 	i915_handle_error(dev_priv, val,
 			  "Manually setting wedged to %llu", val);
 
-	intel_runtime_pm_put(dev_priv);
-
 	return 0;
 }
 
@@ -4976,20 +4957,15 @@  i915_cache_sharing_get(void *data, u64 *val)
 	struct drm_device *dev = data;
 	struct drm_i915_private *dev_priv = to_i915(dev);
 	u32 snpcr;
-	int ret;
 
 	if (!(IS_GEN6(dev) || IS_GEN7(dev)))
 		return -ENODEV;
 
-	ret = mutex_lock_interruptible(&dev->struct_mutex);
-	if (ret)
-		return ret;
 	intel_runtime_pm_get(dev_priv);
 
 	snpcr = I915_READ(GEN6_MBCUNIT_SNPCR);
 
 	intel_runtime_pm_put(dev_priv);
-	mutex_unlock(&dev_priv->drm.struct_mutex);
 
 	*val = (snpcr & GEN6_MBC_SNPCR_MASK) >> GEN6_MBC_SNPCR_SHIFT;
 
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index c040c6329804..b458faa0d349 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -2293,24 +2293,6 @@  static int intel_runtime_suspend(struct device *device)
 
 	DRM_DEBUG_KMS("Suspending device\n");
 
-	/*
-	 * We could deadlock here in case another thread holding struct_mutex
-	 * calls RPM suspend concurrently, since the RPM suspend will wait
-	 * first for this RPM suspend to finish. In this case the concurrent
-	 * RPM resume will be followed by its RPM suspend counterpart. Still
-	 * for consistency return -EAGAIN, which will reschedule this suspend.
-	 */
-	if (!mutex_trylock(&dev->struct_mutex)) {
-		DRM_DEBUG_KMS("device lock contention, deffering suspend\n");
-		/*
-		 * Bump the expiration timestamp, otherwise the suspend won't
-		 * be rescheduled.
-		 */
-		pm_runtime_mark_last_busy(device);
-
-		return -EAGAIN;
-	}
-
 	disable_rpm_wakeref_asserts(dev_priv);
 
 	/*
@@ -2318,7 +2300,6 @@  static int intel_runtime_suspend(struct device *device)
 	 * an RPM reference.
 	 */
 	i915_gem_release_all_mmaps(dev_priv);
-	mutex_unlock(&dev->struct_mutex);
 
 	intel_guc_suspend(dev);
 
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 5c1acfc10bc4..a26bfd7d6aab 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1434,11 +1434,9 @@  i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
 	if (ret)
 		goto err;
 
-	intel_runtime_pm_get(dev_priv);
-
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret)
-		goto err_rpm;
+		goto err;
 
 	ret = -EFAULT;
 	/* We can only do the GTT pwrite on untiled buffers, as otherwise
@@ -1449,7 +1447,9 @@  i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
 	 */
 	if (!i915_gem_object_has_struct_page(obj) ||
 	    cpu_write_needs_clflush(obj)) {
+		intel_runtime_pm_get(dev_priv);
 		ret = i915_gem_gtt_pwrite_fast(dev_priv, obj, args, file);
+		intel_runtime_pm_put(dev_priv);
 		/* Note that the gtt paths might fail with non-page-backed user
 		 * pointers (e.g. gtt mappings when moving data between
 		 * textures). Fallback to the shmem path in that case. */
@@ -1464,12 +1464,8 @@  i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
 
 	i915_gem_object_put(obj);
 	mutex_unlock(&dev->struct_mutex);
-	intel_runtime_pm_put(dev_priv);
-
 	return ret;
 
-err_rpm:
-	intel_runtime_pm_put(dev_priv);
 err:
 	i915_gem_object_put_unlocked(obj);
 	return ret;
@@ -1833,9 +1829,13 @@  i915_gem_release_mmap(struct drm_i915_gem_object *obj)
 	/* Serialisation between user GTT access and our code depends upon
 	 * revoking the CPU's PTE whilst the mutex is held. The next user
 	 * pagefault then has to wait until we release the mutex.
+	 *
+	 * Note that RPM complicates somewhat by adding an additional
+	 * requirement that operations to the GGTT be made holding the RPM
+	 * wakeref. This in turn allows us to release the mmap from within
+	 * the RPM suspend code ignoring the struct_mutex serialisation in
+	 * lieu of the RPM barriers.
 	 */
-	lockdep_assert_held(&obj->base.dev->struct_mutex);
-
 	if (!obj->fault_mappable)
 		return;
 
@@ -1854,11 +1854,21 @@  i915_gem_release_mmap(struct drm_i915_gem_object *obj)
 	obj->fault_mappable = false;
 }
 
+static void assert_rpm_release_all_mmaps(struct drm_i915_private *dev_priv)
+{
+	assert_rpm_wakelock_held(dev_priv);
+}
+
 void
 i915_gem_release_all_mmaps(struct drm_i915_private *dev_priv)
 {
 	struct drm_i915_gem_object *obj;
 
+	/* This should only be called by RPM as we require the bound_list
+	 * to be protected by the RPM barriers and not struct_mutex.
+	 * We check that we are holding the wakeref whenever we manipulate
+	 * the dev_priv->mm.bound_list (via assert_rpm_release_all_mmaps).
+	 */
 	list_for_each_entry(obj, &dev_priv->mm.bound_list, global_list)
 		i915_gem_release_mmap(obj);
 }
@@ -2402,9 +2412,11 @@  i915_gem_object_retire__read(struct i915_gem_active *active,
 	 * so that we don't steal from recently used but inactive objects
 	 * (unless we are forced to ofc!)
 	 */
-	if (obj->bind_count)
+	if (obj->bind_count) {
+		assert_rpm_release_all_mmaps(request->i915);
 		list_move_tail(&obj->global_list,
 			       &request->i915->mm.bound_list);
+	}
 
 	if (i915_gem_object_has_active_reference(obj)) {
 		i915_gem_object_clear_active_reference(obj);
@@ -2881,9 +2893,11 @@  int i915_vma_unbind(struct i915_vma *vma)
 
 	/* Since the unbound list is global, only move to that list if
 	 * no more VMAs exist. */
-	if (--obj->bind_count == 0)
+	if (--obj->bind_count == 0) {
+		assert_rpm_release_all_mmaps(to_i915(obj->base.dev));
 		list_move_tail(&obj->global_list,
 			       &to_i915(obj->base.dev)->mm.unbound_list);
+	}
 
 	/* And finally now the object is completely decoupled from this vma,
 	 * we can drop its hold on the backing storage and allow it to be
@@ -3071,6 +3085,7 @@  search_free:
 	}
 	GEM_BUG_ON(!i915_gem_valid_gtt_space(vma, obj->cache_level));
 
+	assert_rpm_release_all_mmaps(dev_priv);
 	list_move_tail(&obj->global_list, &dev_priv->mm.bound_list);
 	list_move_tail(&vma->vm_link, &vma->vm->inactive_list);
 	obj->bind_count++;
@@ -3420,7 +3435,6 @@  int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data,
 int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
 			       struct drm_file *file)
 {
-	struct drm_i915_private *dev_priv = to_i915(dev);
 	struct drm_i915_gem_caching *args = data;
 	struct drm_i915_gem_object *obj;
 	enum i915_cache_level level;
@@ -3449,11 +3463,9 @@  int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
 		return -EINVAL;
 	}
 
-	intel_runtime_pm_get(dev_priv);
-
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret)
-		goto rpm_put;
+		return ret;
 
 	obj = i915_gem_object_lookup(file, args->handle);
 	if (!obj) {
@@ -3462,13 +3474,9 @@  int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
 	}
 
 	ret = i915_gem_object_set_cache_level(obj, level);
-
 	i915_gem_object_put(obj);
 unlock:
 	mutex_unlock(&dev->struct_mutex);
-rpm_put:
-	intel_runtime_pm_put(dev_priv);
-
 	return ret;
 }
 
@@ -4174,8 +4182,6 @@  void i915_gem_free_object(struct drm_gem_object *gem_obj)
 
 	kfree(obj->bit_17);
 	i915_gem_object_free(obj);
-
-	intel_runtime_pm_put(dev_priv);
 }
 
 void __i915_gem_object_release_unless_active(struct drm_i915_gem_object *obj)
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index fe7f9887ee67..67a3ff960b0d 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -2594,6 +2594,7 @@  static int ggtt_bind_vma(struct i915_vma *vma,
 			 enum i915_cache_level cache_level,
 			 u32 flags)
 {
+	struct drm_i915_private *i915 = to_i915(vma->vm->dev);
 	struct drm_i915_gem_object *obj = vma->obj;
 	u32 pte_flags = 0;
 	int ret;
@@ -2606,8 +2607,10 @@  static int ggtt_bind_vma(struct i915_vma *vma,
 	if (obj->gt_ro)
 		pte_flags |= PTE_READ_ONLY;
 
+	intel_runtime_pm_get(i915);
 	vma->vm->insert_entries(vma->vm, vma->pages, vma->node.start,
 				cache_level, pte_flags);
+	intel_runtime_pm_put(i915);
 
 	/*
 	 * Without aliasing PPGTT there's no difference between
@@ -2623,6 +2626,7 @@  static int aliasing_gtt_bind_vma(struct i915_vma *vma,
 				 enum i915_cache_level cache_level,
 				 u32 flags)
 {
+	struct drm_i915_private *i915 = to_i915(vma->vm->dev);
 	u32 pte_flags;
 	int ret;
 
@@ -2637,14 +2641,15 @@  static int aliasing_gtt_bind_vma(struct i915_vma *vma,
 
 
 	if (flags & I915_VMA_GLOBAL_BIND) {
+		intel_runtime_pm_get(i915);
 		vma->vm->insert_entries(vma->vm,
 					vma->pages, vma->node.start,
 					cache_level, pte_flags);
+		intel_runtime_pm_put(i915);
 	}
 
 	if (flags & I915_VMA_LOCAL_BIND) {
-		struct i915_hw_ppgtt *appgtt =
-			to_i915(vma->vm->dev)->mm.aliasing_ppgtt;
+		struct i915_hw_ppgtt *appgtt = i915->mm.aliasing_ppgtt;
 		appgtt->base.insert_entries(&appgtt->base,
 					    vma->pages, vma->node.start,
 					    cache_level, pte_flags);
@@ -2655,13 +2660,17 @@  static int aliasing_gtt_bind_vma(struct i915_vma *vma,
 
 static void ggtt_unbind_vma(struct i915_vma *vma)
 {
-	struct i915_hw_ppgtt *appgtt = to_i915(vma->vm->dev)->mm.aliasing_ppgtt;
+	struct drm_i915_private *i915 = to_i915(vma->vm->dev);
+	struct i915_hw_ppgtt *appgtt = i915->mm.aliasing_ppgtt;
 	const u64 size = min(vma->size, vma->node.size);
 
-	if (vma->flags & I915_VMA_GLOBAL_BIND)
+	if (vma->flags & I915_VMA_GLOBAL_BIND) {
+		intel_runtime_pm_get(i915);
 		vma->vm->clear_range(vma->vm,
 				     vma->node.start, size,
 				     true);
+		intel_runtime_pm_put(i915);
+	}
 
 	if (vma->flags & I915_VMA_LOCAL_BIND && appgtt)
 		appgtt->base.clear_range(&appgtt->base,
diff --git a/drivers/gpu/drm/i915/i915_gem_tiling.c b/drivers/gpu/drm/i915/i915_gem_tiling.c
index a14b1e3d4c78..08f796a4f5f6 100644
--- a/drivers/gpu/drm/i915/i915_gem_tiling.c
+++ b/drivers/gpu/drm/i915/i915_gem_tiling.c
@@ -204,8 +204,6 @@  i915_gem_set_tiling(struct drm_device *dev, void *data,
 		return -EINVAL;
 	}
 
-	intel_runtime_pm_get(dev_priv);
-
 	mutex_lock(&dev->struct_mutex);
 	if (obj->pin_display || obj->framebuffer_references) {
 		err = -EBUSY;
@@ -301,8 +299,6 @@  err:
 	i915_gem_object_put(obj);
 	mutex_unlock(&dev->struct_mutex);
 
-	intel_runtime_pm_put(dev_priv);
-
 	return err;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index ebb83d5a448b..3d9c2a21dfbd 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2523,7 +2523,6 @@  static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
 		 * simulated reset via debugfs, so get an RPM reference.
 		 */
 		intel_runtime_pm_get(dev_priv);
-
 		intel_prepare_reset(dev_priv);
 
 		/*
@@ -2535,7 +2534,6 @@  static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
 		ret = i915_reset(dev_priv);
 
 		intel_finish_reset(dev_priv);
-
 		intel_runtime_pm_put(dev_priv);
 
 		if (ret == 0)
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index 43f833901b8e..a6b04da4bf21 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1414,7 +1414,7 @@  int i915_reg_read_ioctl(struct drm_device *dev,
 	struct register_whitelist const *entry = whitelist;
 	unsigned size;
 	i915_reg_t offset_ldw, offset_udw;
-	int i, ret = 0;
+	int i, ret;
 
 	for (i = 0; i < ARRAY_SIZE(whitelist); i++, entry++) {
 		if (i915_mmio_reg_offset(entry->offset_ldw) == (reg->offset & -entry->size) &&
@@ -1436,6 +1436,7 @@  int i915_reg_read_ioctl(struct drm_device *dev,
 
 	intel_runtime_pm_get(dev_priv);
 
+	ret = 0;
 	switch (size) {
 	case 8 | 1:
 		reg->val = I915_READ64_2x32(offset_ldw, offset_udw);
@@ -1454,10 +1455,9 @@  int i915_reg_read_ioctl(struct drm_device *dev,
 		break;
 	default:
 		ret = -EINVAL;
-		goto out;
+		break;
 	}
 
-out:
 	intel_runtime_pm_put(dev_priv);
 	return ret;
 }