Message ID | 20210203083841.19735-1-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | drm/i915: Apply VT-d scanout adjustment to the VMA | expand |
On Wed, Feb 03, 2021 at 08:38:41AM +0000, Chris Wilson wrote: > Currently, we allocate exactly the VMA requested for the framebuffer and > rely on filling the whole of the GGTT with scratch pages to catch when > VT-d prefetches beyond the bounds of the surface. However, this means > that we have to scrub the GGTT on startup and resume, and on recent HW > this is made even slower as the only access to GSM is uncached. > > Since we do fill the pad-to-size PTE with scratch pages, and this is > also reapplied on resume, we can forgo the overzealous clearing of the > entire GGTT and instead pad the VMA to avoid the VT-d prefetches going > outside of the VMA. We have a lot of these "allocate a huge pile of extra PTEs after (or before for rotation) the scanout surface" workarounds we're not handling at all. Not sure ths is sufficient to cover those. > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > --- > drivers/gpu/drm/i915/gem/i915_gem_domain.c | 18 ++++++++++++----- > drivers/gpu/drm/i915/gt/intel_ggtt.c | 23 ---------------------- > 2 files changed, 13 insertions(+), 28 deletions(-) > > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_domain.c b/drivers/gpu/drm/i915/gem/i915_gem_domain.c > index 36f54cedaaeb..2668a79383c8 100644 > --- a/drivers/gpu/drm/i915/gem/i915_gem_domain.c > +++ b/drivers/gpu/drm/i915/gem/i915_gem_domain.c > @@ -359,19 +359,26 @@ int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data, > */ > struct i915_vma * > i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj, > - u32 alignment, > + u32 align, > const struct i915_ggtt_view *view, > unsigned int flags) > { > struct drm_i915_private *i915 = to_i915(obj->base.dev); > struct i915_gem_ww_ctx ww; > struct i915_vma *vma; > + u64 size; > int ret; > > /* Frame buffer must be in LMEM (no migration yet) */ > if (HAS_LMEM(i915) && !i915_gem_object_is_lmem(obj)) > return ERR_PTR(-EINVAL); > > + size = obj->base.size; > + if (intel_scanout_needs_vtd_wa(i915)) { > + size 
= ALIGN(size, SZ_256K); > + align = ALIGN(align, SZ_256K); > + } > + > i915_gem_ww_ctx_init(&ww, true); > retry: > ret = i915_gem_object_lock(obj, &ww); > @@ -404,18 +411,19 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj, > vma = ERR_PTR(-ENOSPC); > if ((flags & PIN_MAPPABLE) == 0 && > (!view || view->type == I915_GGTT_VIEW_NORMAL)) > - vma = i915_gem_object_ggtt_pin_ww(obj, &ww, view, 0, alignment, > + vma = i915_gem_object_ggtt_pin_ww(obj, &ww, view, > + size, align, > flags | PIN_MAPPABLE | > PIN_NONBLOCK); > if (IS_ERR(vma) && vma != ERR_PTR(-EDEADLK)) > - vma = i915_gem_object_ggtt_pin_ww(obj, &ww, view, 0, > - alignment, flags); > + vma = i915_gem_object_ggtt_pin_ww(obj, &ww, view, > + size, align, flags); > if (IS_ERR(vma)) { > ret = PTR_ERR(vma); > goto err; > } > > - vma->display_alignment = max_t(u64, vma->display_alignment, alignment); > + vma->display_alignment = max_t(u64, vma->display_alignment, align); > i915_vma_mark_scanout(vma); > > i915_gem_object_flush_if_display_locked(obj); > diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c b/drivers/gpu/drm/i915/gt/intel_ggtt.c > index fc399ac16eda..126b061862ad 100644 > --- a/drivers/gpu/drm/i915/gt/intel_ggtt.c > +++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c > @@ -304,27 +304,6 @@ static void nop_clear_range(struct i915_address_space *vm, > { > } > > -static void gen8_ggtt_clear_range(struct i915_address_space *vm, > - u64 start, u64 length) > -{ > - struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm); > - unsigned int first_entry = start / I915_GTT_PAGE_SIZE; > - unsigned int num_entries = length / I915_GTT_PAGE_SIZE; > - const gen8_pte_t scratch_pte = vm->scratch[0]->encode; > - gen8_pte_t __iomem *gtt_base = > - (gen8_pte_t __iomem *)ggtt->gsm + first_entry; > - const int max_entries = ggtt_total_entries(ggtt) - first_entry; > - int i; > - > - if (WARN(num_entries > max_entries, > - "First entry = %d; Num entries = %d (max=%d)\n", > - first_entry, num_entries, max_entries)) > - 
num_entries = max_entries; > - > - for (i = 0; i < num_entries; i++) > - gen8_set_pte(&gtt_base[i], scratch_pte); > -} > - > static void bxt_vtd_ggtt_wa(struct i915_address_space *vm) > { > /* > @@ -884,8 +863,6 @@ static int gen8_gmch_probe(struct i915_ggtt *ggtt) > ggtt->vm.cleanup = gen6_gmch_remove; > ggtt->vm.insert_page = gen8_ggtt_insert_page; > ggtt->vm.clear_range = nop_clear_range; > - if (intel_scanout_needs_vtd_wa(i915)) > - ggtt->vm.clear_range = gen8_ggtt_clear_range; > > ggtt->vm.insert_entries = gen8_ggtt_insert_entries; > > -- > 2.20.1 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Quoting Ville Syrjälä (2021-02-03 09:00:54) > On Wed, Feb 03, 2021 at 08:38:41AM +0000, Chris Wilson wrote: > > Currently, we allocate exactly the VMA requested for the framebuffer and > > rely on filling the whole of the GGTT with scratch pages to catch when > > VT-d prefetches beyond the bounds of the surface. However, this means > > that we have to scrub the GGTT on startup and resume, and on recent HW > > this is made even slower as the only access to GSM is uncached. > > > > Since we do fill the pad-to-size PTE with scratch pages, and this is > > also reapplied on resume, we can forgo the overzealous clearing of the > > entire GGTT and instead pad the VMA to avoid the VT-d prefetches going > > outside of the VMA. > > We have a lot of these "allocate a huge pile of extra PTEs > after (or before for rotation) the scanout surface" workarounds > we're not handling at all. Not sure ths is sufficient to cover > those. I am being optimistic in that we should have a lot of weird cases covered in CI and with iommu enabled, we will get splats. But if it's pages/after before regardless of alignment, then this is insufficient. Also, the case of remapped pages we don't want obj->size here, but vma->size. -Chris
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_domain.c b/drivers/gpu/drm/i915/gem/i915_gem_domain.c index 36f54cedaaeb..2668a79383c8 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_domain.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_domain.c @@ -359,19 +359,26 @@ int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data, */ struct i915_vma * i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj, - u32 alignment, + u32 align, const struct i915_ggtt_view *view, unsigned int flags) { struct drm_i915_private *i915 = to_i915(obj->base.dev); struct i915_gem_ww_ctx ww; struct i915_vma *vma; + u64 size; int ret; /* Frame buffer must be in LMEM (no migration yet) */ if (HAS_LMEM(i915) && !i915_gem_object_is_lmem(obj)) return ERR_PTR(-EINVAL); + size = obj->base.size; + if (intel_scanout_needs_vtd_wa(i915)) { + size = ALIGN(size, SZ_256K); + align = ALIGN(align, SZ_256K); + } + i915_gem_ww_ctx_init(&ww, true); retry: ret = i915_gem_object_lock(obj, &ww); @@ -404,18 +411,19 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj, vma = ERR_PTR(-ENOSPC); if ((flags & PIN_MAPPABLE) == 0 && (!view || view->type == I915_GGTT_VIEW_NORMAL)) - vma = i915_gem_object_ggtt_pin_ww(obj, &ww, view, 0, alignment, + vma = i915_gem_object_ggtt_pin_ww(obj, &ww, view, + size, align, flags | PIN_MAPPABLE | PIN_NONBLOCK); if (IS_ERR(vma) && vma != ERR_PTR(-EDEADLK)) - vma = i915_gem_object_ggtt_pin_ww(obj, &ww, view, 0, - alignment, flags); + vma = i915_gem_object_ggtt_pin_ww(obj, &ww, view, + size, align, flags); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto err; } - vma->display_alignment = max_t(u64, vma->display_alignment, alignment); + vma->display_alignment = max_t(u64, vma->display_alignment, align); i915_vma_mark_scanout(vma); i915_gem_object_flush_if_display_locked(obj); diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c b/drivers/gpu/drm/i915/gt/intel_ggtt.c index fc399ac16eda..126b061862ad 100644 --- a/drivers/gpu/drm/i915/gt/intel_ggtt.c +++ 
b/drivers/gpu/drm/i915/gt/intel_ggtt.c @@ -304,27 +304,6 @@ static void nop_clear_range(struct i915_address_space *vm, { } -static void gen8_ggtt_clear_range(struct i915_address_space *vm, - u64 start, u64 length) -{ - struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm); - unsigned int first_entry = start / I915_GTT_PAGE_SIZE; - unsigned int num_entries = length / I915_GTT_PAGE_SIZE; - const gen8_pte_t scratch_pte = vm->scratch[0]->encode; - gen8_pte_t __iomem *gtt_base = - (gen8_pte_t __iomem *)ggtt->gsm + first_entry; - const int max_entries = ggtt_total_entries(ggtt) - first_entry; - int i; - - if (WARN(num_entries > max_entries, - "First entry = %d; Num entries = %d (max=%d)\n", - first_entry, num_entries, max_entries)) - num_entries = max_entries; - - for (i = 0; i < num_entries; i++) - gen8_set_pte(&gtt_base[i], scratch_pte); -} - static void bxt_vtd_ggtt_wa(struct i915_address_space *vm) { /* @@ -884,8 +863,6 @@ static int gen8_gmch_probe(struct i915_ggtt *ggtt) > ggtt->vm.cleanup = gen6_gmch_remove; ggtt->vm.insert_page = gen8_ggtt_insert_page; ggtt->vm.clear_range = nop_clear_range; - if (intel_scanout_needs_vtd_wa(i915)) - ggtt->vm.clear_range = gen8_ggtt_clear_range; ggtt->vm.insert_entries = gen8_ggtt_insert_entries;
Currently, we allocate exactly the VMA requested for the framebuffer and rely on filling the whole of the GGTT with scratch pages to catch when VT-d prefetches beyond the bounds of the surface. However, this means that we have to scrub the GGTT on startup and resume, and on recent HW this is made even slower as the only access to GSM is uncached. Since we do fill the pad-to-size PTE with scratch pages, and this is also reapplied on resume, we can forgo the overzealous clearing of the entire GGTT and instead pad the VMA to avoid the VT-d prefetches going outside of the VMA. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> --- drivers/gpu/drm/i915/gem/i915_gem_domain.c | 18 ++++++++++++----- drivers/gpu/drm/i915/gt/intel_ggtt.c | 23 ---------------------- 2 files changed, 13 insertions(+), 28 deletions(-)