[11/20] drm/i915: Rely on spinlock protection for GPU error capture
diff mbox series

Message ID 20190718070024.21781-11-chris@chris-wilson.co.uk
State New
Headers show
Series
  • [01/20] drm/i915: Move aliasing_ppgtt underneath its i915_ggtt
Related show

Commit Message

Chris Wilson July 18, 2019, 7 a.m. UTC
Trust that we now have adequate protection over the low level structures
via the engine->active.lock to allow ourselves to capture the GPU error
state without the heavy hammer of stop_machine(). Sadly this does mean
that we have to forgo some of the lesser used information (not derived
from the active state) that is not controlled by the active locks. This
includes the list of buffers in the ppGTT and pinned globally in the
GGTT. Originally this was used to manually verify relocations, but
hasn't been required for some time, and modern Mesa now has the habit of
ensuring that all interesting buffers within a batch are captured in their
entirety (that is, the auxiliary state buffers, but not the textures).

A useful side-effect is that this allows us to restore error capturing
for Braswell and Broxton.

v2: Use pagevec for a typical random number

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_gtt.c   |   5 -
 drivers/gpu/drm/i915/i915_gpu_error.c | 499 +++++++++++---------------
 drivers/gpu/drm/i915/i915_gpu_error.h |  17 -
 3 files changed, 203 insertions(+), 318 deletions(-)

Comments

Tvrtko Ursulin July 22, 2019, 1:40 p.m. UTC | #1
On 18/07/2019 08:00, Chris Wilson wrote:
> Trust that we now have adequate protection over the low level structures
> via the engine->active.lock to allow ourselves to capture the GPU error
> state without the heavy hammer of stop_machine(). Sadly this does mean
> that we have to forgo some of the lesser used information (not derived
> from the active state) that is not controlled by the active locks. This
> includes the list of buffers in the ppGTT and pinned globally in the
> GGTT. Originally this was used to manually verify relocations, but
> hasn't been required for sometime and modern mesa now has the habit of
> ensuring that all interesting buffers within a batch are captured in their
> entirety (that are the auxiliary state buffers, but not the textures).
> 
> A useful side-effect is that this allows us to restore error capturing
> for Braswell and Broxton.
> 
> v2: Use pagevec for a typical random number
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_gem_gtt.c   |   5 -
>   drivers/gpu/drm/i915/i915_gpu_error.c | 499 +++++++++++---------------
>   drivers/gpu/drm/i915/i915_gpu_error.h |  17 -
>   3 files changed, 203 insertions(+), 318 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 8f33d3e52b9e..500f3cdcc4b3 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -2966,11 +2966,6 @@ static int gen8_gmch_probe(struct i915_ggtt *ggtt)
>   		ggtt->vm.insert_page    = bxt_vtd_ggtt_insert_page__BKL;
>   		if (ggtt->vm.clear_range != nop_clear_range)
>   			ggtt->vm.clear_range = bxt_vtd_ggtt_clear_range__BKL;
> -
> -		/* Prevent recursively calling stop_machine() and deadlocks. */
> -		dev_info(dev_priv->drm.dev,
> -			 "Disabling error capture for VT-d workaround\n");
> -		i915_disable_error_state(dev_priv, -ENODEV);
>   	}
>   
>   	ggtt->invalidate = gen6_ggtt_invalidate;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 24835be300bc..131a52c428de 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -29,8 +29,8 @@
>   
>   #include <linux/ascii85.h>
>   #include <linux/nmi.h>
> +#include <linux/pagevec.h>
>   #include <linux/scatterlist.h>
> -#include <linux/stop_machine.h>
>   #include <linux/utsname.h>
>   #include <linux/zlib.h>
>   
> @@ -46,6 +46,9 @@
>   #include "i915_scatterlist.h"
>   #include "intel_csr.h"
>   
> +#define ALLOW_FAIL (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN)
> +#define ATOMIC_MAYFAIL (GFP_ATOMIC | __GFP_NOWARN)
> +
>   static inline const struct intel_engine_cs *
>   engine_lookup(const struct drm_i915_private *i915, unsigned int id)
>   {
> @@ -67,26 +70,6 @@ engine_name(const struct drm_i915_private *i915, unsigned int id)
>   	return __engine_name(engine_lookup(i915, id));
>   }
>   
> -static const char *tiling_flag(int tiling)
> -{
> -	switch (tiling) {
> -	default:
> -	case I915_TILING_NONE: return "";
> -	case I915_TILING_X: return " X";
> -	case I915_TILING_Y: return " Y";
> -	}
> -}
> -
> -static const char *dirty_flag(int dirty)
> -{
> -	return dirty ? " dirty" : "";
> -}
> -
> -static const char *purgeable_flag(int purgeable)
> -{
> -	return purgeable ? " purgeable" : "";
> -}
> -
>   static void __sg_set_buf(struct scatterlist *sg,
>   			 void *addr, unsigned int len, loff_t it)
>   {
> @@ -114,7 +97,7 @@ static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
>   	if (e->cur == e->end) {
>   		struct scatterlist *sgl;
>   
> -		sgl = (typeof(sgl))__get_free_page(GFP_KERNEL);
> +		sgl = (typeof(sgl))__get_free_page(ALLOW_FAIL);
>   		if (!sgl) {
>   			e->err = -ENOMEM;
>   			return false;
> @@ -134,7 +117,7 @@ static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
>   	}
>   
>   	e->size = ALIGN(len + 1, SZ_64K);
> -	e->buf = kmalloc(e->size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
> +	e->buf = kmalloc(e->size, ALLOW_FAIL);
>   	if (!e->buf) {
>   		e->size = PAGE_ALIGN(len + 1);
>   		e->buf = kmalloc(e->size, GFP_KERNEL);
> @@ -211,47 +194,125 @@ i915_error_printer(struct drm_i915_error_state_buf *e)
>   	return p;
>   }
>   
> +/* single threaded page allocator with a reserved stash for emergencies */
> +static void *__alloc_page(gfp_t gfp)
> +{
> +	return (void *)__get_free_page(gfp);
> +}
> +
> +static void pool_fini(struct pagevec *pv)
> +{
> +	pagevec_release(pv);
> +}
> +
> +static int pool_refill(struct pagevec *pv, gfp_t gfp)
> +{
> +	while (pagevec_space(pv)) {
> +		struct page *p;
> +
> +		p = alloc_page(gfp);
> +		if (!p)
> +			return -ENOMEM;
> +
> +		pagevec_add(pv, p);
> +	}
> +
> +	return 0;
> +}
> +
> +static int pool_init(struct pagevec *pv, gfp_t gfp)
> +{
> +	int err;
> +
> +	pagevec_init(pv);
> +
> +	err = pool_refill(pv, gfp);
> +	if (err)
> +		pool_fini(pv);
> +
> +	return err;
> +}
> +
> +static void *pool_alloc(struct pagevec *pv, gfp_t gfp)
> +{
> +	void *ptr;
> +
> +	ptr = __alloc_page(gfp);
> +	if (ptr)
> +		return ptr;
> +
> +	if (!pagevec_count(pv))
> +		return NULL;
> +
> +	return page_address(pv->pages[--pv->nr]);
> +}
> +
> +static void pool_free(struct pagevec *pv, void *addr)
> +{
> +	struct page *p = virt_to_page(addr);
> +
> +	if (pagevec_space(pv)) {
> +		pv->pages[pv->nr++] = p;

pagevec_add(pv, p)

> +		return;
> +	}
> +
> +	__free_pages(p, 0);

if-else would probably be more readable, instead of if-return. Also 
could use __free_page to avoid ", 0".

> +}
> +
>   #ifdef CONFIG_DRM_I915_COMPRESS_ERROR
>   
>   struct compress {
> +	struct pagevec pool;
>   	struct z_stream_s zstream;
>   	void *tmp;
>   };
>   
>   static bool compress_init(struct compress *c)
>   {
> -	struct z_stream_s *zstream = memset(&c->zstream, 0, sizeof(c->zstream));
> +	struct z_stream_s *zstream = &c->zstream;
>   
> -	zstream->workspace =
> -		kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
> -			GFP_ATOMIC | __GFP_NOWARN);
> -	if (!zstream->workspace)
> +	if (pool_init(&c->pool, ALLOW_FAIL))
>   		return false;
>   
> -	if (zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) != Z_OK) {
> -		kfree(zstream->workspace);
> +	zstream->workspace =
> +		kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
> +			ALLOW_FAIL);
> +	if (!zstream->workspace) {
> +		pool_fini(&c->pool);
>   		return false;
>   	}
>   
>   	c->tmp = NULL;
>   	if (i915_has_memcpy_from_wc())
> -		c->tmp = (void *)__get_free_page(GFP_ATOMIC | __GFP_NOWARN);
> +		c->tmp = pool_alloc(&c->pool, ALLOW_FAIL);
>   
>   	return true;
>   }
>   
> -static void *compress_next_page(struct drm_i915_error_object *dst)
> +static bool compress_start(struct compress *c)
>   {
> -	unsigned long page;
> +	struct z_stream_s *zstream = &c->zstream;
> +	void *workspace = zstream->workspace;
> +
> +	memset(zstream, 0, sizeof(*zstream));
> +	zstream->workspace = workspace;
> +
> +	return zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) == Z_OK;
> +}
> +
> +static void *compress_next_page(struct compress *c,
> +				struct drm_i915_error_object *dst)
> +{
> +	void *page;
>   
>   	if (dst->page_count >= dst->num_pages)
>   		return ERR_PTR(-ENOSPC);
>   
> -	page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
> +	page = pool_alloc(&c->pool, ATOMIC_MAYFAIL);
>   	if (!page)
>   		return ERR_PTR(-ENOMEM);
>   
> -	return dst->pages[dst->page_count++] = (void *)page;
> +	return dst->pages[dst->page_count++] = page;
>   }
>   
>   static int compress_page(struct compress *c,
> @@ -267,7 +328,7 @@ static int compress_page(struct compress *c,
>   
>   	do {
>   		if (zstream->avail_out == 0) {
> -			zstream->next_out = compress_next_page(dst);
> +			zstream->next_out = compress_next_page(c, dst);
>   			if (IS_ERR(zstream->next_out))
>   				return PTR_ERR(zstream->next_out);
>   
> @@ -295,7 +356,7 @@ static int compress_flush(struct compress *c,
>   	do {
>   		switch (zlib_deflate(zstream, Z_FINISH)) {
>   		case Z_OK: /* more space requested */
> -			zstream->next_out = compress_next_page(dst);
> +			zstream->next_out = compress_next_page(c, dst);
>   			if (IS_ERR(zstream->next_out))
>   				return PTR_ERR(zstream->next_out);
>   
> @@ -316,15 +377,17 @@ static int compress_flush(struct compress *c,
>   	return 0;
>   }
>   
> -static void compress_fini(struct compress *c,
> -			  struct drm_i915_error_object *dst)
> +static void compress_finish(struct compress *c)
>   {
> -	struct z_stream_s *zstream = &c->zstream;
> +	zlib_deflateEnd(&c->zstream);
> +}
>   
> -	zlib_deflateEnd(zstream);
> -	kfree(zstream->workspace);
> +static void compress_fini(struct compress *c)
> +{
> +	kfree(c->zstream.workspace);
>   	if (c->tmp)
> -		free_page((unsigned long)c->tmp);
> +		pool_free(&c->pool, c->tmp);
> +	pool_fini(&c->pool);
>   }
>   
>   static void err_compression_marker(struct drm_i915_error_state_buf *m)
> @@ -335,9 +398,15 @@ static void err_compression_marker(struct drm_i915_error_state_buf *m)
>   #else
>   
>   struct compress {
> +	struct pagevec pool;
>   };
>   
>   static bool compress_init(struct compress *c)
> +{
> +	return pool_init(&c->pool, ALLOW_FAIL) == 0;
> +}
> +
> +static bool compress_start(struct compress *c)
>   {
>   	return true;
>   }
> @@ -346,14 +415,12 @@ static int compress_page(struct compress *c,
>   			 void *src,
>   			 struct drm_i915_error_object *dst)
>   {
> -	unsigned long page;
>   	void *ptr;
>   
> -	page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
> -	if (!page)
> +	ptr = pool_alloc(&c->pool, ATOMIC_MAYFAIL);
> +	if (!ptr)
>   		return -ENOMEM;
>   
> -	ptr = (void *)page;
>   	if (!i915_memcpy_from_wc(ptr, src, PAGE_SIZE))
>   		memcpy(ptr, src, PAGE_SIZE);
>   	dst->pages[dst->page_count++] = ptr;
> @@ -367,9 +434,13 @@ static int compress_flush(struct compress *c,
>   	return 0;
>   }
>   
> -static void compress_fini(struct compress *c,
> -			  struct drm_i915_error_object *dst)
> +static void compress_finish(struct compress *c)
> +{
> +}
> +
> +static void compress_fini(struct compress *c)
>   {
> +	pool_fini(&c->pool);
>   }
>   
>   static void err_compression_marker(struct drm_i915_error_state_buf *m)
> @@ -379,36 +450,6 @@ static void err_compression_marker(struct drm_i915_error_state_buf *m)
>   
>   #endif
>   
> -static void print_error_buffers(struct drm_i915_error_state_buf *m,
> -				const char *name,
> -				struct drm_i915_error_buffer *err,
> -				int count)
> -{
> -	err_printf(m, "%s [%d]:\n", name, count);
> -
> -	while (count--) {
> -		err_printf(m, "    %08x_%08x %8u %02x %02x",
> -			   upper_32_bits(err->gtt_offset),
> -			   lower_32_bits(err->gtt_offset),
> -			   err->size,
> -			   err->read_domains,
> -			   err->write_domain);
> -		err_puts(m, tiling_flag(err->tiling));
> -		err_puts(m, dirty_flag(err->dirty));
> -		err_puts(m, purgeable_flag(err->purgeable));
> -		err_puts(m, err->userptr ? " userptr" : "");
> -		err_puts(m, i915_cache_level_str(m->i915, err->cache_level));
> -
> -		if (err->name)
> -			err_printf(m, " (name: %d)", err->name);
> -		if (err->fence_reg != I915_FENCE_REG_NONE)
> -			err_printf(m, " (fence: %d)", err->fence_reg);
> -
> -		err_puts(m, "\n");
> -		err++;
> -	}
> -}
> -
>   static void error_print_instdone(struct drm_i915_error_state_buf *m,
>   				 const struct drm_i915_error_engine *ee)
>   {
> @@ -734,33 +775,6 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
>   			error_print_engine(m, &error->engine[i], error->epoch);
>   	}
>   
> -	for (i = 0; i < ARRAY_SIZE(error->active_vm); i++) {
> -		char buf[128];
> -		int len, first = 1;
> -
> -		if (!error->active_vm[i])
> -			break;
> -
> -		len = scnprintf(buf, sizeof(buf), "Active (");
> -		for (j = 0; j < ARRAY_SIZE(error->engine); j++) {
> -			if (error->engine[j].vm != error->active_vm[i])
> -				continue;
> -
> -			len += scnprintf(buf + len, sizeof(buf), "%s%s",
> -					 first ? "" : ", ",
> -					 m->i915->engine[j]->name);
> -			first = 0;
> -		}
> -		scnprintf(buf + len, sizeof(buf), ")");
> -		print_error_buffers(m, buf,
> -				    error->active_bo[i],
> -				    error->active_bo_count[i]);
> -	}
> -
> -	print_error_buffers(m, "Pinned (global)",
> -			    error->pinned_bo,
> -			    error->pinned_bo_count);
> -
>   	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
>   		const struct drm_i915_error_engine *ee = &error->engine[i];
>   
> @@ -974,10 +988,6 @@ void __i915_gpu_state_free(struct kref *error_ref)
>   		kfree(ee->requests);
>   	}
>   
> -	for (i = 0; i < ARRAY_SIZE(error->active_bo); i++)
> -		kfree(error->active_bo[i]);
> -	kfree(error->pinned_bo);
> -
>   	kfree(error->overlay);
>   	kfree(error->display);
>   
> @@ -990,12 +1000,12 @@ void __i915_gpu_state_free(struct kref *error_ref)
>   
>   static struct drm_i915_error_object *
>   i915_error_object_create(struct drm_i915_private *i915,
> -			 struct i915_vma *vma)
> +			 struct i915_vma *vma,
> +			 struct compress *compress)
>   {
>   	struct i915_ggtt *ggtt = &i915->ggtt;
>   	const u64 slot = ggtt->error_capture.start;
>   	struct drm_i915_error_object *dst;
> -	struct compress compress;
>   	unsigned long num_pages;
>   	struct sgt_iter iter;
>   	dma_addr_t dma;
> @@ -1006,22 +1016,21 @@ i915_error_object_create(struct drm_i915_private *i915,
>   
>   	num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT;
>   	num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worstcase zlib growth */
> -	dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *),
> -		      GFP_ATOMIC | __GFP_NOWARN);
> +	dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *), ATOMIC_MAYFAIL);
>   	if (!dst)
>   		return NULL;
>   
> +	if (!compress_start(compress)) {
> +		kfree(dst);
> +		return NULL;
> +	}
> +
>   	dst->gtt_offset = vma->node.start;
>   	dst->gtt_size = vma->node.size;
>   	dst->num_pages = num_pages;
>   	dst->page_count = 0;
>   	dst->unused = 0;
>   
> -	if (!compress_init(&compress)) {
> -		kfree(dst);
> -		return NULL;
> -	}
> -
>   	ret = -EINVAL;
>   	for_each_sgt_dma(dma, iter, vma->pages) {
>   		void __iomem *s;
> @@ -1029,69 +1038,23 @@ i915_error_object_create(struct drm_i915_private *i915,
>   		ggtt->vm.insert_page(&ggtt->vm, dma, slot, I915_CACHE_NONE, 0);
>   
>   		s = io_mapping_map_atomic_wc(&ggtt->iomap, slot);
> -		ret = compress_page(&compress, (void  __force *)s, dst);
> +		ret = compress_page(compress, (void  __force *)s, dst);
>   		io_mapping_unmap_atomic(s);
>   		if (ret)
>   			break;
>   	}
>   
> -	if (ret || compress_flush(&compress, dst)) {
> +	if (ret || compress_flush(compress, dst)) {
>   		while (dst->page_count--)
> -			free_page((unsigned long)dst->pages[dst->page_count]);
> +			pool_free(&compress->pool, dst->pages[dst->page_count]);
>   		kfree(dst);
>   		dst = NULL;
>   	}
> +	compress_finish(compress);
>   
> -	compress_fini(&compress, dst);
>   	return dst;
>   }
>   
> -static void capture_bo(struct drm_i915_error_buffer *err,
> -		       struct i915_vma *vma)
> -{
> -	struct drm_i915_gem_object *obj = vma->obj;
> -
> -	err->size = obj->base.size;
> -	err->name = obj->base.name;
> -
> -	err->gtt_offset = vma->node.start;
> -	err->read_domains = obj->read_domains;
> -	err->write_domain = obj->write_domain;
> -	err->fence_reg = vma->fence ? vma->fence->id : -1;
> -	err->tiling = i915_gem_object_get_tiling(obj);
> -	err->dirty = obj->mm.dirty;
> -	err->purgeable = obj->mm.madv != I915_MADV_WILLNEED;
> -	err->userptr = obj->userptr.mm != NULL;
> -	err->cache_level = obj->cache_level;
> -}
> -
> -static u32 capture_error_bo(struct drm_i915_error_buffer *err,
> -			    int count, struct list_head *head,
> -			    unsigned int flags)
> -#define ACTIVE_ONLY BIT(0)
> -#define PINNED_ONLY BIT(1)
> -{
> -	struct i915_vma *vma;
> -	int i = 0;
> -
> -	list_for_each_entry(vma, head, vm_link) {
> -		if (!vma->obj)
> -			continue;
> -
> -		if (flags & ACTIVE_ONLY && !i915_vma_is_active(vma))
> -			continue;
> -
> -		if (flags & PINNED_ONLY && !i915_vma_is_pinned(vma))
> -			continue;
> -
> -		capture_bo(err++, vma);
> -		if (++i == count)
> -			break;
> -	}
> -
> -	return i;
> -}
> -
>   /*
>    * Generate a semi-unique error code. The code is not meant to have meaning, The
>    * code's only purpose is to try to prevent false duplicated bug reports by
> @@ -1281,7 +1244,7 @@ static void engine_record_requests(struct intel_engine_cs *engine,
>   	if (!count)
>   		return;
>   
> -	ee->requests = kcalloc(count, sizeof(*ee->requests), GFP_ATOMIC);
> +	ee->requests = kcalloc(count, sizeof(*ee->requests), ATOMIC_MAYFAIL);
>   	if (!ee->requests)
>   		return;
>   
> @@ -1349,8 +1312,10 @@ static void record_context(struct drm_i915_error_context *e,
>   	e->active = atomic_read(&ctx->active_count);
>   }
>   
> -static void request_record_user_bo(struct i915_request *request,
> -				   struct drm_i915_error_engine *ee)
> +static void
> +request_record_user_bo(struct i915_request *request,
> +		       struct drm_i915_error_engine *ee,
> +		       struct compress *compress)
>   {
>   	struct i915_capture_list *c;
>   	struct drm_i915_error_object **bo;
> @@ -1362,18 +1327,20 @@ static void request_record_user_bo(struct i915_request *request,
>   	if (!max)
>   		return;
>   
> -	bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
> +	bo = kmalloc_array(max, sizeof(*bo), ATOMIC_MAYFAIL);
>   	if (!bo) {
>   		/* If we can't capture everything, try to capture something. */
>   		max = min_t(long, max, PAGE_SIZE / sizeof(*bo));
> -		bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
> +		bo = kmalloc_array(max, sizeof(*bo), ATOMIC_MAYFAIL);
>   	}
>   	if (!bo)
>   		return;
>   
>   	count = 0;
>   	for (c = request->capture_list; c; c = c->next) {
> -		bo[count] = i915_error_object_create(request->i915, c->vma);
> +		bo[count] = i915_error_object_create(request->i915,
> +						     c->vma,
> +						     compress);
>   		if (!bo[count])
>   			break;
>   		if (++count == max)
> @@ -1386,7 +1353,8 @@ static void request_record_user_bo(struct i915_request *request,
>   
>   static struct drm_i915_error_object *
>   capture_object(struct drm_i915_private *dev_priv,
> -	       struct drm_i915_gem_object *obj)
> +	       struct drm_i915_gem_object *obj,
> +	       struct compress *compress)
>   {
>   	if (obj && i915_gem_object_has_pages(obj)) {
>   		struct i915_vma fake = {
> @@ -1396,13 +1364,14 @@ capture_object(struct drm_i915_private *dev_priv,
>   			.obj = obj,
>   		};
>   
> -		return i915_error_object_create(dev_priv, &fake);
> +		return i915_error_object_create(dev_priv, &fake, compress);
>   	} else {
>   		return NULL;
>   	}
>   }
>   
> -static void gem_record_rings(struct i915_gpu_state *error)
> +static void
> +gem_record_rings(struct i915_gpu_state *error, struct compress *compress)
>   {
>   	struct drm_i915_private *i915 = error->i915;
>   	int i;
> @@ -1420,6 +1389,9 @@ static void gem_record_rings(struct i915_gpu_state *error)
>   
>   		ee->engine_id = i;
>   
> +		/* Refill our page pool before entering atomic section */
> +		pool_refill(&compress->pool, ALLOW_FAIL);
> +
>   		error_record_engine_registers(error, engine, ee);
>   		error_record_engine_execlists(engine, ee);
>   
> @@ -1429,8 +1401,6 @@ static void gem_record_rings(struct i915_gpu_state *error)
>   			struct i915_gem_context *ctx = request->gem_context;
>   			struct intel_ring *ring = request->ring;
>   
> -			ee->vm = request->hw_context->vm;
> -
>   			record_context(&ee->context, ctx);
>   
>   			/* We need to copy these to an anonymous buffer
> @@ -1438,17 +1408,21 @@ static void gem_record_rings(struct i915_gpu_state *error)
>   			 * by userspace.
>   			 */
>   			ee->batchbuffer =
> -				i915_error_object_create(i915, request->batch);
> +				i915_error_object_create(i915,
> +							 request->batch,
> +							 compress);
>   
>   			if (HAS_BROKEN_CS_TLB(i915))
>   				ee->wa_batchbuffer =
>   				  i915_error_object_create(i915,
> -							   engine->gt->scratch);
> -			request_record_user_bo(request, ee);
> +							   engine->gt->scratch,
> +							   compress);
> +			request_record_user_bo(request, ee, compress);
>   
>   			ee->ctx =
>   				i915_error_object_create(i915,
> -							 request->hw_context->state);
> +							 request->hw_context->state,
> +							 compress);
>   
>   			error->simulated |=
>   				i915_gem_context_no_error_capture(ctx);
> @@ -1460,7 +1434,9 @@ static void gem_record_rings(struct i915_gpu_state *error)
>   			ee->cpu_ring_head = ring->head;
>   			ee->cpu_ring_tail = ring->tail;
>   			ee->ringbuffer =
> -				i915_error_object_create(i915, ring->vma);
> +				i915_error_object_create(i915,
> +							 ring->vma,
> +							 compress);
>   
>   			engine_record_requests(engine, request, ee);
>   		}
> @@ -1468,89 +1444,21 @@ static void gem_record_rings(struct i915_gpu_state *error)
>   
>   		ee->hws_page =
>   			i915_error_object_create(i915,
> -						 engine->status_page.vma);
> -
> -		ee->wa_ctx = i915_error_object_create(i915, engine->wa_ctx.vma);
> -
> -		ee->default_state = capture_object(i915, engine->default_state);
> -	}
> -}
> -
> -static void gem_capture_vm(struct i915_gpu_state *error,
> -			   struct i915_address_space *vm,
> -			   int idx)
> -{
> -	struct drm_i915_error_buffer *active_bo;
> -	struct i915_vma *vma;
> -	int count;
> -
> -	count = 0;
> -	list_for_each_entry(vma, &vm->bound_list, vm_link)
> -		if (i915_vma_is_active(vma))
> -			count++;
> -
> -	active_bo = NULL;
> -	if (count)
> -		active_bo = kcalloc(count, sizeof(*active_bo), GFP_ATOMIC);
> -	if (active_bo)
> -		count = capture_error_bo(active_bo,
> -					 count, &vm->bound_list,
> -					 ACTIVE_ONLY);
> -	else
> -		count = 0;
> +						 engine->status_page.vma,
> +						 compress);
>   
> -	error->active_vm[idx] = vm;
> -	error->active_bo[idx] = active_bo;
> -	error->active_bo_count[idx] = count;
> -}
> -
> -static void capture_active_buffers(struct i915_gpu_state *error)
> -{
> -	int cnt = 0, i, j;
> -
> -	BUILD_BUG_ON(ARRAY_SIZE(error->engine) > ARRAY_SIZE(error->active_bo));
> -	BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_vm));
> -	BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_bo_count));
> -
> -	/* Scan each engine looking for unique active contexts/vm */
> -	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
> -		struct drm_i915_error_engine *ee = &error->engine[i];
> -		bool found;
> -
> -		if (!ee->vm)
> -			continue;
> +		ee->wa_ctx =
> +			i915_error_object_create(i915,
> +						 engine->wa_ctx.vma,
> +						 compress);
>   
> -		found = false;
> -		for (j = 0; j < i && !found; j++)
> -			found = error->engine[j].vm == ee->vm;
> -		if (!found)
> -			gem_capture_vm(error, ee->vm, cnt++);
> +		ee->default_state =
> +			capture_object(i915, engine->default_state, compress);
>   	}
>   }
>   
> -static void capture_pinned_buffers(struct i915_gpu_state *error)
> -{
> -	struct i915_address_space *vm = &error->i915->ggtt.vm;
> -	struct drm_i915_error_buffer *bo;
> -	struct i915_vma *vma;
> -	int count;
> -
> -	count = 0;
> -	list_for_each_entry(vma, &vm->bound_list, vm_link)
> -		count++;
> -
> -	bo = NULL;
> -	if (count)
> -		bo = kcalloc(count, sizeof(*bo), GFP_ATOMIC);
> -	if (!bo)
> -		return;
> -
> -	error->pinned_bo_count =
> -		capture_error_bo(bo, count, &vm->bound_list, PINNED_ONLY);
> -	error->pinned_bo = bo;
> -}
> -
> -static void capture_uc_state(struct i915_gpu_state *error)
> +static void
> +capture_uc_state(struct i915_gpu_state *error, struct compress *compress)
>   {
>   	struct drm_i915_private *i915 = error->i915;
>   	struct i915_error_uc *error_uc = &error->uc;
> @@ -1567,9 +1475,11 @@ static void capture_uc_state(struct i915_gpu_state *error)
>   	 * As modparams are generally accesible from the userspace make
>   	 * explicit copies of the firmware paths.
>   	 */
> -	error_uc->guc_fw.path = kstrdup(uc->guc.fw.path, GFP_ATOMIC);
> -	error_uc->huc_fw.path = kstrdup(uc->huc.fw.path, GFP_ATOMIC);
> -	error_uc->guc_log = i915_error_object_create(i915, uc->guc.log.vma);
> +	error_uc->guc_fw.path = kstrdup(uc->guc.fw.path, ALLOW_FAIL);
> +	error_uc->huc_fw.path = kstrdup(uc->huc.fw.path, ALLOW_FAIL);
> +	error_uc->guc_log = i915_error_object_create(i915,
> +						     uc->guc.log.vma,
> +						     compress);
>   }
>   
>   /* Capture all registers which don't fit into another category. */
> @@ -1753,56 +1663,53 @@ static void capture_finish(struct i915_gpu_state *error)
>   	ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE);
>   }
>   
> -static int capture(void *data)
> -{
> -	struct i915_gpu_state *error = data;
> -
> -	error->time = ktime_get_real();
> -	error->boottime = ktime_get_boottime();
> -	error->uptime = ktime_sub(ktime_get(),
> -				  error->i915->gt.last_init_time);
> -	error->capture = jiffies;
> -
> -	capture_params(error);
> -	capture_gen_state(error);
> -	capture_uc_state(error);
> -	capture_reg_state(error);
> -	gem_record_fences(error);
> -	gem_record_rings(error);
> -	capture_active_buffers(error);
> -	capture_pinned_buffers(error);
> -
> -	error->overlay = intel_overlay_capture_error_state(error->i915);
> -	error->display = intel_display_capture_error_state(error->i915);
> -
> -	error->epoch = capture_find_epoch(error);
> -
> -	capture_finish(error);
> -	return 0;
> -}
> -
>   #define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x))
>   
>   struct i915_gpu_state *
>   i915_capture_gpu_state(struct drm_i915_private *i915)
>   {
>   	struct i915_gpu_state *error;
> +	struct compress compress;
>   
>   	/* Check if GPU capture has been disabled */
>   	error = READ_ONCE(i915->gpu_error.first_error);
>   	if (IS_ERR(error))
>   		return error;
>   
> -	error = kzalloc(sizeof(*error), GFP_ATOMIC);
> +	error = kzalloc(sizeof(*error), ALLOW_FAIL);
>   	if (!error) {
>   		i915_disable_error_state(i915, -ENOMEM);
>   		return ERR_PTR(-ENOMEM);
>   	}
>   
> +	if (!compress_init(&compress)) {
> +		kfree(error);
> +		i915_disable_error_state(i915, -ENOMEM);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
>   	kref_init(&error->ref);
>   	error->i915 = i915;
>   
> -	stop_machine(capture, error, NULL);
> +	error->time = ktime_get_real();
> +	error->boottime = ktime_get_boottime();
> +	error->uptime = ktime_sub(ktime_get(), i915->gt.last_init_time);
> +	error->capture = jiffies;
> +
> +	capture_params(error);
> +	capture_gen_state(error);
> +	capture_uc_state(error, &compress);
> +	capture_reg_state(error);
> +	gem_record_fences(error);
> +	gem_record_rings(error, &compress);
> +
> +	error->overlay = intel_overlay_capture_error_state(i915);
> +	error->display = intel_display_capture_error_state(i915);
> +
> +	error->epoch = capture_find_epoch(error);
> +
> +	capture_finish(error);
> +	compress_fini(&compress);
>   
>   	return error;
>   }
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
> index 85f06bc5da05..a24c35107d16 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.h
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h
> @@ -85,7 +85,6 @@ struct i915_gpu_state {
>   		/* Software tracked state */
>   		bool idle;
>   		unsigned long hangcheck_timestamp;
> -		struct i915_address_space *vm;
>   		int num_requests;
>   		u32 reset_count;
>   
> @@ -161,22 +160,6 @@ struct i915_gpu_state {
>   		} vm_info;
>   	} engine[I915_NUM_ENGINES];
>   
> -	struct drm_i915_error_buffer {
> -		u32 size;
> -		u32 name;
> -		u64 gtt_offset;
> -		u32 read_domains;
> -		u32 write_domain;
> -		s32 fence_reg:I915_MAX_NUM_FENCE_BITS;
> -		u32 tiling:2;
> -		u32 dirty:1;
> -		u32 purgeable:1;
> -		u32 userptr:1;
> -		u32 cache_level:3;
> -	} *active_bo[I915_NUM_ENGINES], *pinned_bo;
> -	u32 active_bo_count[I915_NUM_ENGINES], pinned_bo_count;
> -	struct i915_address_space *active_vm[I915_NUM_ENGINES];
> -
>   	struct scatterlist *sgl, *fit;
>   };
>   
> 

Otherwise;

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko

Patch
diff mbox series

diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 8f33d3e52b9e..500f3cdcc4b3 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -2966,11 +2966,6 @@  static int gen8_gmch_probe(struct i915_ggtt *ggtt)
 		ggtt->vm.insert_page    = bxt_vtd_ggtt_insert_page__BKL;
 		if (ggtt->vm.clear_range != nop_clear_range)
 			ggtt->vm.clear_range = bxt_vtd_ggtt_clear_range__BKL;
-
-		/* Prevent recursively calling stop_machine() and deadlocks. */
-		dev_info(dev_priv->drm.dev,
-			 "Disabling error capture for VT-d workaround\n");
-		i915_disable_error_state(dev_priv, -ENODEV);
 	}
 
 	ggtt->invalidate = gen6_ggtt_invalidate;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 24835be300bc..131a52c428de 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -29,8 +29,8 @@ 
 
 #include <linux/ascii85.h>
 #include <linux/nmi.h>
+#include <linux/pagevec.h>
 #include <linux/scatterlist.h>
-#include <linux/stop_machine.h>
 #include <linux/utsname.h>
 #include <linux/zlib.h>
 
@@ -46,6 +46,9 @@ 
 #include "i915_scatterlist.h"
 #include "intel_csr.h"
 
+#define ALLOW_FAIL (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN)
+#define ATOMIC_MAYFAIL (GFP_ATOMIC | __GFP_NOWARN)
+
 static inline const struct intel_engine_cs *
 engine_lookup(const struct drm_i915_private *i915, unsigned int id)
 {
@@ -67,26 +70,6 @@  engine_name(const struct drm_i915_private *i915, unsigned int id)
 	return __engine_name(engine_lookup(i915, id));
 }
 
-static const char *tiling_flag(int tiling)
-{
-	switch (tiling) {
-	default:
-	case I915_TILING_NONE: return "";
-	case I915_TILING_X: return " X";
-	case I915_TILING_Y: return " Y";
-	}
-}
-
-static const char *dirty_flag(int dirty)
-{
-	return dirty ? " dirty" : "";
-}
-
-static const char *purgeable_flag(int purgeable)
-{
-	return purgeable ? " purgeable" : "";
-}
-
 static void __sg_set_buf(struct scatterlist *sg,
 			 void *addr, unsigned int len, loff_t it)
 {
@@ -114,7 +97,7 @@  static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
 	if (e->cur == e->end) {
 		struct scatterlist *sgl;
 
-		sgl = (typeof(sgl))__get_free_page(GFP_KERNEL);
+		sgl = (typeof(sgl))__get_free_page(ALLOW_FAIL);
 		if (!sgl) {
 			e->err = -ENOMEM;
 			return false;
@@ -134,7 +117,7 @@  static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
 	}
 
 	e->size = ALIGN(len + 1, SZ_64K);
-	e->buf = kmalloc(e->size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
+	e->buf = kmalloc(e->size, ALLOW_FAIL);
 	if (!e->buf) {
 		e->size = PAGE_ALIGN(len + 1);
 		e->buf = kmalloc(e->size, GFP_KERNEL);
@@ -211,47 +194,125 @@  i915_error_printer(struct drm_i915_error_state_buf *e)
 	return p;
 }
 
+/* single threaded page allocator with a reserved stash for emergencies */
+static void *__alloc_page(gfp_t gfp)
+{
+	return (void *)__get_free_page(gfp);
+}
+
+static void pool_fini(struct pagevec *pv)
+{
+	pagevec_release(pv);
+}
+
+static int pool_refill(struct pagevec *pv, gfp_t gfp)
+{
+	while (pagevec_space(pv)) {
+		struct page *p;
+
+		p = alloc_page(gfp);
+		if (!p)
+			return -ENOMEM;
+
+		pagevec_add(pv, p);
+	}
+
+	return 0;
+}
+
+static int pool_init(struct pagevec *pv, gfp_t gfp)
+{
+	int err;
+
+	pagevec_init(pv);
+
+	err = pool_refill(pv, gfp);
+	if (err)
+		pool_fini(pv);
+
+	return err;
+}
+
+static void *pool_alloc(struct pagevec *pv, gfp_t gfp)
+{
+	void *ptr;
+
+	ptr = __alloc_page(gfp);
+	if (ptr)
+		return ptr;
+
+	if (!pagevec_count(pv))
+		return NULL;
+
+	return page_address(pv->pages[--pv->nr]);
+}
+
+static void pool_free(struct pagevec *pv, void *addr)
+{
+	struct page *p = virt_to_page(addr);
+
+	if (pagevec_space(pv)) {
+		pv->pages[pv->nr++] = p;
+		return;
+	}
+
+	__free_pages(p, 0);
+}
+
 #ifdef CONFIG_DRM_I915_COMPRESS_ERROR
 
 struct compress {
+	struct pagevec pool;
 	struct z_stream_s zstream;
 	void *tmp;
 };
 
 static bool compress_init(struct compress *c)
 {
-	struct z_stream_s *zstream = memset(&c->zstream, 0, sizeof(c->zstream));
+	struct z_stream_s *zstream = &c->zstream;
 
-	zstream->workspace =
-		kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
-			GFP_ATOMIC | __GFP_NOWARN);
-	if (!zstream->workspace)
+	if (pool_init(&c->pool, ALLOW_FAIL))
 		return false;
 
-	if (zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) != Z_OK) {
-		kfree(zstream->workspace);
+	zstream->workspace =
+		kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
+			ALLOW_FAIL);
+	if (!zstream->workspace) {
+		pool_fini(&c->pool);
 		return false;
 	}
 
 	c->tmp = NULL;
 	if (i915_has_memcpy_from_wc())
-		c->tmp = (void *)__get_free_page(GFP_ATOMIC | __GFP_NOWARN);
+		c->tmp = pool_alloc(&c->pool, ALLOW_FAIL);
 
 	return true;
 }
 
-static void *compress_next_page(struct drm_i915_error_object *dst)
+static bool compress_start(struct compress *c)
 {
-	unsigned long page;
+	struct z_stream_s *zstream = &c->zstream;
+	void *workspace = zstream->workspace;
+
+	memset(zstream, 0, sizeof(*zstream));
+	zstream->workspace = workspace;
+
+	return zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) == Z_OK;
+}
+
+static void *compress_next_page(struct compress *c,
+				struct drm_i915_error_object *dst)
+{
+	void *page;
 
 	if (dst->page_count >= dst->num_pages)
 		return ERR_PTR(-ENOSPC);
 
-	page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
+	page = pool_alloc(&c->pool, ATOMIC_MAYFAIL);
 	if (!page)
 		return ERR_PTR(-ENOMEM);
 
-	return dst->pages[dst->page_count++] = (void *)page;
+	return dst->pages[dst->page_count++] = page;
 }
 
 static int compress_page(struct compress *c,
@@ -267,7 +328,7 @@  static int compress_page(struct compress *c,
 
 	do {
 		if (zstream->avail_out == 0) {
-			zstream->next_out = compress_next_page(dst);
+			zstream->next_out = compress_next_page(c, dst);
 			if (IS_ERR(zstream->next_out))
 				return PTR_ERR(zstream->next_out);
 
@@ -295,7 +356,7 @@  static int compress_flush(struct compress *c,
 	do {
 		switch (zlib_deflate(zstream, Z_FINISH)) {
 		case Z_OK: /* more space requested */
-			zstream->next_out = compress_next_page(dst);
+			zstream->next_out = compress_next_page(c, dst);
 			if (IS_ERR(zstream->next_out))
 				return PTR_ERR(zstream->next_out);
 
@@ -316,15 +377,17 @@  static int compress_flush(struct compress *c,
 	return 0;
 }
 
-static void compress_fini(struct compress *c,
-			  struct drm_i915_error_object *dst)
+static void compress_finish(struct compress *c)
 {
-	struct z_stream_s *zstream = &c->zstream;
+	zlib_deflateEnd(&c->zstream);
+}
 
-	zlib_deflateEnd(zstream);
-	kfree(zstream->workspace);
+static void compress_fini(struct compress *c)
+{
+	kfree(c->zstream.workspace);
 	if (c->tmp)
-		free_page((unsigned long)c->tmp);
+		pool_free(&c->pool, c->tmp);
+	pool_fini(&c->pool);
 }
 
 static void err_compression_marker(struct drm_i915_error_state_buf *m)
@@ -335,9 +398,15 @@  static void err_compression_marker(struct drm_i915_error_state_buf *m)
 #else
 
 struct compress {
+	struct pagevec pool;
 };
 
 static bool compress_init(struct compress *c)
+{
+	return pool_init(&c->pool, ALLOW_FAIL) == 0;
+}
+
+static bool compress_start(struct compress *c)
 {
 	return true;
 }
@@ -346,14 +415,12 @@  static int compress_page(struct compress *c,
 			 void *src,
 			 struct drm_i915_error_object *dst)
 {
-	unsigned long page;
 	void *ptr;
 
-	page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
-	if (!page)
+	ptr = pool_alloc(&c->pool, ATOMIC_MAYFAIL);
+	if (!ptr)
 		return -ENOMEM;
 
-	ptr = (void *)page;
 	if (!i915_memcpy_from_wc(ptr, src, PAGE_SIZE))
 		memcpy(ptr, src, PAGE_SIZE);
 	dst->pages[dst->page_count++] = ptr;
@@ -367,9 +434,13 @@  static int compress_flush(struct compress *c,
 	return 0;
 }
 
-static void compress_fini(struct compress *c,
-			  struct drm_i915_error_object *dst)
+static void compress_finish(struct compress *c)
+{
+}
+
+static void compress_fini(struct compress *c)
 {
+	pool_fini(&c->pool);
 }
 
 static void err_compression_marker(struct drm_i915_error_state_buf *m)
@@ -379,36 +450,6 @@  static void err_compression_marker(struct drm_i915_error_state_buf *m)
 
 #endif
 
-static void print_error_buffers(struct drm_i915_error_state_buf *m,
-				const char *name,
-				struct drm_i915_error_buffer *err,
-				int count)
-{
-	err_printf(m, "%s [%d]:\n", name, count);
-
-	while (count--) {
-		err_printf(m, "    %08x_%08x %8u %02x %02x",
-			   upper_32_bits(err->gtt_offset),
-			   lower_32_bits(err->gtt_offset),
-			   err->size,
-			   err->read_domains,
-			   err->write_domain);
-		err_puts(m, tiling_flag(err->tiling));
-		err_puts(m, dirty_flag(err->dirty));
-		err_puts(m, purgeable_flag(err->purgeable));
-		err_puts(m, err->userptr ? " userptr" : "");
-		err_puts(m, i915_cache_level_str(m->i915, err->cache_level));
-
-		if (err->name)
-			err_printf(m, " (name: %d)", err->name);
-		if (err->fence_reg != I915_FENCE_REG_NONE)
-			err_printf(m, " (fence: %d)", err->fence_reg);
-
-		err_puts(m, "\n");
-		err++;
-	}
-}
-
 static void error_print_instdone(struct drm_i915_error_state_buf *m,
 				 const struct drm_i915_error_engine *ee)
 {
@@ -734,33 +775,6 @@  static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
 			error_print_engine(m, &error->engine[i], error->epoch);
 	}
 
-	for (i = 0; i < ARRAY_SIZE(error->active_vm); i++) {
-		char buf[128];
-		int len, first = 1;
-
-		if (!error->active_vm[i])
-			break;
-
-		len = scnprintf(buf, sizeof(buf), "Active (");
-		for (j = 0; j < ARRAY_SIZE(error->engine); j++) {
-			if (error->engine[j].vm != error->active_vm[i])
-				continue;
-
-			len += scnprintf(buf + len, sizeof(buf), "%s%s",
-					 first ? "" : ", ",
-					 m->i915->engine[j]->name);
-			first = 0;
-		}
-		scnprintf(buf + len, sizeof(buf), ")");
-		print_error_buffers(m, buf,
-				    error->active_bo[i],
-				    error->active_bo_count[i]);
-	}
-
-	print_error_buffers(m, "Pinned (global)",
-			    error->pinned_bo,
-			    error->pinned_bo_count);
-
 	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
 		const struct drm_i915_error_engine *ee = &error->engine[i];
 
@@ -974,10 +988,6 @@  void __i915_gpu_state_free(struct kref *error_ref)
 		kfree(ee->requests);
 	}
 
-	for (i = 0; i < ARRAY_SIZE(error->active_bo); i++)
-		kfree(error->active_bo[i]);
-	kfree(error->pinned_bo);
-
 	kfree(error->overlay);
 	kfree(error->display);
 
@@ -990,12 +1000,12 @@  void __i915_gpu_state_free(struct kref *error_ref)
 
 static struct drm_i915_error_object *
 i915_error_object_create(struct drm_i915_private *i915,
-			 struct i915_vma *vma)
+			 struct i915_vma *vma,
+			 struct compress *compress)
 {
 	struct i915_ggtt *ggtt = &i915->ggtt;
 	const u64 slot = ggtt->error_capture.start;
 	struct drm_i915_error_object *dst;
-	struct compress compress;
 	unsigned long num_pages;
 	struct sgt_iter iter;
 	dma_addr_t dma;
@@ -1006,22 +1016,21 @@  i915_error_object_create(struct drm_i915_private *i915,
 
 	num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT;
 	num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worstcase zlib growth */
-	dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *),
-		      GFP_ATOMIC | __GFP_NOWARN);
+	dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *), ATOMIC_MAYFAIL);
 	if (!dst)
 		return NULL;
 
+	if (!compress_start(compress)) {
+		kfree(dst);
+		return NULL;
+	}
+
 	dst->gtt_offset = vma->node.start;
 	dst->gtt_size = vma->node.size;
 	dst->num_pages = num_pages;
 	dst->page_count = 0;
 	dst->unused = 0;
 
-	if (!compress_init(&compress)) {
-		kfree(dst);
-		return NULL;
-	}
-
 	ret = -EINVAL;
 	for_each_sgt_dma(dma, iter, vma->pages) {
 		void __iomem *s;
@@ -1029,69 +1038,23 @@  i915_error_object_create(struct drm_i915_private *i915,
 		ggtt->vm.insert_page(&ggtt->vm, dma, slot, I915_CACHE_NONE, 0);
 
 		s = io_mapping_map_atomic_wc(&ggtt->iomap, slot);
-		ret = compress_page(&compress, (void  __force *)s, dst);
+		ret = compress_page(compress, (void  __force *)s, dst);
 		io_mapping_unmap_atomic(s);
 		if (ret)
 			break;
 	}
 
-	if (ret || compress_flush(&compress, dst)) {
+	if (ret || compress_flush(compress, dst)) {
 		while (dst->page_count--)
-			free_page((unsigned long)dst->pages[dst->page_count]);
+			pool_free(&compress->pool, dst->pages[dst->page_count]);
 		kfree(dst);
 		dst = NULL;
 	}
+	compress_finish(compress);
 
-	compress_fini(&compress, dst);
 	return dst;
 }
 
-static void capture_bo(struct drm_i915_error_buffer *err,
-		       struct i915_vma *vma)
-{
-	struct drm_i915_gem_object *obj = vma->obj;
-
-	err->size = obj->base.size;
-	err->name = obj->base.name;
-
-	err->gtt_offset = vma->node.start;
-	err->read_domains = obj->read_domains;
-	err->write_domain = obj->write_domain;
-	err->fence_reg = vma->fence ? vma->fence->id : -1;
-	err->tiling = i915_gem_object_get_tiling(obj);
-	err->dirty = obj->mm.dirty;
-	err->purgeable = obj->mm.madv != I915_MADV_WILLNEED;
-	err->userptr = obj->userptr.mm != NULL;
-	err->cache_level = obj->cache_level;
-}
-
-static u32 capture_error_bo(struct drm_i915_error_buffer *err,
-			    int count, struct list_head *head,
-			    unsigned int flags)
-#define ACTIVE_ONLY BIT(0)
-#define PINNED_ONLY BIT(1)
-{
-	struct i915_vma *vma;
-	int i = 0;
-
-	list_for_each_entry(vma, head, vm_link) {
-		if (!vma->obj)
-			continue;
-
-		if (flags & ACTIVE_ONLY && !i915_vma_is_active(vma))
-			continue;
-
-		if (flags & PINNED_ONLY && !i915_vma_is_pinned(vma))
-			continue;
-
-		capture_bo(err++, vma);
-		if (++i == count)
-			break;
-	}
-
-	return i;
-}
-
 /*
  * Generate a semi-unique error code. The code is not meant to have meaning, The
  * code's only purpose is to try to prevent false duplicated bug reports by
@@ -1281,7 +1244,7 @@  static void engine_record_requests(struct intel_engine_cs *engine,
 	if (!count)
 		return;
 
-	ee->requests = kcalloc(count, sizeof(*ee->requests), GFP_ATOMIC);
+	ee->requests = kcalloc(count, sizeof(*ee->requests), ATOMIC_MAYFAIL);
 	if (!ee->requests)
 		return;
 
@@ -1349,8 +1312,10 @@  static void record_context(struct drm_i915_error_context *e,
 	e->active = atomic_read(&ctx->active_count);
 }
 
-static void request_record_user_bo(struct i915_request *request,
-				   struct drm_i915_error_engine *ee)
+static void
+request_record_user_bo(struct i915_request *request,
+		       struct drm_i915_error_engine *ee,
+		       struct compress *compress)
 {
 	struct i915_capture_list *c;
 	struct drm_i915_error_object **bo;
@@ -1362,18 +1327,20 @@  static void request_record_user_bo(struct i915_request *request,
 	if (!max)
 		return;
 
-	bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
+	bo = kmalloc_array(max, sizeof(*bo), ATOMIC_MAYFAIL);
 	if (!bo) {
 		/* If we can't capture everything, try to capture something. */
 		max = min_t(long, max, PAGE_SIZE / sizeof(*bo));
-		bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
+		bo = kmalloc_array(max, sizeof(*bo), ATOMIC_MAYFAIL);
 	}
 	if (!bo)
 		return;
 
 	count = 0;
 	for (c = request->capture_list; c; c = c->next) {
-		bo[count] = i915_error_object_create(request->i915, c->vma);
+		bo[count] = i915_error_object_create(request->i915,
+						     c->vma,
+						     compress);
 		if (!bo[count])
 			break;
 		if (++count == max)
@@ -1386,7 +1353,8 @@  static void request_record_user_bo(struct i915_request *request,
 
 static struct drm_i915_error_object *
 capture_object(struct drm_i915_private *dev_priv,
-	       struct drm_i915_gem_object *obj)
+	       struct drm_i915_gem_object *obj,
+	       struct compress *compress)
 {
 	if (obj && i915_gem_object_has_pages(obj)) {
 		struct i915_vma fake = {
@@ -1396,13 +1364,14 @@  capture_object(struct drm_i915_private *dev_priv,
 			.obj = obj,
 		};
 
-		return i915_error_object_create(dev_priv, &fake);
+		return i915_error_object_create(dev_priv, &fake, compress);
 	} else {
 		return NULL;
 	}
 }
 
-static void gem_record_rings(struct i915_gpu_state *error)
+static void
+gem_record_rings(struct i915_gpu_state *error, struct compress *compress)
 {
 	struct drm_i915_private *i915 = error->i915;
 	int i;
@@ -1420,6 +1389,9 @@  static void gem_record_rings(struct i915_gpu_state *error)
 
 		ee->engine_id = i;
 
+		/* Refill our page pool before entering atomic section */
+		pool_refill(&compress->pool, ALLOW_FAIL);
+
 		error_record_engine_registers(error, engine, ee);
 		error_record_engine_execlists(engine, ee);
 
@@ -1429,8 +1401,6 @@  static void gem_record_rings(struct i915_gpu_state *error)
 			struct i915_gem_context *ctx = request->gem_context;
 			struct intel_ring *ring = request->ring;
 
-			ee->vm = request->hw_context->vm;
-
 			record_context(&ee->context, ctx);
 
 			/* We need to copy these to an anonymous buffer
@@ -1438,17 +1408,21 @@  static void gem_record_rings(struct i915_gpu_state *error)
 			 * by userspace.
 			 */
 			ee->batchbuffer =
-				i915_error_object_create(i915, request->batch);
+				i915_error_object_create(i915,
+							 request->batch,
+							 compress);
 
 			if (HAS_BROKEN_CS_TLB(i915))
 				ee->wa_batchbuffer =
 				  i915_error_object_create(i915,
-							   engine->gt->scratch);
-			request_record_user_bo(request, ee);
+							   engine->gt->scratch,
+							   compress);
+			request_record_user_bo(request, ee, compress);
 
 			ee->ctx =
 				i915_error_object_create(i915,
-							 request->hw_context->state);
+							 request->hw_context->state,
+							 compress);
 
 			error->simulated |=
 				i915_gem_context_no_error_capture(ctx);
@@ -1460,7 +1434,9 @@  static void gem_record_rings(struct i915_gpu_state *error)
 			ee->cpu_ring_head = ring->head;
 			ee->cpu_ring_tail = ring->tail;
 			ee->ringbuffer =
-				i915_error_object_create(i915, ring->vma);
+				i915_error_object_create(i915,
+							 ring->vma,
+							 compress);
 
 			engine_record_requests(engine, request, ee);
 		}
@@ -1468,89 +1444,21 @@  static void gem_record_rings(struct i915_gpu_state *error)
 
 		ee->hws_page =
 			i915_error_object_create(i915,
-						 engine->status_page.vma);
-
-		ee->wa_ctx = i915_error_object_create(i915, engine->wa_ctx.vma);
-
-		ee->default_state = capture_object(i915, engine->default_state);
-	}
-}
-
-static void gem_capture_vm(struct i915_gpu_state *error,
-			   struct i915_address_space *vm,
-			   int idx)
-{
-	struct drm_i915_error_buffer *active_bo;
-	struct i915_vma *vma;
-	int count;
-
-	count = 0;
-	list_for_each_entry(vma, &vm->bound_list, vm_link)
-		if (i915_vma_is_active(vma))
-			count++;
-
-	active_bo = NULL;
-	if (count)
-		active_bo = kcalloc(count, sizeof(*active_bo), GFP_ATOMIC);
-	if (active_bo)
-		count = capture_error_bo(active_bo,
-					 count, &vm->bound_list,
-					 ACTIVE_ONLY);
-	else
-		count = 0;
+						 engine->status_page.vma,
+						 compress);
 
-	error->active_vm[idx] = vm;
-	error->active_bo[idx] = active_bo;
-	error->active_bo_count[idx] = count;
-}
-
-static void capture_active_buffers(struct i915_gpu_state *error)
-{
-	int cnt = 0, i, j;
-
-	BUILD_BUG_ON(ARRAY_SIZE(error->engine) > ARRAY_SIZE(error->active_bo));
-	BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_vm));
-	BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_bo_count));
-
-	/* Scan each engine looking for unique active contexts/vm */
-	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-		struct drm_i915_error_engine *ee = &error->engine[i];
-		bool found;
-
-		if (!ee->vm)
-			continue;
+		ee->wa_ctx =
+			i915_error_object_create(i915,
+						 engine->wa_ctx.vma,
+						 compress);
 
-		found = false;
-		for (j = 0; j < i && !found; j++)
-			found = error->engine[j].vm == ee->vm;
-		if (!found)
-			gem_capture_vm(error, ee->vm, cnt++);
+		ee->default_state =
+			capture_object(i915, engine->default_state, compress);
 	}
 }
 
-static void capture_pinned_buffers(struct i915_gpu_state *error)
-{
-	struct i915_address_space *vm = &error->i915->ggtt.vm;
-	struct drm_i915_error_buffer *bo;
-	struct i915_vma *vma;
-	int count;
-
-	count = 0;
-	list_for_each_entry(vma, &vm->bound_list, vm_link)
-		count++;
-
-	bo = NULL;
-	if (count)
-		bo = kcalloc(count, sizeof(*bo), GFP_ATOMIC);
-	if (!bo)
-		return;
-
-	error->pinned_bo_count =
-		capture_error_bo(bo, count, &vm->bound_list, PINNED_ONLY);
-	error->pinned_bo = bo;
-}
-
-static void capture_uc_state(struct i915_gpu_state *error)
+static void
+capture_uc_state(struct i915_gpu_state *error, struct compress *compress)
 {
 	struct drm_i915_private *i915 = error->i915;
 	struct i915_error_uc *error_uc = &error->uc;
@@ -1567,9 +1475,11 @@  static void capture_uc_state(struct i915_gpu_state *error)
 	 * As modparams are generally accesible from the userspace make
 	 * explicit copies of the firmware paths.
 	 */
-	error_uc->guc_fw.path = kstrdup(uc->guc.fw.path, GFP_ATOMIC);
-	error_uc->huc_fw.path = kstrdup(uc->huc.fw.path, GFP_ATOMIC);
-	error_uc->guc_log = i915_error_object_create(i915, uc->guc.log.vma);
+	error_uc->guc_fw.path = kstrdup(uc->guc.fw.path, ALLOW_FAIL);
+	error_uc->huc_fw.path = kstrdup(uc->huc.fw.path, ALLOW_FAIL);
+	error_uc->guc_log = i915_error_object_create(i915,
+						     uc->guc.log.vma,
+						     compress);
 }
 
 /* Capture all registers which don't fit into another category. */
@@ -1753,56 +1663,53 @@  static void capture_finish(struct i915_gpu_state *error)
 	ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE);
 }
 
-static int capture(void *data)
-{
-	struct i915_gpu_state *error = data;
-
-	error->time = ktime_get_real();
-	error->boottime = ktime_get_boottime();
-	error->uptime = ktime_sub(ktime_get(),
-				  error->i915->gt.last_init_time);
-	error->capture = jiffies;
-
-	capture_params(error);
-	capture_gen_state(error);
-	capture_uc_state(error);
-	capture_reg_state(error);
-	gem_record_fences(error);
-	gem_record_rings(error);
-	capture_active_buffers(error);
-	capture_pinned_buffers(error);
-
-	error->overlay = intel_overlay_capture_error_state(error->i915);
-	error->display = intel_display_capture_error_state(error->i915);
-
-	error->epoch = capture_find_epoch(error);
-
-	capture_finish(error);
-	return 0;
-}
-
 #define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x))
 
 struct i915_gpu_state *
 i915_capture_gpu_state(struct drm_i915_private *i915)
 {
 	struct i915_gpu_state *error;
+	struct compress compress;
 
 	/* Check if GPU capture has been disabled */
 	error = READ_ONCE(i915->gpu_error.first_error);
 	if (IS_ERR(error))
 		return error;
 
-	error = kzalloc(sizeof(*error), GFP_ATOMIC);
+	error = kzalloc(sizeof(*error), ALLOW_FAIL);
 	if (!error) {
 		i915_disable_error_state(i915, -ENOMEM);
 		return ERR_PTR(-ENOMEM);
 	}
 
+	if (!compress_init(&compress)) {
+		kfree(error);
+		i915_disable_error_state(i915, -ENOMEM);
+		return ERR_PTR(-ENOMEM);
+	}
+
 	kref_init(&error->ref);
 	error->i915 = i915;
 
-	stop_machine(capture, error, NULL);
+	error->time = ktime_get_real();
+	error->boottime = ktime_get_boottime();
+	error->uptime = ktime_sub(ktime_get(), i915->gt.last_init_time);
+	error->capture = jiffies;
+
+	capture_params(error);
+	capture_gen_state(error);
+	capture_uc_state(error, &compress);
+	capture_reg_state(error);
+	gem_record_fences(error);
+	gem_record_rings(error, &compress);
+
+	error->overlay = intel_overlay_capture_error_state(i915);
+	error->display = intel_display_capture_error_state(i915);
+
+	error->epoch = capture_find_epoch(error);
+
+	capture_finish(error);
+	compress_fini(&compress);
 
 	return error;
 }
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
index 85f06bc5da05..a24c35107d16 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -85,7 +85,6 @@  struct i915_gpu_state {
 		/* Software tracked state */
 		bool idle;
 		unsigned long hangcheck_timestamp;
-		struct i915_address_space *vm;
 		int num_requests;
 		u32 reset_count;
 
@@ -161,22 +160,6 @@  struct i915_gpu_state {
 		} vm_info;
 	} engine[I915_NUM_ENGINES];
 
-	struct drm_i915_error_buffer {
-		u32 size;
-		u32 name;
-		u64 gtt_offset;
-		u32 read_domains;
-		u32 write_domain;
-		s32 fence_reg:I915_MAX_NUM_FENCE_BITS;
-		u32 tiling:2;
-		u32 dirty:1;
-		u32 purgeable:1;
-		u32 userptr:1;
-		u32 cache_level:3;
-	} *active_bo[I915_NUM_ENGINES], *pinned_bo;
-	u32 active_bo_count[I915_NUM_ENGINES], pinned_bo_count;
-	struct i915_address_space *active_vm[I915_NUM_ENGINES];
-
 	struct scatterlist *sgl, *fit;
 };