From patchwork Sun Aug 26 21:30:15 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Chris Wilson X-Patchwork-Id: 10576329 Return-Path: Received: from mail.wl.linuxfoundation.org (pdx-wl-mail.web.codeaurora.org [172.30.200.125]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 09A125A4 for ; Sun, 26 Aug 2018 21:31:19 +0000 (UTC) Received: from mail.wl.linuxfoundation.org (localhost [127.0.0.1]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id EFFF3297FC for ; Sun, 26 Aug 2018 21:31:18 +0000 (UTC) Received: by mail.wl.linuxfoundation.org (Postfix, from userid 486) id E359329808; Sun, 26 Aug 2018 21:31:18 +0000 (UTC) X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on pdx-wl-mail.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-5.2 required=2.0 tests=BAYES_00,MAILING_LIST_MULTI, RCVD_IN_DNSWL_MED autolearn=ham version=3.3.1 Received: from gabe.freedesktop.org (gabe.freedesktop.org [131.252.210.177]) (using TLSv1.2 with cipher DHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by mail.wl.linuxfoundation.org (Postfix) with ESMTPS id 8E35E297FC for ; Sun, 26 Aug 2018 21:31:18 +0000 (UTC) Received: from gabe.freedesktop.org (localhost [127.0.0.1]) by gabe.freedesktop.org (Postfix) with ESMTP id BB1706E14E; Sun, 26 Aug 2018 21:31:17 +0000 (UTC) X-Original-To: intel-gfx@lists.freedesktop.org Delivered-To: intel-gfx@lists.freedesktop.org Received: from fireflyinternet.com (mail.fireflyinternet.com [109.228.58.192]) by gabe.freedesktop.org (Postfix) with ESMTPS id E54756E14D for ; Sun, 26 Aug 2018 21:31:13 +0000 (UTC) X-Default-Received-SPF: pass (skip=forwardok (res=PASS)) x-ip-name=78.156.65.138; Received: from haswell.alporthouse.com (unverified [78.156.65.138]) by fireflyinternet.com (Firefly Internet (M1)) with ESMTP id 13285914-1500050 for multiple; Sun, 26 Aug 2018 22:30:19 +0100 From: Chris Wilson To: intel-gfx@lists.freedesktop.org Date: Sun, 26 Aug 2018 22:30:15 +0100 Message-Id: <20180826213017.11601-1-chris@chris-wilson.co.uk> X-Mailer: git-send-email 2.18.0 Subject: [Intel-gfx] [PATCH 1/3] drm/i915: Handle incomplete Z_FINISH for compressed error states X-BeenThere: intel-gfx@lists.freedesktop.org X-Mailman-Version: 2.1.23 Precedence: list List-Id: Intel graphics driver community testing & development List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Cc: stable@vger.kernel.org MIME-Version: 1.0 Errors-To: intel-gfx-bounces@lists.freedesktop.org Sender: "Intel-gfx" X-Virus-Scanned: ClamAV using ClamSMTP The final call to zlib_deflate(Z_FINISH) may require more output space to be allocated and so needs to re-invoked. Failure to do so in the current code leads to incomplete zlib streams (albeit intact due to the use of Z_SYNC_FLUSH) resulting in the occasional short object capture. Testcase: igt/i915-error-capture.js Fixes: 0a97015d45ee ("drm/i915: Compress GPU objects in error state") Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Cc: # v4.10+ --- drivers/gpu/drm/i915/i915_gpu_error.c | 60 +++++++++++++++++++++------ 1 file changed, 47 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index f7f2aa71d8d9..2eb26ea38d7c 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -237,6 +237,7 @@ static int compress_page(struct compress *c, struct drm_i915_error_object *dst) { struct z_stream_s *zstream = &c->zstream; + int flush = Z_NO_FLUSH; zstream->next_in = src; if (c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE)) @@ -257,8 +258,11 @@ static int compress_page(struct compress *c, zstream->avail_out = PAGE_SIZE; } - if (zlib_deflate(zstream, Z_SYNC_FLUSH) != Z_OK) + if (zlib_deflate(zstream, flush) != Z_OK) return -EIO; + + if (zstream->avail_out) + flush = Z_SYNC_FLUSH; } while (zstream->avail_in); /* Fallback to uncompressed if we increase size? */ @@ -268,19 +272,43 @@ static int compress_page(struct compress *c, return 0; } -static void compress_fini(struct compress *c, +static int compress_flush(struct compress *c, struct drm_i915_error_object *dst) { struct z_stream_s *zstream = &c->zstream; + unsigned long page; - if (dst) { - zlib_deflate(zstream, Z_FINISH); - dst->unused = zstream->avail_out; - } + do { + switch (zlib_deflate(zstream, Z_FINISH)) { + case Z_OK: /* more space requested */ + page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN); + if (!page) + return -ENOMEM; + + dst->pages[dst->page_count++] = (void *)page; + zstream->next_out = (void *)page; + zstream->avail_out = PAGE_SIZE; + break; + case Z_STREAM_END: + goto end; + default: /* any error */ + return -EIO; + } + } while (1); + +end: + memset(zstream->next_out, 0, zstream->avail_out); + dst->unused = zstream->avail_out; + return 0; +} + +static void compress_fini(struct compress *c, + struct drm_i915_error_object *dst) +{ + struct z_stream_s *zstream = &c->zstream; zlib_deflateEnd(zstream); kfree(zstream->workspace); - if (c->tmp) free_page((unsigned long)c->tmp); } @@ -319,6 +347,12 @@ static int compress_page(struct compress *c, return 0; } +static int compress_flush(struct compress *c, + struct drm_i915_error_object *dst) +{ + return 0; +} + static void compress_fini(struct compress *c, struct drm_i915_error_object *dst) { @@ -951,15 +985,15 @@ i915_error_object_create(struct drm_i915_private *i915, if (ret) goto unwind; } - goto out; + if (compress_flush(&compress, dst)) { unwind: - while (dst->page_count--) - free_page((unsigned long)dst->pages[dst->page_count]); - kfree(dst); - dst = NULL; + while (dst->page_count--) + free_page((unsigned long)dst->pages[dst->page_count]); + kfree(dst); + dst = NULL; + } -out: compress_fini(&compress, dst); ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE); return dst; From patchwork Sun Aug 26 21:30:16 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Chris Wilson X-Patchwork-Id: 10576325 Return-Path: Received: from mail.wl.linuxfoundation.org (pdx-wl-mail.web.codeaurora.org [172.30.200.125]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 71DBE5A4 for ; Sun, 26 Aug 2018 21:30:29 +0000 (UTC) Received: from mail.wl.linuxfoundation.org (localhost [127.0.0.1]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id 596A8297FC for ; Sun, 26 Aug 2018 21:30:29 +0000 (UTC) Received: by mail.wl.linuxfoundation.org (Postfix, from userid 486) id 4DA8829808; Sun, 26 Aug 2018 21:30:29 +0000 (UTC) X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on pdx-wl-mail.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-5.2 required=2.0 tests=BAYES_00,MAILING_LIST_MULTI, RCVD_IN_DNSWL_MED autolearn=ham version=3.3.1 Received: from gabe.freedesktop.org (gabe.freedesktop.org [131.252.210.177]) (using TLSv1.2 with cipher DHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by mail.wl.linuxfoundation.org (Postfix) with ESMTPS id 8872B297FC for ; Sun, 26 Aug 2018 21:30:27 +0000 (UTC) Received: from gabe.freedesktop.org (localhost [127.0.0.1]) by gabe.freedesktop.org (Postfix) with ESMTP id AB9906E146; Sun, 26 Aug 2018 21:30:26 +0000 (UTC) X-Original-To: intel-gfx@lists.freedesktop.org Delivered-To: intel-gfx@lists.freedesktop.org Received: from fireflyinternet.com (mail.fireflyinternet.com [109.228.58.192]) by gabe.freedesktop.org (Postfix) with ESMTPS id 301936E146 for ; Sun, 26 Aug 2018 21:30:25 +0000 (UTC) X-Default-Received-SPF: pass (skip=forwardok (res=PASS)) x-ip-name=78.156.65.138; Received: from haswell.alporthouse.com (unverified [78.156.65.138]) by fireflyinternet.com (Firefly Internet (M1)) with ESMTP id 13285915-1500050 for multiple; Sun, 26 Aug 2018 22:30:20 +0100 From: Chris Wilson To: intel-gfx@lists.freedesktop.org Date: Sun, 26 Aug 2018 22:30:16 +0100 Message-Id: <20180826213017.11601-2-chris@chris-wilson.co.uk> X-Mailer: git-send-email 2.18.0 In-Reply-To: <20180826213017.11601-1-chris@chris-wilson.co.uk> References: <20180826213017.11601-1-chris@chris-wilson.co.uk> Subject: [Intel-gfx] [PATCH 2/3] drm/i915: Clear the error PTE just once on finish X-BeenThere: intel-gfx@lists.freedesktop.org X-Mailman-Version: 2.1.23 Precedence: list List-Id: Intel graphics driver community testing & development List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , MIME-Version: 1.0 Errors-To: intel-gfx-bounces@lists.freedesktop.org Sender: "Intel-gfx" X-Virus-Scanned: ClamAV using ClamSMTP We do not need to continually clear our dedicated PTE for error capture as it will be updated and invalidated to the next object. Only at the end do we wish to be sure that the PTE doesn't point back to any buffer. Signed-off-by: Chris Wilson --- drivers/gpu/drm/i915/i915_gpu_error.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 2eb26ea38d7c..3bd4f957abb9 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -995,7 +995,6 @@ i915_error_object_create(struct drm_i915_private *i915, } compress_fini(&compress, dst); - ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE); return dst; } @@ -1775,6 +1774,14 @@ static unsigned long capture_find_epoch(const struct i915_gpu_state *error) return epoch; } +static void capture_finish(struct i915_gpu_state *error) +{ + struct i915_ggtt *ggtt = &error->i915->ggtt; + const u64 slot = ggtt->error_capture.start; + + ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE); +} + static int capture(void *data) { struct i915_gpu_state *error = data; @@ -1799,6 +1806,7 @@ static int capture(void *data) error->epoch = capture_find_epoch(error); + capture_finish(error); return 0; } From patchwork Sun Aug 26 21:30:17 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Chris Wilson X-Patchwork-Id: 10576327 Return-Path: Received: from mail.wl.linuxfoundation.org (pdx-wl-mail.web.codeaurora.org [172.30.200.125]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 310F75A4 for ; Sun, 26 Aug 2018 21:31:17 +0000 (UTC) Received: from mail.wl.linuxfoundation.org (localhost [127.0.0.1]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id 1F80A297FC for ; Sun, 26 Aug 2018 21:31:17 +0000 (UTC) Received: by mail.wl.linuxfoundation.org (Postfix, from userid 486) id 1295029808; Sun, 26 Aug 2018 21:31:17 +0000 (UTC) X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on pdx-wl-mail.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-5.2 required=2.0 tests=BAYES_00,MAILING_LIST_MULTI, RCVD_IN_DNSWL_MED autolearn=ham version=3.3.1 Received: from gabe.freedesktop.org (gabe.freedesktop.org [131.252.210.177]) (using TLSv1.2 with cipher DHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by mail.wl.linuxfoundation.org (Postfix) with ESMTPS id C0BC9297FC for ; Sun, 26 Aug 2018 21:31:15 +0000 (UTC) Received: from gabe.freedesktop.org (localhost [127.0.0.1]) by gabe.freedesktop.org (Postfix) with ESMTP id 212006E14D; Sun, 26 Aug 2018 21:31:15 +0000 (UTC) X-Original-To: intel-gfx@lists.freedesktop.org Delivered-To: intel-gfx@lists.freedesktop.org Received: from fireflyinternet.com (mail.fireflyinternet.com [109.228.58.192]) by gabe.freedesktop.org (Postfix) with ESMTPS id EE7C06E14E for ; Sun, 26 Aug 2018 21:31:12 +0000 (UTC) X-Default-Received-SPF: pass (skip=forwardok (res=PASS)) x-ip-name=78.156.65.138; Received: from haswell.alporthouse.com (unverified [78.156.65.138]) by fireflyinternet.com (Firefly Internet (M1)) with ESMTP id 13285916-1500050 for multiple; Sun, 26 Aug 2018 22:30:20 +0100 From: Chris Wilson To: intel-gfx@lists.freedesktop.org Date: Sun, 26 Aug 2018 22:30:17 +0100 Message-Id: <20180826213017.11601-3-chris@chris-wilson.co.uk> X-Mailer: git-send-email 2.18.0 In-Reply-To: <20180826213017.11601-1-chris@chris-wilson.co.uk> References: <20180826213017.11601-1-chris@chris-wilson.co.uk> Subject: [Intel-gfx] [PATCH 3/3] drm/i915: Cache the error string X-BeenThere: intel-gfx@lists.freedesktop.org X-Mailman-Version: 2.1.23 Precedence: list List-Id: Intel graphics driver community testing & development List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , MIME-Version: 1.0 Errors-To: intel-gfx-bounces@lists.freedesktop.org Sender: "Intel-gfx" X-Virus-Scanned: ClamAV using ClamSMTP Currently, we convert the error state into a string every time we read from sysfs (and sysfs reads in page size (4KiB) chunks). We do try to window the string and only capture the portion that is being read, but that means that we must always convert up to the window to find the start. For a very large error state bordering on EXEC_OBJECT_CAPTURE abuse, this is noticeable as it degrades to O(N^2)! As we do not have a convenient hook for sysfs open(), and we would like to keep the lazy conversion into a string, do the conversion of the whole string on the first read and keep the string until the error state is freed. v2: Don't double advance simple_read_from_buffer v3: Due to extreme pain of lack of vrealloc, use a scatterlist v4: Keep the forward iterator loosely cached Reported-by: Jason Ekstrand Testcase: igt/gem_exec_capture/many* Signed-off-by: Chris Wilson Cc: Mika Kuoppala Cc: Jason Ekstrand --- drivers/gpu/drm/i915/i915_debugfs.c | 32 +- drivers/gpu/drm/i915/i915_gpu_error.c | 402 +++++++++++++++----------- drivers/gpu/drm/i915/i915_gpu_error.h | 28 +- drivers/gpu/drm/i915/i915_sysfs.c | 27 +- 4 files changed, 273 insertions(+), 216 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index a5265c236a33..34dbdceb2e39 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -943,30 +943,28 @@ static int i915_gem_fence_regs_info(struct seq_file *m, void *data) static ssize_t gpu_state_read(struct file *file, char __user *ubuf, size_t count, loff_t *pos) { - struct i915_gpu_state *error = file->private_data; - struct drm_i915_error_state_buf str; + struct i915_gpu_state *error; ssize_t ret; - loff_t tmp; + void *buf; + error = file->private_data; if (!error) return 0; - ret = i915_error_state_buf_init(&str, error->i915, count, *pos); - if (ret) - return ret; - - ret = i915_error_state_to_str(&str, error); - if (ret) - goto out; + /* Bounce buffer required because of kernfs __user API convenience. */ + buf = kmalloc(count, GFP_KERNEL); + if (!buf) + return -ENOMEM; - tmp = 0; - ret = simple_read_from_buffer(ubuf, count, &tmp, str.buf, str.bytes); - if (ret < 0) - goto out; + ret = i915_gpu_state_copy_to_buffer(error, buf, *pos, count); + if (ret > 0) { + if (!copy_to_user(ubuf, buf, ret)) + *pos += ret; + else + ret = -EFAULT; + } - *pos = str.start + ret; -out: - i915_error_state_buf_release(&str); + kfree(buf); return ret; } diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 3bd4f957abb9..36476725b13c 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -28,6 +28,8 @@ */ #include +#include +#include #include #include #include @@ -77,112 +79,110 @@ static const char *purgeable_flag(int purgeable) return purgeable ? " purgeable" : ""; } -static bool __i915_error_ok(struct drm_i915_error_state_buf *e) +static void __sg_set_buf(struct scatterlist *sg, + void *addr, unsigned int len, loff_t it) { - - if (!e->err && WARN(e->bytes > (e->size - 1), "overflow")) { - e->err = -ENOSPC; - return false; - } - - if (e->bytes == e->size - 1 || e->err) - return false; - - return true; + sg->page_link = (unsigned long)virt_to_page(addr); + sg->offset = offset_in_page(addr); + sg->length = len; + sg->dma_address = it; } -static bool __i915_error_seek(struct drm_i915_error_state_buf *e, - unsigned len) +static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len) { - if (e->pos + len <= e->start) { - e->pos += len; + if (!len) return false; - } - /* First vsnprintf needs to fit in its entirety for memmove */ - if (len >= e->size) { - e->err = -EIO; - return false; - } + if (e->bytes + len + 1 > e->size) { + if (e->bytes) { + __sg_set_buf(e->cur++, e->buf, e->bytes, e->iter); + e->iter += e->bytes; + e->buf = NULL; + e->bytes = 0; + } - return true; -} + if (e->cur == e->end) { + struct scatterlist *sgl; -static void __i915_error_advance(struct drm_i915_error_state_buf *e, - unsigned len) -{ - /* If this is first printf in this window, adjust it so that - * start position matches start of the buffer - */ + sgl = (typeof(sgl))__get_free_page(GFP_KERNEL); + if (!sgl) { + e->err = -ENOMEM; + return false; + } - if (e->pos < e->start) { - const size_t off = e->start - e->pos; + if (e->cur) { + e->cur->offset = 0; + e->cur->length = 0; + e->cur->page_link = + (unsigned long)sgl | SG_CHAIN; + } else { + e->sgl = sgl; + } - /* Should not happen but be paranoid */ - if (off > len || e->bytes) { - e->err = -EIO; - return; + e->cur = sgl; + e->end = sgl + SG_MAX_SINGLE_ALLOC - 1; } - memmove(e->buf, e->buf + off, len - off); - e->bytes = len - off; - e->pos = e->start; - return; + e->size = ALIGN(len + 1, SZ_64K); + e->buf = kmalloc(e->size, + GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); + if (!e->buf) { + e->size = PAGE_ALIGN(len + 1); + e->buf = kmalloc(e->size, GFP_KERNEL); + } + if (!e->buf) { + e->err = -ENOMEM; + return false; + } } - e->bytes += len; - e->pos += len; + return true; } __printf(2, 0) static void i915_error_vprintf(struct drm_i915_error_state_buf *e, - const char *f, va_list args) + const char *fmt, va_list args) { - unsigned len; + va_list ap; + int len; - if (!__i915_error_ok(e)) + if (e->err) return; - /* Seek the first printf which is hits start position */ - if (e->pos < e->start) { - va_list tmp; - - va_copy(tmp, args); - len = vsnprintf(NULL, 0, f, tmp); - va_end(tmp); - - if (!__i915_error_seek(e, len)) - return; + va_copy(ap, args); + len = vsnprintf(NULL, 0, fmt, ap); + va_end(ap); + if (len <= 0) { + e->err = len; + return; } - len = vsnprintf(e->buf + e->bytes, e->size - e->bytes, f, args); - if (len >= e->size - e->bytes) - len = e->size - e->bytes - 1; + if (!__i915_error_grow(e, len)) + return; - __i915_error_advance(e, len); + GEM_BUG_ON(e->bytes >= e->size); + len = vscnprintf(e->buf + e->bytes, e->size - e->bytes, fmt, args); + if (len < 0) { + e->err = len; + return; + } + e->bytes += len; } -static void i915_error_puts(struct drm_i915_error_state_buf *e, - const char *str) +static void i915_error_puts(struct drm_i915_error_state_buf *e, const char *str) { unsigned len; - if (!__i915_error_ok(e)) + if (e->err || !str) return; len = strlen(str); + if (!__i915_error_grow(e, len)) + return; - /* Seek the first printf which is hits start position */ - if (e->pos < e->start) { - if (!__i915_error_seek(e, len)) - return; - } - - if (len >= e->size - e->bytes) - len = e->size - e->bytes - 1; + GEM_BUG_ON(e->bytes + len > e->size); memcpy(e->buf + e->bytes, str, len); - - __i915_error_advance(e, len); + e->bytes += len; } #define err_printf(e, ...) i915_error_printf(e, __VA_ARGS__) @@ -263,6 +263,8 @@ static int compress_page(struct compress *c, if (zstream->avail_out) flush = Z_SYNC_FLUSH; + + touch_nmi_watchdog(); } while (zstream->avail_in); /* Fallback to uncompressed if we increase size? */ @@ -631,33 +633,50 @@ static void err_print_uc(struct drm_i915_error_state_buf *m, print_error_obj(m, NULL, "GuC log buffer", error_uc->guc_log); } -int i915_error_state_to_str(struct drm_i915_error_state_buf *m, - const struct i915_gpu_state *error) +static void err_free_sgl(struct scatterlist *sgl) { - struct drm_i915_private *dev_priv = m->i915; + while (sgl) { + struct scatterlist *sg; + + for (sg = sgl; !sg_is_chain(sg); sg++) { + kfree(sg_virt(sg)); + if (sg_is_last(sg)) + break; + } + + sg = sg_is_last(sg) ? NULL : sg_chain_ptr(sg); + free_page((unsigned long)sgl); + sgl = sg; + } +} + +static int err_print_to_sgl(struct i915_gpu_state *error) +{ + struct drm_i915_error_state_buf m; struct drm_i915_error_object *obj; struct timespec64 ts; int i, j; - if (!error) { - err_printf(m, "No error state collected\n"); + if (READ_ONCE(error->sgl)) return 0; - } + + memset(&m, 0, sizeof(m)); + m.i915 = error->i915; if (*error->error_msg) - err_printf(m, "%s\n", error->error_msg); - err_printf(m, "Kernel: " UTS_RELEASE "\n"); + err_printf(&m, "%s\n", error->error_msg); + err_printf(&m, "Kernel: " UTS_RELEASE "\n"); ts = ktime_to_timespec64(error->time); - err_printf(m, "Time: %lld s %ld us\n", + err_printf(&m, "Time: %lld s %ld us\n", (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC); ts = ktime_to_timespec64(error->boottime); - err_printf(m, "Boottime: %lld s %ld us\n", + err_printf(&m, "Boottime: %lld s %ld us\n", (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC); ts = ktime_to_timespec64(error->uptime); - err_printf(m, "Uptime: %lld s %ld us\n", + err_printf(&m, "Uptime: %lld s %ld us\n", (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC); - err_printf(m, "Epoch: %lu jiffies (%u HZ)\n", error->epoch, HZ); - err_printf(m, "Capture: %lu jiffies; %d ms ago, %d ms after epoch\n", + err_printf(&m, "Epoch: %lu jiffies (%u HZ)\n", error->epoch, HZ); + err_printf(&m, "Capture: %lu jiffies; %d ms ago, %d ms after epoch\n", error->capture, jiffies_to_msecs(jiffies - error->capture), jiffies_to_msecs(error->capture - error->epoch)); @@ -665,63 +684,63 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, for (i = 0; i < ARRAY_SIZE(error->engine); i++) { if (error->engine[i].hangcheck_stalled && error->engine[i].context.pid) { - err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n", - engine_name(m->i915, i), + err_printf(&m, "Active process (on ring %s): %s [%d], score %d%s\n", + engine_name(m.i915, i), error->engine[i].context.comm, error->engine[i].context.pid, error->engine[i].context.ban_score, bannable(&error->engine[i].context)); } } - err_printf(m, "Reset count: %u\n", error->reset_count); - err_printf(m, "Suspend count: %u\n", error->suspend_count); - err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform)); - err_print_pciid(m, error->i915); + err_printf(&m, "Reset count: %u\n", error->reset_count); + err_printf(&m, "Suspend count: %u\n", error->suspend_count); + err_printf(&m, "Platform: %s\n", intel_platform_name(error->device_info.platform)); + err_print_pciid(&m, m.i915); - err_printf(m, "IOMMU enabled?: %d\n", error->iommu); + err_printf(&m, "IOMMU enabled?: %d\n", error->iommu); - if (HAS_CSR(dev_priv)) { - struct intel_csr *csr = &dev_priv->csr; + if (HAS_CSR(m.i915)) { + struct intel_csr *csr = &m.i915->csr; - err_printf(m, "DMC loaded: %s\n", + err_printf(&m, "DMC loaded: %s\n", yesno(csr->dmc_payload != NULL)); - err_printf(m, "DMC fw version: %d.%d\n", + err_printf(&m, "DMC fw version: %d.%d\n", CSR_VERSION_MAJOR(csr->version), CSR_VERSION_MINOR(csr->version)); } - err_printf(m, "GT awake: %s\n", yesno(error->awake)); - err_printf(m, "RPM wakelock: %s\n", yesno(error->wakelock)); - err_printf(m, "PM suspended: %s\n", yesno(error->suspended)); - err_printf(m, "EIR: 0x%08x\n", error->eir); - err_printf(m, "IER: 0x%08x\n", error->ier); + err_printf(&m, "GT awake: %s\n", yesno(error->awake)); + err_printf(&m, "RPM wakelock: %s\n", yesno(error->wakelock)); + err_printf(&m, "PM suspended: %s\n", yesno(error->suspended)); + err_printf(&m, "EIR: 0x%08x\n", error->eir); + err_printf(&m, "IER: 0x%08x\n", error->ier); for (i = 0; i < error->ngtier; i++) - err_printf(m, "GTIER[%d]: 0x%08x\n", i, error->gtier[i]); - err_printf(m, "PGTBL_ER: 0x%08x\n", error->pgtbl_er); - err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake); - err_printf(m, "DERRMR: 0x%08x\n", error->derrmr); - err_printf(m, "CCID: 0x%08x\n", error->ccid); - err_printf(m, "Missed interrupts: 0x%08lx\n", dev_priv->gpu_error.missed_irq_rings); + err_printf(&m, "GTIER[%d]: 0x%08x\n", i, error->gtier[i]); + err_printf(&m, "PGTBL_ER: 0x%08x\n", error->pgtbl_er); + err_printf(&m, "FORCEWAKE: 0x%08x\n", error->forcewake); + err_printf(&m, "DERRMR: 0x%08x\n", error->derrmr); + err_printf(&m, "CCID: 0x%08x\n", error->ccid); + err_printf(&m, "Missed interrupts: 0x%08lx\n", m.i915->gpu_error.missed_irq_rings); for (i = 0; i < error->nfence; i++) - err_printf(m, " fence[%d] = %08llx\n", i, error->fence[i]); + err_printf(&m, " fence[%d] = %08llx\n", i, error->fence[i]); - if (INTEL_GEN(dev_priv) >= 6) { - err_printf(m, "ERROR: 0x%08x\n", error->error); + if (INTEL_GEN(m.i915) >= 6) { + err_printf(&m, "ERROR: 0x%08x\n", error->error); - if (INTEL_GEN(dev_priv) >= 8) - err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n", + if (INTEL_GEN(m.i915) >= 8) + err_printf(&m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n", error->fault_data1, error->fault_data0); - err_printf(m, "DONE_REG: 0x%08x\n", error->done_reg); + err_printf(&m, "DONE_REG: 0x%08x\n", error->done_reg); } - if (IS_GEN7(dev_priv)) - err_printf(m, "ERR_INT: 0x%08x\n", error->err_int); + if (IS_GEN7(m.i915)) + err_printf(&m, "ERR_INT: 0x%08x\n", error->err_int); for (i = 0; i < ARRAY_SIZE(error->engine); i++) { if (error->engine[i].engine_id != -1) - error_print_engine(m, &error->engine[i], error->epoch); + error_print_engine(&m, &error->engine[i], error->epoch); } for (i = 0; i < ARRAY_SIZE(error->active_vm); i++) { @@ -738,16 +757,16 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, len += scnprintf(buf + len, sizeof(buf), "%s%s", first ? "" : ", ", - dev_priv->engine[j]->name); + m.i915->engine[j]->name); first = 0; } scnprintf(buf + len, sizeof(buf), ")"); - print_error_buffers(m, buf, + print_error_buffers(&m, buf, error->active_bo[i], error->active_bo_count[i]); } - print_error_buffers(m, "Pinned (global)", + print_error_buffers(&m, "Pinned (global)", error->pinned_bo, error->pinned_bo_count); @@ -756,115 +775,163 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, obj = ee->batchbuffer; if (obj) { - err_puts(m, dev_priv->engine[i]->name); + err_puts(&m, m.i915->engine[i]->name); if (ee->context.pid) - err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d%s)", + err_printf(&m, " (submitted by %s [%d], ctx %d [%d], score %d%s)", ee->context.comm, ee->context.pid, ee->context.handle, ee->context.hw_id, ee->context.ban_score, bannable(&ee->context)); - err_printf(m, " --- gtt_offset = 0x%08x %08x\n", + err_printf(&m, " --- gtt_offset = 0x%08x %08x\n", upper_32_bits(obj->gtt_offset), lower_32_bits(obj->gtt_offset)); - print_error_obj(m, dev_priv->engine[i], NULL, obj); + print_error_obj(&m, m.i915->engine[i], NULL, obj); } for (j = 0; j < ee->user_bo_count; j++) - print_error_obj(m, dev_priv->engine[i], + print_error_obj(&m, m.i915->engine[i], "user", ee->user_bo[j]); if (ee->num_requests) { - err_printf(m, "%s --- %d requests\n", - dev_priv->engine[i]->name, + err_printf(&m, "%s --- %d requests\n", + m.i915->engine[i]->name, ee->num_requests); for (j = 0; j < ee->num_requests; j++) - error_print_request(m, " ", + error_print_request(&m, " ", &ee->requests[j], error->epoch); } if (IS_ERR(ee->waiters)) { - err_printf(m, "%s --- ? waiters [unable to acquire spinlock]\n", - dev_priv->engine[i]->name); + err_printf(&m, "%s --- ? waiters [unable to acquire spinlock]\n", + m.i915->engine[i]->name); } else if (ee->num_waiters) { - err_printf(m, "%s --- %d waiters\n", - dev_priv->engine[i]->name, + err_printf(&m, "%s --- %d waiters\n", + m.i915->engine[i]->name, ee->num_waiters); for (j = 0; j < ee->num_waiters; j++) { - err_printf(m, " seqno 0x%08x for %s [%d]\n", + err_printf(&m, " seqno 0x%08x for %s [%d]\n", ee->waiters[j].seqno, ee->waiters[j].comm, ee->waiters[j].pid); } } - print_error_obj(m, dev_priv->engine[i], + print_error_obj(&m, m.i915->engine[i], "ringbuffer", ee->ringbuffer); - print_error_obj(m, dev_priv->engine[i], + print_error_obj(&m, m.i915->engine[i], "HW Status", ee->hws_page); - print_error_obj(m, dev_priv->engine[i], + print_error_obj(&m, m.i915->engine[i], "HW context", ee->ctx); - print_error_obj(m, dev_priv->engine[i], + print_error_obj(&m, m.i915->engine[i], "WA context", ee->wa_ctx); - print_error_obj(m, dev_priv->engine[i], + print_error_obj(&m, m.i915->engine[i], "WA batchbuffer", ee->wa_batchbuffer); - print_error_obj(m, dev_priv->engine[i], + print_error_obj(&m, m.i915->engine[i], "NULL context", ee->default_state); } if (error->overlay) - intel_overlay_print_error_state(m, error->overlay); + intel_overlay_print_error_state(&m, error->overlay); if (error->display) - intel_display_print_error_state(m, error->display); + intel_display_print_error_state(&m, error->display); - err_print_capabilities(m, &error->device_info, &error->driver_caps); - err_print_params(m, &error->params); - err_print_uc(m, &error->uc); + err_print_capabilities(&m, &error->device_info, &error->driver_caps); + err_print_params(&m, &error->params); + err_print_uc(&m, &error->uc); - if (m->bytes == 0 && m->err) - return m->err; + if (m.buf) { + __sg_set_buf(m.cur++, m.buf, m.bytes, m.iter); + m.bytes = 0; + m.buf = 0; + } + if (m.cur) { + GEM_BUG_ON(m.end < m.cur); + sg_mark_end(m.cur - 1); + } + GEM_BUG_ON(m.sgl && !m.cur); + + if (m.err) { + err_free_sgl(m.sgl); + return m.err; + } + + if (cmpxchg(&error->sgl, NULL, m.sgl)) + err_free_sgl(m.sgl); return 0; } -int i915_error_state_buf_init(struct drm_i915_error_state_buf *ebuf, - struct drm_i915_private *i915, - size_t count, loff_t pos) +ssize_t i915_gpu_state_copy_to_buffer(struct i915_gpu_state *error, + char *buf, loff_t off, size_t rem) { - memset(ebuf, 0, sizeof(*ebuf)); - ebuf->i915 = i915; + struct scatterlist *sg; + size_t count; + loff_t pos; + int err; - /* We need to have enough room to store any i915_error_state printf - * so that we can move it to start position. - */ - ebuf->size = count + 1 > PAGE_SIZE ? count + 1 : PAGE_SIZE; - ebuf->buf = kmalloc(ebuf->size, - GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); + if (!error || !rem) + return 0; - if (ebuf->buf == NULL) { - ebuf->size = PAGE_SIZE; - ebuf->buf = kmalloc(ebuf->size, GFP_KERNEL); - } + err = err_print_to_sgl(error); + if (err) + return err; - if (ebuf->buf == NULL) { - ebuf->size = 128; - ebuf->buf = kmalloc(ebuf->size, GFP_KERNEL); - } + sg = READ_ONCE(error->fit); + if (!sg || off < sg->dma_address) + sg = error->sgl; + if (!sg) + return 0; - if (ebuf->buf == NULL) - return -ENOMEM; + pos = sg->dma_address; + count = 0; + do { + size_t len, start; - ebuf->start = pos; + if (sg_is_chain(sg)) { + sg = sg_chain_ptr(sg); + GEM_BUG_ON(sg_is_chain(sg)); + } - return 0; + len = sg->length; + if (pos + len <= off) { + pos += len; + continue; + } + + start = sg->offset; + if (pos < off) { + GEM_BUG_ON(off - pos > len); + len -= off - pos; + start += off - pos; + pos = off; + } + + len = min(len, rem); + GEM_BUG_ON(!len || len > sg->length); + + memcpy(buf, page_address(sg_page(sg)) + start, len); + + count += len; + pos += len; + + buf += len; + rem -= len; + if (!rem) { + WRITE_ONCE(error->fit, sg); + break; + } + } while (!sg_is_last(sg++)); + + return count; } static void i915_error_object_free(struct drm_i915_error_object *obj) @@ -937,6 +1004,7 @@ void __i915_gpu_state_free(struct kref *error_ref) cleanup_params(error); cleanup_uc_state(error); + err_free_sgl(error->sgl); kfree(error); } diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h index f893a4e8b783..1c1bc0c23468 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.h +++ b/drivers/gpu/drm/i915/i915_gpu_error.h @@ -191,6 +191,8 @@ struct i915_gpu_state { } *active_bo[I915_NUM_ENGINES], *pinned_bo; u32 active_bo_count[I915_NUM_ENGINES], pinned_bo_count; struct i915_address_space *active_vm[I915_NUM_ENGINES]; + + struct scatterlist *sgl, *fit; }; struct i915_gpu_error { @@ -297,29 +299,20 @@ struct i915_gpu_error { struct drm_i915_error_state_buf { struct drm_i915_private *i915; - unsigned int bytes; - unsigned int size; + struct scatterlist *sgl, *cur, *end; + + char *buf; + size_t bytes; + size_t size; + loff_t iter; + int err; - u8 *buf; - loff_t start; - loff_t pos; }; #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) __printf(2, 3) void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...); -int i915_error_state_to_str(struct drm_i915_error_state_buf *estr, - const struct i915_gpu_state *gpu); -int i915_error_state_buf_init(struct drm_i915_error_state_buf *eb, - struct drm_i915_private *i915, - size_t count, loff_t pos); - -static inline void -i915_error_state_buf_release(struct drm_i915_error_state_buf *eb) -{ - kfree(eb->buf); -} struct i915_gpu_state *i915_capture_gpu_state(struct drm_i915_private *i915); void i915_capture_error_state(struct drm_i915_private *dev_priv, @@ -333,6 +326,9 @@ i915_gpu_state_get(struct i915_gpu_state *gpu) return gpu; } +ssize_t i915_gpu_state_copy_to_buffer(struct i915_gpu_state *error, + char *buf, loff_t offset, size_t count); + void __i915_gpu_state_free(struct kref *kref); static inline void i915_gpu_state_put(struct i915_gpu_state *gpu) { diff --git a/drivers/gpu/drm/i915/i915_sysfs.c b/drivers/gpu/drm/i915/i915_sysfs.c index e5e6f6bb2b05..ae63a7d0f51d 100644 --- a/drivers/gpu/drm/i915/i915_sysfs.c +++ b/drivers/gpu/drm/i915/i915_sysfs.c @@ -516,26 +516,21 @@ static ssize_t error_state_read(struct file *filp, struct kobject *kobj, { struct device *kdev = kobj_to_dev(kobj); - struct drm_i915_private *dev_priv = kdev_minor_to_i915(kdev); - struct drm_i915_error_state_buf error_str; + struct drm_i915_private *i915 = kdev_minor_to_i915(kdev); struct i915_gpu_state *gpu; ssize_t ret; - ret = i915_error_state_buf_init(&error_str, dev_priv, count, off); - if (ret) - return ret; - - gpu = i915_first_error_state(dev_priv); - ret = i915_error_state_to_str(&error_str, gpu); - if (ret) - goto out; - - ret = count < error_str.bytes ? count : error_str.bytes; - memcpy(buf, error_str.buf, ret); + gpu = i915_first_error_state(i915); + if (gpu) { + ret = i915_gpu_state_copy_to_buffer(gpu, buf, off, count); + i915_gpu_state_put(gpu); + } else { + const char *str = "No error state collected\n"; + size_t len = strlen(str); -out: - i915_gpu_state_put(gpu); - i915_error_state_buf_release(&error_str); + ret = min_t(size_t, count, len - off); + memcpy(buf, str + off, ret); + } return ret; }