From patchwork Tue Nov 15 14:36:34 2016 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Mika Kuoppala X-Patchwork-Id: 9429927 Return-Path: Received: from mail.wl.linuxfoundation.org (pdx-wl-mail.web.codeaurora.org [172.30.200.125]) by pdx-korg-patchwork.web.codeaurora.org (Postfix) with ESMTP id B454160484 for ; Tue, 15 Nov 2016 14:37:02 +0000 (UTC) Received: from mail.wl.linuxfoundation.org (localhost [127.0.0.1]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id A02DF28950 for ; Tue, 15 Nov 2016 14:37:02 +0000 (UTC) Received: by mail.wl.linuxfoundation.org (Postfix, from userid 486) id 94F3628B3E; Tue, 15 Nov 2016 14:37:02 +0000 (UTC) X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on pdx-wl-mail.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-4.2 required=2.0 tests=BAYES_00, RCVD_IN_DNSWL_MED autolearn=ham version=3.3.1 Received: from gabe.freedesktop.org (gabe.freedesktop.org [131.252.210.177]) (using TLSv1.2 with cipher DHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by mail.wl.linuxfoundation.org (Postfix) with ESMTPS id 2168D28956 for ; Tue, 15 Nov 2016 14:37:02 +0000 (UTC) Received: from gabe.freedesktop.org (localhost [127.0.0.1]) by gabe.freedesktop.org (Postfix) with ESMTP id B8BD36E5C8; Tue, 15 Nov 2016 14:37:01 +0000 (UTC) X-Original-To: intel-gfx@lists.freedesktop.org Delivered-To: intel-gfx@lists.freedesktop.org Received: from mga02.intel.com (mga02.intel.com [134.134.136.20]) by gabe.freedesktop.org (Postfix) with ESMTPS id 07E9E6E5BD for ; Tue, 15 Nov 2016 14:36:49 +0000 (UTC) Received: from fmsmga003.fm.intel.com ([10.253.24.29]) by orsmga101.jf.intel.com with ESMTP; 15 Nov 2016 06:36:48 -0800 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.31,495,1473145200"; d="scan'208";a="786686332" Received: from rosetta.fi.intel.com ([10.237.72.176]) by FMSMGA003.fm.intel.com with ESMTP; 15 Nov 2016 06:36:46 -0800 Received: by 
rosetta.fi.intel.com (Postfix, from userid 1000) id 03303840012; Tue, 15 Nov 2016 16:36:37 +0200 (EET) From: Mika Kuoppala To: intel-gfx@lists.freedesktop.org Date: Tue, 15 Nov 2016 16:36:34 +0200 Message-Id: <1479220596-2784-4-git-send-email-mika.kuoppala@intel.com> X-Mailer: git-send-email 2.7.4 In-Reply-To: <1479220596-2784-1-git-send-email-mika.kuoppala@intel.com> References: <1479220596-2784-1-git-send-email-mika.kuoppala@intel.com> Subject: [Intel-gfx] [PATCH 4/6] drm/i915: Add bannable context parameter X-BeenThere: intel-gfx@lists.freedesktop.org X-Mailman-Version: 2.1.18 Precedence: list List-Id: Intel graphics driver community testing & development List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , MIME-Version: 1.0 Errors-To: intel-gfx-bounces@lists.freedesktop.org Sender: "Intel-gfx" X-Virus-Scanned: ClamAV using ClamSMTP Now that the driver has per-context scoring of 'hanging badness', and subsequent hangs during short windows are allowed if progress is made in between, it no longer makes sense to expose a ban timing window as a context parameter. Let the scoring be the sole indicator for ban policy, and replace the ban period context parameter with a boolean to get/set the context's bannable property. 
Cc: Chris Wilson Suggested-by: Chris Wilson Signed-off-by: Mika Kuoppala Reviewed-by: Chris Wilson --- drivers/gpu/drm/i915/i915_drv.h | 14 +++----------- drivers/gpu/drm/i915/i915_gem.c | 10 +--------- drivers/gpu/drm/i915/i915_gem_context.c | 23 ++++++++++++++--------- drivers/gpu/drm/i915/i915_gpu_error.c | 5 +++-- include/uapi/drm/i915_drm.h | 1 + 5 files changed, 22 insertions(+), 31 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 6bc9d0b..5af1c38 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -832,6 +832,7 @@ struct drm_i915_error_state { long jiffies; pid_t pid; u32 context; + int ban_score; u32 seqno; u32 head; u32 tail; @@ -891,16 +892,10 @@ struct i915_ctx_hang_stats { /* This context had batch active when hang was declared */ unsigned batch_active; - /* Time when this context was last blamed for a GPU reset */ - unsigned long guilty_ts; - - /* If the contexts causes a second GPU hang within this time, - * it is permanently banned from submitting any more work. 
- */ - unsigned long ban_period_seconds; + bool bannable:1; /* This context is banned to submit more work */ - bool banned; + bool banned:1; /* Accumulated score of hangs caused by this context */ int ban_score; @@ -1437,9 +1432,6 @@ struct i915_gpu_error { #define DRM_I915_STUCK_PERIOD_SEC 24 /* No observed seqno progress */ #define DRM_I915_HUNG_PERIOD_SEC 4 /* No observed seqno nor head progress */ -/* Hang gpu twice in this window and your context gets banned */ -#define DRM_I915_CTX_BAN_PERIOD_SEC 12 - #define HANGCHECK_STUCK_JIFFIES (DRM_I915_STUCK_PERIOD_SEC * HZ) #define HANGCHECK_HUNG_JIFFIES (DRM_I915_HUNG_PERIOD_SEC * HZ) #define HANGCHECK_PERIOD_JIFFIES msecs_to_jiffies(1500) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 42a0f96..40a9e10 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2621,20 +2621,13 @@ void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj, static bool i915_context_is_banned(const struct i915_gem_context *ctx) { const struct i915_ctx_hang_stats *hs = &ctx->hang_stats; - unsigned long elapsed; if (hs->banned) return true; - if (!hs->ban_period_seconds) + if (!hs->bannable) return false; - elapsed = get_seconds() - hs->guilty_ts; - if (elapsed <= hs->ban_period_seconds) { - DRM_DEBUG("context hanging too fast, banning!\n"); - return true; - } - if (hs->ban_score >= 40) { DRM_DEBUG("context hanging too often, banning!\n"); return true; @@ -2651,7 +2644,6 @@ static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx) hs->banned = i915_context_is_banned(ctx); hs->batch_active++; - hs->guilty_ts = get_seconds(); } static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx) diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 958a526..9abaae4 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -331,7 +331,7 @@ __create_hw_context(struct 
drm_device *dev, * is no remap info, it will be a NOP. */ ctx->remap_slice = ALL_L3_SLICES(dev_priv); - ctx->hang_stats.ban_period_seconds = DRM_I915_CTX_BAN_PERIOD_SEC; + ctx->hang_stats.bannable = true; ctx->ring_size = 4 * PAGE_SIZE; ctx->desc_template = GEN8_CTX_ADDRESSING_MODE(dev_priv) << GEN8_CTX_ADDRESSING_MODE_SHIFT; @@ -1085,7 +1085,7 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, args->size = 0; switch (args->param) { case I915_CONTEXT_PARAM_BAN_PERIOD: - args->value = ctx->hang_stats.ban_period_seconds; + ret = -EINVAL; break; case I915_CONTEXT_PARAM_NO_ZEROMAP: args->value = ctx->flags & CONTEXT_NO_ZEROMAP; @@ -1101,6 +1101,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE: args->value = !!(ctx->flags & CONTEXT_NO_ERROR_CAPTURE); break; + case I915_CONTEXT_PARAM_BANNABLE: + args->value = ctx->hang_stats.bannable; + break; default: ret = -EINVAL; break; @@ -1130,13 +1133,7 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data, switch (args->param) { case I915_CONTEXT_PARAM_BAN_PERIOD: - if (args->size) - ret = -EINVAL; - else if (args->value < ctx->hang_stats.ban_period_seconds && - !capable(CAP_SYS_ADMIN)) - ret = -EPERM; - else - ctx->hang_stats.ban_period_seconds = args->value; + ret = -EINVAL; break; case I915_CONTEXT_PARAM_NO_ZEROMAP: if (args->size) { @@ -1156,6 +1153,14 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data, ctx->flags &= ~CONTEXT_NO_ERROR_CAPTURE; } break; + case I915_CONTEXT_PARAM_BANNABLE: + if (args->size) + ret = -EINVAL; + else if (!capable(CAP_SYS_ADMIN)) + ret = -EPERM; + else + ctx->hang_stats.bannable = args->value; + break; default: ret = -EINVAL; break; diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 8d0f2bc..5d2e233 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -352,8 +352,8 @@ static 
void error_print_request(struct drm_i915_error_state_buf *m, if (!erq->seqno) return; - err_printf(m, "%s pid %d, seqno %8x:%08x, emitted %dms ago, head %08x, tail %08x\n", - prefix, erq->pid, + err_printf(m, "%s pid %d, ban score %d, seqno %8x:%08x, emitted %dms ago, head %08x, tail %08x\n", + prefix, erq->pid, erq->ban_score, erq->context, erq->seqno, jiffies_to_msecs(jiffies - erq->jiffies), erq->head, erq->tail); @@ -1168,6 +1168,7 @@ static void record_request(struct drm_i915_gem_request *request, struct drm_i915_error_request *erq) { erq->context = request->ctx->hw_id; + erq->ban_score = request->ctx->hang_stats.ban_score; erq->seqno = request->global_seqno; erq->jiffies = request->emitted_jiffies; erq->head = request->head; diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 1c12a35..12003f0 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1224,6 +1224,7 @@ struct drm_i915_gem_context_param { #define I915_CONTEXT_PARAM_NO_ZEROMAP 0x2 #define I915_CONTEXT_PARAM_GTT_SIZE 0x3 #define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE 0x4 +#define I915_CONTEXT_PARAM_BANNABLE 0x5 __u64 value; };