[RFC,08/12] drm/i915: Expose per-engine client busyness

Message ID 20200309183129.2296-9-tvrtko.ursulin@linux.intel.com (mailing list archive)
State New, archived
Series Per client engine busyness

Commit Message

Tvrtko Ursulin March 9, 2020, 6:31 p.m. UTC
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Expose per-client and per-engine busyness under the previously added sysfs
client root.

The new files, one per engine class, are located under the 'busy'
directory. Each contains a monotonically increasing, nanosecond-resolution
total of the time that client's jobs have been executing on the GPU.
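
For illustration, assuming the per-client sysfs root added earlier in the
series sits under /sys/class/drm/card0/clients/<id>/ (the exact base path
comes from the preceding patches), a client exposing all four classes would
look like:

  /sys/class/drm/card0/clients/7/
  ├── name
  ├── pid
  └── busy/
      ├── 0    (render)
      ├── 1    (copy)
      ├── 2    (video)
      └── 3    (video enhance)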

This enables userspace to create a top-like tool for GPU utilization:

==========================================================================
intel-gpu-top -  935/ 935 MHz;    0% RC6; 14.73 Watts;     1097 irqs/s

      IMC reads:     1401 MiB/s
     IMC writes:        4 MiB/s

          ENGINE      BUSY                                 MI_SEMA MI_WAIT
     Render/3D/0   63.73% |███████████████████           |      3%      0%
       Blitter/0    9.53% |██▊                           |      6%      0%
         Video/0   39.32% |███████████▊                  |     16%      0%
         Video/1   15.62% |████▋                         |      0%      0%
  VideoEnhance/0    0.00% |                              |      0%      0%

  PID            NAME     RCS          BCS          VCS         VECS
 4084        gem_wsim |█████▌     ||█          ||           ||           |
 4086        gem_wsim |█▌         ||           ||███        ||           |
==========================================================================
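
As a rough sketch of how a userspace tool might consume one of these files
(the sysfs path and client id below are assumptions for illustration, not
defined by this patch), sampling the counter twice over a known interval
gives a utilisation percentage:

/*
 * Hypothetical example: sample one client's render-class (file "0")
 * busyness twice and convert the delta to a utilisation percentage.
 * The path and client id are assumptions based on the sysfs client
 * root added earlier in the series.
 */
#include <stdio.h>
#include <unistd.h>

static unsigned long long read_busy_ns(const char *path)
{
	unsigned long long v = 0;
	FILE *f = fopen(path, "r");

	if (f) {
		if (fscanf(f, "%llu", &v) != 1)
			v = 0;
		fclose(f);
	}

	return v;
}

int main(void)
{
	const char *path = "/sys/class/drm/card0/clients/7/busy/0";
	unsigned long long t0, t1;

	t0 = read_busy_ns(path);
	sleep(1);
	t1 = read_busy_ns(path);

	/* One second of wall time elapsed, so busy ns / 1e9 is the load. */
	printf("Render/3D busy: %.2f%%\n", (t1 - t0) / 1e9 * 100.0);

	return 0;
}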

v2: Use intel_context_engine_get_busy_time.
v3: New directory structure.
v4: Rebase.
v5: sysfs_attr_init.
v6: Small tidy in i915_gem_add_client.
v7: Rebase to be engine class based.
v8:
 * Always enable stats.
 * Walk all client contexts.
v9:
 * Skip unsupported engine classes. (Chris)
 * Use scheduler caps. (Chris)
v10:
 * Use pphwsp runtime only.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_drm_client.c | 90 +++++++++++++++++++++++++-
 drivers/gpu/drm/i915/i915_drm_client.h | 11 ++++
 2 files changed, 100 insertions(+), 1 deletion(-)

Comments

Chris Wilson March 10, 2020, 6:32 p.m. UTC | #1
Quoting Tvrtko Ursulin (2020-03-09 18:31:25)
> +static ssize_t
> +show_client_busy(struct device *kdev, struct device_attribute *attr, char *buf)
> +{
> +       struct i915_engine_busy_attribute *i915_attr =
> +               container_of(attr, typeof(*i915_attr), attr);
> +       unsigned int class = i915_attr->engine_class;
> +       struct i915_drm_client *client = i915_attr->client;
> +       u64 total = atomic64_read(&client->past_runtime[class]);
> +       struct list_head *list = &client->ctx_list;
> +       struct i915_gem_context *ctx;
> +
> +       rcu_read_lock();
> +       list_for_each_entry_rcu(ctx, list, client_link) {
> +               total += atomic64_read(&ctx->past_runtime[class]);
> +               total += pphwsp_busy_add(ctx, class);
> +       }
> +       rcu_read_unlock();
> +
> +       total *= RUNTIME_INFO(i915_attr->i915)->cs_timestamp_period_ns;

Planning early retirement? In 600 years, they'll have forgotten how to
email ;)
-Chris
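
(For reference: 2^64 ns is about 1.8 x 10^10 seconds, roughly 585 years, so
a 64-bit nanosecond total effectively never wraps; hence the "600 years"
quip.)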
Tvrtko Ursulin March 10, 2020, 8:04 p.m. UTC | #2
On 10/03/2020 18:32, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2020-03-09 18:31:25)
>> +static ssize_t
>> +show_client_busy(struct device *kdev, struct device_attribute *attr, char *buf)
>> +{
>> +       struct i915_engine_busy_attribute *i915_attr =
>> +               container_of(attr, typeof(*i915_attr), attr);
>> +       unsigned int class = i915_attr->engine_class;
>> +       struct i915_drm_client *client = i915_attr->client;
>> +       u64 total = atomic64_read(&client->past_runtime[class]);
>> +       struct list_head *list = &client->ctx_list;
>> +       struct i915_gem_context *ctx;
>> +
>> +       rcu_read_lock();
>> +       list_for_each_entry_rcu(ctx, list, client_link) {
>> +               total += atomic64_read(&ctx->past_runtime[class]);
>> +               total += pphwsp_busy_add(ctx, class);
>> +       }
>> +       rcu_read_unlock();
>> +
>> +       total *= RUNTIME_INFO(i915_attr->i915)->cs_timestamp_period_ns;
> 
> Planning early retirement? In 600 years, they'll have forgotten how to
> email ;)

Shruggety shrug. :) I am guessing you would prefer both internal 
representations (sw and pphwsp runtimes) to be consistently in 
nanoseconds? I thought why multiply at various places when once at the 
readout time is enough.

And I should mention again how I am not sure at the moment how to meld 
the two stats into one more "perfect" output.

Regards,

Tvrtko
Chris Wilson March 10, 2020, 8:12 p.m. UTC | #3
Quoting Tvrtko Ursulin (2020-03-10 20:04:23)
> 
> On 10/03/2020 18:32, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2020-03-09 18:31:25)
> >> +static ssize_t
> >> +show_client_busy(struct device *kdev, struct device_attribute *attr, char *buf)
> >> +{
> >> +       struct i915_engine_busy_attribute *i915_attr =
> >> +               container_of(attr, typeof(*i915_attr), attr);
> >> +       unsigned int class = i915_attr->engine_class;
> >> +       struct i915_drm_client *client = i915_attr->client;
> >> +       u64 total = atomic64_read(&client->past_runtime[class]);
> >> +       struct list_head *list = &client->ctx_list;
> >> +       struct i915_gem_context *ctx;
> >> +
> >> +       rcu_read_lock();
> >> +       list_for_each_entry_rcu(ctx, list, client_link) {
> >> +               total += atomic64_read(&ctx->past_runtime[class]);
> >> +               total += pphwsp_busy_add(ctx, class);
> >> +       }
> >> +       rcu_read_unlock();
> >> +
> >> +       total *= RUNTIME_INFO(i915_attr->i915)->cs_timestamp_period_ns;
> > 
> > Planning early retirement? In 600 years, they'll have forgotten how to
> > email ;)
> 
> Shruggety shrug. :) I am guessing you would prefer both internal 
> representations (sw and pphwsp runtimes) to be consistently in 
> nanoseconds? I thought why multiply at various places when once at the 
> readout time is enough.

It's fine. I was just double checking overflow, and then remembered the
end result is 64b nanoseconds.

Keep the internal representation convenient for accumulation, and the
conversion at the boundary.
 
> And I should mention again how I am not sure at the moment how to meld 
> the two stats into one more "perfect" output.

One of the things that crossed my mind was wondering if it was possible
to throw in a pulse before reading the stats (if active etc). Usual
dilemma with non-preemptible contexts, so probably not worth it as those
hogs will remain hogs.

And I worry about the disparity between sw busy and hw runtime.
-Chris
Tvrtko Ursulin March 11, 2020, 10:17 a.m. UTC | #4
On 10/03/2020 20:12, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2020-03-10 20:04:23)
>>
>> On 10/03/2020 18:32, Chris Wilson wrote:
>>> Quoting Tvrtko Ursulin (2020-03-09 18:31:25)
>>>> +static ssize_t
>>>> +show_client_busy(struct device *kdev, struct device_attribute *attr, char *buf)
>>>> +{
>>>> +       struct i915_engine_busy_attribute *i915_attr =
>>>> +               container_of(attr, typeof(*i915_attr), attr);
>>>> +       unsigned int class = i915_attr->engine_class;
>>>> +       struct i915_drm_client *client = i915_attr->client;
>>>> +       u64 total = atomic64_read(&client->past_runtime[class]);
>>>> +       struct list_head *list = &client->ctx_list;
>>>> +       struct i915_gem_context *ctx;
>>>> +
>>>> +       rcu_read_lock();
>>>> +       list_for_each_entry_rcu(ctx, list, client_link) {
>>>> +               total += atomic64_read(&ctx->past_runtime[class]);
>>>> +               total += pphwsp_busy_add(ctx, class);
>>>> +       }
>>>> +       rcu_read_unlock();
>>>> +
>>>> +       total *= RUNTIME_INFO(i915_attr->i915)->cs_timestamp_period_ns;
>>>
>>> Planning early retirement? In 600 years, they'll have forgotten how to
>>> email ;)
>>
>> Shruggety shrug. :) I am guessing you would prefer both internal
>> representations (sw and pphwsp runtimes) to be consistently in
>> nanoseconds? I thought why multiply at various places when once at the
>> readout time is enough.
> 
> It's fine. I was just double checking overflow, and then remembered the
> end result is 64b nanoseconds.
> 
> Keep the internal representation convenient for accumulation, and the
> conversion at the boundary.
>   
>> And I should mention again how I am not sure at the moment how to meld
>> the two stats into one more "perfect" output.
> 
> One of the things that crossed my mind was wondering if it was possible
> to throw in a pulse before reading the stats (if active etc). Usual
> dilemma with non-preemptible contexts, so probably not worth it as those
> hogs will remain hogs.
> 
> And I worry about the disparity between sw busy and hw runtime.

How about I stop tracking accumulated sw runtime and just use it for the 
active portion. So reporting back hw runtime + sw active runtime. In 
other words sw tracking only covers the portion between context_in and 
context_out. Sounds worth a try.

Regards,

Tvrtko
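
A minimal sketch of that idea (the helper below is hypothetical, and
tick-to-nanosecond conversion is elided): the accumulated figure stays
pphwsp based, while the software clock only contributes the window a
context is currently running:

static u64 client_class_runtime(struct i915_drm_client *client,
				unsigned int class)
{
	u64 total = atomic64_read(&client->past_runtime[class]);
	struct i915_gem_context *ctx;

	rcu_read_lock();
	list_for_each_entry_rcu(ctx, &client->ctx_list, client_link) {
		/* Accumulated hw runtime from the pphwsp counters. */
		total += pphwsp_busy_add(ctx, class);
		/*
		 * Hypothetical helper: adds "now - context_in" for any
		 * context of this class still running on the hardware.
		 */
		total += active_busy_add(ctx, class);
	}
	rcu_read_unlock();

	return total;
}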

Patch

diff --git a/drivers/gpu/drm/i915/i915_drm_client.c b/drivers/gpu/drm/i915/i915_drm_client.c
index c9a510c6c6d4..6df5a21f5d4e 100644
--- a/drivers/gpu/drm/i915/i915_drm_client.c
+++ b/drivers/gpu/drm/i915/i915_drm_client.c
@@ -10,8 +10,13 @@ 
 
 #include <drm/drm_print.h>
 
+#include <uapi/drm/i915_drm.h>
+
 #include "i915_drv.h"
 #include "i915_drm_client.h"
+#include "gem/i915_gem_context.h"
+#include "gt/intel_engine_user.h"
+#include "i915_drv.h"
 #include "i915_gem.h"
 #include "i915_utils.h"
 
@@ -47,13 +52,61 @@  show_client_pid(struct device *kdev, struct device_attribute *attr, char *buf)
 	return ret;
 }
 
+static u64
+pphwsp_busy_add(struct i915_gem_context *ctx, unsigned int class)
+{
+	struct i915_gem_engines *engines = rcu_dereference(ctx->engines);
+	struct i915_gem_engines_iter it;
+	struct intel_context *ce;
+	u64 total = 0;
+
+	for_each_gem_engine(ce, engines, it) {
+		if (ce->engine->uabi_class == class)
+			total += ce->runtime.total;
+	}
+
+	return total;
+}
+
+static ssize_t
+show_client_busy(struct device *kdev, struct device_attribute *attr, char *buf)
+{
+	struct i915_engine_busy_attribute *i915_attr =
+		container_of(attr, typeof(*i915_attr), attr);
+	unsigned int class = i915_attr->engine_class;
+	struct i915_drm_client *client = i915_attr->client;
+	u64 total = atomic64_read(&client->past_runtime[class]);
+	struct list_head *list = &client->ctx_list;
+	struct i915_gem_context *ctx;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ctx, list, client_link) {
+		total += atomic64_read(&ctx->past_runtime[class]);
+		total += pphwsp_busy_add(ctx, class);
+	}
+	rcu_read_unlock();
+
+	total *= RUNTIME_INFO(i915_attr->i915)->cs_timestamp_period_ns;
+
+	return snprintf(buf, PAGE_SIZE, "%llu\n", total);
+}
+
+static const char *uabi_class_names[] = {
+	[I915_ENGINE_CLASS_RENDER] = "0",
+	[I915_ENGINE_CLASS_COPY] = "1",
+	[I915_ENGINE_CLASS_VIDEO] = "2",
+	[I915_ENGINE_CLASS_VIDEO_ENHANCE] = "3",
+};
+
 static int
 __i915_drm_client_register(struct i915_drm_client *client,
 			   struct task_struct *task)
 {
 	struct i915_drm_clients *clients = client->clients;
+	struct drm_i915_private *i915 =
+		container_of(clients, typeof(*i915), clients);
 	struct device_attribute *attr;
-	int ret = -ENOMEM;
+	int i, ret = -ENOMEM;
 	char idstr[32];
 	char *name;
 
@@ -92,8 +145,42 @@  __i915_drm_client_register(struct i915_drm_client *client,
 	if (ret)
 		goto err_attr;
 
+	if (HAS_LOGICAL_RING_CONTEXTS(i915)) {
+		client->busy_root =
+			kobject_create_and_add("busy", client->root);
+		if (!client->busy_root)
+			goto err_attr;
+
+		for (i = 0; i < ARRAY_SIZE(uabi_class_names); i++) {
+			struct i915_engine_busy_attribute *i915_attr =
+				&client->attr.busy[i];
+
+			if (!intel_engine_lookup_user(i915, i, 0))
+				continue;
+
+			i915_attr->client = client;
+			i915_attr->i915 = i915;
+			i915_attr->engine_class = i;
+
+			attr = &i915_attr->attr;
+
+			sysfs_attr_init(&attr->attr);
+
+			attr->attr.name = uabi_class_names[i];
+			attr->attr.mode = 0444;
+			attr->show = show_client_busy;
+
+			ret = sysfs_create_file(client->busy_root,
+						(struct attribute *)attr);
+			if (ret)
+				goto err_busy;
+		}
+	}
+
 	return 0;
 
+err_busy:
+	kobject_put(client->busy_root);
 err_attr:
 	kobject_put(client->root);
 err_client:
@@ -113,6 +200,7 @@  __i915_drm_client_unregister(struct i915_drm_client *client)
 	if (!client->root)
 		return; /* fbdev client or error during drm open */
 
+	kobject_put(fetch_and_zero(&client->busy_root));
 	kobject_put(fetch_and_zero(&client->root));
 }
 
diff --git a/drivers/gpu/drm/i915/i915_drm_client.h b/drivers/gpu/drm/i915/i915_drm_client.h
index 0a9f2c0c12dd..da83259170e7 100644
--- a/drivers/gpu/drm/i915/i915_drm_client.h
+++ b/drivers/gpu/drm/i915/i915_drm_client.h
@@ -28,6 +28,15 @@  struct i915_drm_clients {
 	struct kobject *root;
 };
 
+struct i915_drm_client;
+
+struct i915_engine_busy_attribute {
+	struct device_attribute attr;
+	struct drm_i915_private *i915;
+	struct i915_drm_client *client;
+	unsigned int engine_class;
+};
+
 struct i915_drm_client {
 	struct kref kref;
 
@@ -46,9 +55,11 @@  struct i915_drm_client {
 	struct i915_drm_clients *clients;
 
 	struct kobject *root;
+	struct kobject *busy_root;
 	struct {
 		struct device_attribute pid;
 		struct device_attribute name;
+		struct i915_engine_busy_attribute busy[MAX_ENGINE_CLASS + 1];
 	} attr;
 
 	/**