diff mbox series

[RFC,157/162] drm/i915: Improve accuracy of eviction stats

Message ID 20201127120718.454037-158-matthew.auld@intel.com (mailing list archive)
State New, archived
Headers show
Series DG1 + LMEM enabling | expand

Commit Message

Matthew Auld Nov. 27, 2020, 12:07 p.m. UTC
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Current code uses jiffie time to do the accounting and then does:

  diff = jiffies - start;
  msec = diff * 1000 / HZ;
  ...
  atomic_long_add(msec, &i915->time_swap_out_ms);

If we assume jiffie can be as non-granular as 10ms and that the current
accounting records all evictions faster than one jiffie as infinite speed,
we can end up over-estimating the reported eviction throughput.

Fix this by accumulating ktime_t and only dividing to more user friendly
granularity at presentation time (debugfs read).

At the same time consolidate the code a bit and convert from multiple
atomics to single seqlock per stat.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: CQ Tang <cq.tang@intel.com>
Cc: Sudeep Dutt <sudeep.dutt@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/gem/i915_gem_region.c | 67 ++++++++++----------
 drivers/gpu/drm/i915/i915_debugfs.c        | 73 +++++++++++-----------
 drivers/gpu/drm/i915/i915_drv.h            | 25 +++++---
 drivers/gpu/drm/i915/i915_gem.c            |  5 ++
 4 files changed, 90 insertions(+), 80 deletions(-)

Comments

Chris Wilson Nov. 27, 2020, 2:40 p.m. UTC | #1
Quoting Matthew Auld (2020-11-27 12:07:13)
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> Current code uses jiffie time to do the accounting and then does:
> 
>   diff = jiffies - start;
>   msec = diff * 1000 / HZ;
>   ...
>   atomic_long_add(msec, &i915->time_swap_out_ms);
> 
> If we assume jiffie can be as non-granular as 10ms and that the current
> accounting records all evictions faster than one jiffie as infinite speed,
> we can end up over-estimating the reported eviction throughput.
> 
> Fix this by accumulating ktime_t and only dividing to more user friendly
> granularity at presentation time (debugfs read).
> 
> At the same time consolidate the code a bit and convert from multiple
> atomics to single seqlock per stat.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: CQ Tang <cq.tang@intel.com>
> Cc: Sudeep Dutt <sudeep.dutt@intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>

A lot of effort to fix up patches after the fact, might as well make it
a real PMU interface.
-Chris
Tvrtko Ursulin Nov. 30, 2020, 10:36 a.m. UTC | #2
On 27/11/2020 14:40, Chris Wilson wrote:
> Quoting Matthew Auld (2020-11-27 12:07:13)
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> Current code uses jiffie time to do the accounting and then does:
>>
>>    diff = jiffies - start;
>>    msec = diff * 1000 / HZ;
>>    ...
>>    atomic_long_add(msec, &i915->time_swap_out_ms);
>>
>> If we assume jiffie can be as non-granular as 10ms and that the current
>> accounting records all evictions faster than one jiffie as infinite speed,
>> we can end up over-estimating the reported eviction throughput.
>>
>> Fix this by accumulating ktime_t and only dividing to more user friendly
>> granularity at presentation time (debugfs read).
>>
>> At the same time consolidate the code a bit and convert from multiple
>> atomics to single seqlock per stat.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> Cc: CQ Tang <cq.tang@intel.com>
>> Cc: Sudeep Dutt <sudeep.dutt@intel.com>
>> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> 
> A lot of effort to fix up patches after the fact, might as well make it
> a real PMU interface.

It did cross my mind and should be easy to add on top if deemed useful 
or interesting.

More importantly, it is okay with me to incorporate this patch into the 
earlier one(s) which first added statistics.

Regards,

Tvrtko
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_region.c b/drivers/gpu/drm/i915/gem/i915_gem_region.c
index 8ec59fbaa3e6..1a390e502d5a 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_region.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_region.c
@@ -9,14 +9,29 @@ 
 #include "i915_trace.h"
 #include "i915_gem_mman.h"
 
+static void
+__update_stat(struct i915_mm_swap_stat *stat,
+	      unsigned long pages,
+	      ktime_t start)
+{
+	if (stat) {
+		start = ktime_get() - start;
+
+		write_seqlock(&stat->lock);
+		stat->time = ktime_add(stat->time, start);
+		stat->pages += pages;
+		write_sequnlock(&stat->lock);
+	}
+}
+
 static int
 i915_gem_object_swapout_pages(struct drm_i915_gem_object *obj,
 			      struct sg_table *pages, unsigned int sizes)
 {
 	struct drm_i915_private *i915 = to_i915(obj->base.dev);
+	struct i915_mm_swap_stat *stat = NULL;
 	struct drm_i915_gem_object *dst, *src;
-	unsigned long start, diff, msec;
-	bool blt_completed = false;
+	ktime_t start = ktime_get();
 	int err = -EINVAL;
 
 	GEM_BUG_ON(obj->swapto);
@@ -26,7 +41,6 @@  i915_gem_object_swapout_pages(struct drm_i915_gem_object *obj,
 	GEM_BUG_ON(!i915->params.enable_eviction);
 
 	assert_object_held(obj);
-	start = jiffies;
 
 	/* create a shadow object on smem region */
 	dst = i915_gem_object_create_shmem(i915, obj->base.size);
@@ -58,10 +72,14 @@  i915_gem_object_swapout_pages(struct drm_i915_gem_object *obj,
 	if (i915->params.enable_eviction >= 2) {
 		err = i915_window_blt_copy(dst, src);
 		if (!err)
-			blt_completed = true;
+			stat = &i915->mm.blt_swap_stats.out;
 	}
-	if (err && i915->params.enable_eviction != 2)
+
+	if (err && i915->params.enable_eviction != 2) {
 		err = i915_gem_object_memcpy(dst, src);
+		if (!err)
+			stat = &i915->mm.memcpy_swap_stats.out;
+	}
 
 	__i915_gem_object_unpin_pages(src);
 	__i915_gem_object_unset_pages(src);
@@ -73,18 +91,7 @@  i915_gem_object_swapout_pages(struct drm_i915_gem_object *obj,
 	else
 		i915_gem_object_put(dst);
 
-	if (!err) {
-		diff = jiffies - start;
-		msec = diff * 1000 / HZ;
-		if (blt_completed) {
-			atomic_long_add(sizes, &i915->num_bytes_swapped_out);
-			atomic_long_add(msec, &i915->time_swap_out_ms);
-		} else {
-			atomic_long_add(sizes,
-					&i915->num_bytes_swapped_out_memcpy);
-			atomic_long_add(msec, &i915->time_swap_out_ms_memcpy);
-		}
-	}
+	__update_stat(stat, sizes >> PAGE_SHIFT, start);
 
 	return err;
 }
@@ -94,9 +101,9 @@  i915_gem_object_swapin_pages(struct drm_i915_gem_object *obj,
 			     struct sg_table *pages, unsigned int sizes)
 {
 	struct drm_i915_private *i915 = to_i915(obj->base.dev);
+	struct i915_mm_swap_stat *stat = NULL;
 	struct drm_i915_gem_object *dst, *src;
-	unsigned long start, diff, msec;
-	bool blt_completed = false;
+	ktime_t start = ktime_get();
 	int err = -EINVAL;
 
 	GEM_BUG_ON(!obj->swapto);
@@ -106,7 +113,6 @@  i915_gem_object_swapin_pages(struct drm_i915_gem_object *obj,
 	GEM_BUG_ON(!i915->params.enable_eviction);
 
 	assert_object_held(obj);
-	start = jiffies;
 
 	src = obj->swapto;
 
@@ -134,10 +140,14 @@  i915_gem_object_swapin_pages(struct drm_i915_gem_object *obj,
 	if (i915->params.enable_eviction >= 2) {
 		err = i915_window_blt_copy(dst, src);
 		if (!err)
-			blt_completed = true;
+			stat = &i915->mm.blt_swap_stats.in;
 	}
-	if (err && i915->params.enable_eviction != 2)
+
+	if (err && i915->params.enable_eviction != 2) {
 		err = i915_gem_object_memcpy(dst, src);
+		if (!err)
+			stat = &i915->mm.memcpy_swap_stats.in;
+	}
 
 	__i915_gem_object_unpin_pages(dst);
 	__i915_gem_object_unset_pages(dst);
@@ -149,18 +159,7 @@  i915_gem_object_swapin_pages(struct drm_i915_gem_object *obj,
 		i915_gem_object_put(src);
 	}
 
-	if (!err) {
-		diff = jiffies - start;
-		msec = diff * 1000 / HZ;
-		if (blt_completed) {
-			atomic_long_add(sizes, &i915->num_bytes_swapped_in);
-			atomic_long_add(msec, &i915->time_swap_in_ms);
-		} else {
-			atomic_long_add(sizes,
-					&i915->num_bytes_swapped_in_memcpy);
-			atomic_long_add(msec, &i915->time_swap_in_ms_memcpy);
-		}
-	}
+	__update_stat(stat, sizes >> PAGE_SHIFT, start);
 
 	return err;
 }
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 983030ac39e1..f06f900b598e 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -359,12 +359,46 @@  static void print_context_stats(struct seq_file *m,
 	print_file_stats(m, "[k]contexts", kstats);
 }
 
+static void
+evict_stat(struct seq_file *m,
+	   const char *name,
+	   const char *direction,
+	   struct i915_mm_swap_stat *stat)
+{
+	unsigned long pages;
+	unsigned int seq;
+	u64 time, rate;
+	ktime_t ktime;
+
+	do {
+		seq = read_seqbegin(&stat->lock);
+		pages = stat->pages;
+		ktime = stat->time;
+	} while (read_seqretry(&stat->lock, seq));
+
+	time = ktime_to_us(ktime);
+	rate = time ? div64_u64((u64)pages * PAGE_SIZE, time) : 0;
+	rate = div64_ul(rate * USEC_PER_SEC, 1024 * 1024);
+
+	seq_printf(m, "%s swap %s %lu MiB in %llums, %llu MiB/s.\n",
+		   name, direction, pages * PAGE_SIZE, ktime_to_ms(ktime),
+		   rate);
+}
+
+static void
+evict_stats(struct seq_file *m,
+	    const char *name,
+	    struct i915_mm_swap_stats *stats)
+{
+	evict_stat(m, name, "in", &stats->in);
+	evict_stat(m, name, "out", &stats->out);
+}
+
 static int i915_gem_object_info(struct seq_file *m, void *data)
 {
 	struct drm_i915_private *i915 = node_to_i915(m->private);
 	struct intel_memory_region *mr;
 	enum intel_region_id id;
-	u64 time, bytes, rate;
 
 	seq_printf(m, "%u shrinkable [%u free] objects, %llu bytes\n",
 		   i915->mm.shrink_count,
@@ -374,41 +408,8 @@  static int i915_gem_object_info(struct seq_file *m, void *data)
 		seq_printf(m, "%s: total:%pa, available:%pa bytes\n",
 			   mr->name, &mr->total, &mr->avail);
 
-	time = atomic_long_read(&i915->time_swap_out_ms);
-	bytes = atomic_long_read(&i915->num_bytes_swapped_out);
-	if (time)
-		rate = div64_u64(bytes * 1000, time * 1024 * 1024);
-	else
-		rate = 0;
-	seq_printf(m, "BLT: swapout %llu Bytes in %llu mSec(%llu MB/Sec)\n",
-		   bytes, time, rate);
-
-	time = atomic_long_read(&i915->time_swap_in_ms);
-	bytes = atomic_long_read(&i915->num_bytes_swapped_in);
-	if (time)
-		rate = div64_u64(bytes * 1000, time * 1024 * 1024);
-	else
-		rate = 0;
-	seq_printf(m, "BLT: swapin %llu Bytes in %llu mSec(%llu MB/Sec)\n",
-		   bytes, time, rate);
-
-	time = atomic_long_read(&i915->time_swap_out_ms_memcpy);
-	bytes = atomic_long_read(&i915->num_bytes_swapped_out_memcpy);
-	if (time)
-		rate = div64_u64(bytes * 1000, time * 1024 * 1024);
-	else
-		rate = 0;
-	seq_printf(m, "Memcpy: swapout %llu Bytes in %llu mSec(%llu MB/Sec)\n",
-		   bytes, time, rate);
-
-	time = atomic_long_read(&i915->time_swap_in_ms_memcpy);
-	bytes = atomic_long_read(&i915->num_bytes_swapped_in_memcpy);
-	if (time)
-		rate = div64_u64(bytes * 1000, time * 1024 * 1024);
-	else
-		rate = 0;
-	seq_printf(m, "Memcpy: swapin %llu Bytes in %llu mSec(%llu MB/Sec)\n",
-		   bytes, time, rate);
+	evict_stats(m, "Blitter", &i915->mm.blt_swap_stats);
+	evict_stats(m, "Memcpy", &i915->mm.memcpy_swap_stats);
 	seq_putc(m, '\n');
 
 	print_context_stats(m, i915);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 6f0ab363bdee..45511f2d8da0 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -49,6 +49,7 @@ 
 #include <linux/shmem_fs.h>
 #include <linux/stackdepot.h>
 #include <linux/xarray.h>
+#include <linux/seqlock.h>
 
 #include <drm/intel-gtt.h>
 #include <drm/drm_legacy.h> /* for struct drm_dma_handle */
@@ -548,6 +549,17 @@  struct intel_l3_parity {
 	int which_slice;
 };
 
+struct i915_mm_swap_stat {
+	seqlock_t lock;
+	unsigned long pages;
+	ktime_t time;
+};
+
+struct i915_mm_swap_stats {
+	struct i915_mm_swap_stat in;
+	struct i915_mm_swap_stat out;
+};
+
 struct i915_gem_mm {
 	/* Protects bound_list/unbound_list and #drm_i915_gem_object.mm.link */
 	spinlock_t obj_lock;
@@ -601,6 +613,9 @@  struct i915_gem_mm {
 
 	/* To protect above two set of vmas */
 	wait_queue_head_t window_queue;
+
+	struct i915_mm_swap_stats blt_swap_stats;
+	struct i915_mm_swap_stats memcpy_swap_stats;
 };
 
 #define I915_IDLE_ENGINES_TIMEOUT (200) /* in ms */
@@ -1220,16 +1235,6 @@  struct drm_i915_private {
 	 * NOTE: This is the dri1/ums dungeon, don't add stuff here. Your patch
 	 * will be rejected. Instead look for a better place.
 	 */
-
-	atomic_long_t num_bytes_swapped_out;
-	atomic_long_t num_bytes_swapped_in;
-	atomic_long_t time_swap_out_ms;
-	atomic_long_t time_swap_in_ms;
-
-	atomic_long_t num_bytes_swapped_out_memcpy;
-	atomic_long_t num_bytes_swapped_in_memcpy;
-	atomic_long_t time_swap_out_ms_memcpy;
-	atomic_long_t time_swap_in_ms_memcpy;
 };
 
 static inline struct drm_i915_private *to_i915(const struct drm_device *dev)
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 85cbdb8e2bb8..e94f3f689b30 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1151,6 +1151,11 @@  static void i915_gem_init__mm(struct drm_i915_private *i915)
 	INIT_LIST_HEAD(&i915->mm.purge_list);
 	INIT_LIST_HEAD(&i915->mm.shrink_list);
 
+	seqlock_init(&i915->mm.blt_swap_stats.in.lock);
+	seqlock_init(&i915->mm.blt_swap_stats.out.lock);
+	seqlock_init(&i915->mm.memcpy_swap_stats.in.lock);
+	seqlock_init(&i915->mm.memcpy_swap_stats.out.lock);
+
 	i915_gem_init__objects(i915);
 }