diff mbox

[i-g-t] tests/perf_pmu: Test busyness reporting in face of GPU hangs

Message ID 20180219191251.29766-1-tvrtko.ursulin@linux.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Tvrtko Ursulin Feb. 19, 2018, 7:12 p.m. UTC
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Verify that the reported busyness is in line with what we would expect
from a batch which causes a hang and gets kicked out from the engine.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 tests/perf_pmu.c | 41 +++++++++++++++++++++++++++++++++++------
 1 file changed, 35 insertions(+), 6 deletions(-)

Comments

Chris Wilson Feb. 19, 2018, 7:21 p.m. UTC | #1
Quoting Tvrtko Ursulin (2018-02-19 19:12:51)
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> Verify that the reported busyness is in line with what would we expect
> from a batch which causes a hang and gets kicked out from the engine.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>  tests/perf_pmu.c | 41 +++++++++++++++++++++++++++++++++++------
>  1 file changed, 35 insertions(+), 6 deletions(-)
> 
> diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
> index 7fab73e22c2d..90b6ec4db32d 100644
> --- a/tests/perf_pmu.c
> +++ b/tests/perf_pmu.c
> @@ -168,6 +168,7 @@ static unsigned int e2ring(int gem_fd, const struct intel_execution_engine2 *e)
>  #define TEST_TRAILING_IDLE (4)
>  #define TEST_RUNTIME_PM (8)
>  #define FLAG_LONG (16)
> +#define FLAG_HANG (32)
>  
>  static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
>  {
> @@ -186,11 +187,15 @@ static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
>  static void
>  single(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
>  {
> +       const unsigned int hang_us = 10e6;
>         unsigned long slept;
>         igt_spin_t *spin;
> -       uint64_t val;
> +       uint64_t val[2], ts[2];
>         int fd;
>  
> +       if (flags & FLAG_HANG)
> +               gem_quiescent_gpu(gem_fd);
> +
>         fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
>  
>         if (flags & TEST_BUSY)
> @@ -198,17 +203,36 @@ single(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
>         else
>                 spin = NULL;
>  
> -       val = pmu_read_single(fd);
> -       slept = measured_usleep(batch_duration_ns / 1000);
> +       val[0] = __pmu_read_single(fd, &ts[0]);
> +       slept = measured_usleep(flags & FLAG_HANG ?
> +                               hang_us : batch_duration_ns / 1000);
>         if (flags & TEST_TRAILING_IDLE)
>                 end_spin(gem_fd, spin, flags);
> -       val = pmu_read_single(fd) - val;
> +       val[1] = pmu_read_single(fd);
>  
>         end_spin(gem_fd, spin, FLAG_SYNC);
>         igt_spin_batch_free(gem_fd, spin);
> -       close(fd);
>  
> -       assert_within_epsilon(val, flags & TEST_BUSY ? slept : 0.f, tolerance);
> +       if ((flags & TEST_BUSY) && (flags & FLAG_HANG)) {
> +               val[1] = __pmu_read_single(fd, &ts[1]);
> +               close(fd);
> +               igt_info("sampled with hang %.3fms / %.3fms\n",
> +                        (val[1] - val[0]) / 1e6, (ts[1] - ts[0]) / 1e6);
> +               /* Check that some busyness was reported. */
> +               igt_assert(val[1] - val[0] > 0);
> +               /*
> +                * But not more than some reasonable value before which we
> +                * expected the spinner to be kicked out.
> +                */

So 120s? And even that carries internal knowledge from across the ages.

I don't think this is a sensible test. What would be reasonable is
something like

	spinner()
	val[0] = pmu()
	sleep()
	igt_force_gpu_reset()
	val[1] = pmu();
	d_busy = val[1] - val[0]
	sleep()
	val[2] = pmu()
	d_idle = val[2] - val[1];

Then d_busy should be d_ts, and d_idle should be 0, i.e. the
igt_force_gpu_reset() is just an indirect igt_spin_batch_end().
-Chris
Tvrtko Ursulin Feb. 19, 2018, 7:26 p.m. UTC | #2
On 19/02/2018 19:21, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-02-19 19:12:51)
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> Verify that the reported busyness is in line with what would we expect
>> from a batch which causes a hang and gets kicked out from the engine.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> ---
>>   tests/perf_pmu.c | 41 +++++++++++++++++++++++++++++++++++------
>>   1 file changed, 35 insertions(+), 6 deletions(-)
>>
>> diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
>> index 7fab73e22c2d..90b6ec4db32d 100644
>> --- a/tests/perf_pmu.c
>> +++ b/tests/perf_pmu.c
>> @@ -168,6 +168,7 @@ static unsigned int e2ring(int gem_fd, const struct intel_execution_engine2 *e)
>>   #define TEST_TRAILING_IDLE (4)
>>   #define TEST_RUNTIME_PM (8)
>>   #define FLAG_LONG (16)
>> +#define FLAG_HANG (32)
>>   
>>   static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
>>   {
>> @@ -186,11 +187,15 @@ static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
>>   static void
>>   single(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
>>   {
>> +       const unsigned int hang_us = 10e6;
>>          unsigned long slept;
>>          igt_spin_t *spin;
>> -       uint64_t val;
>> +       uint64_t val[2], ts[2];
>>          int fd;
>>   
>> +       if (flags & FLAG_HANG)
>> +               gem_quiescent_gpu(gem_fd);
>> +
>>          fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
>>   
>>          if (flags & TEST_BUSY)
>> @@ -198,17 +203,36 @@ single(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
>>          else
>>                  spin = NULL;
>>   
>> -       val = pmu_read_single(fd);
>> -       slept = measured_usleep(batch_duration_ns / 1000);
>> +       val[0] = __pmu_read_single(fd, &ts[0]);
>> +       slept = measured_usleep(flags & FLAG_HANG ?
>> +                               hang_us : batch_duration_ns / 1000);
>>          if (flags & TEST_TRAILING_IDLE)
>>                  end_spin(gem_fd, spin, flags);
>> -       val = pmu_read_single(fd) - val;
>> +       val[1] = pmu_read_single(fd);
>>   
>>          end_spin(gem_fd, spin, FLAG_SYNC);
>>          igt_spin_batch_free(gem_fd, spin);
>> -       close(fd);
>>   
>> -       assert_within_epsilon(val, flags & TEST_BUSY ? slept : 0.f, tolerance);
>> +       if ((flags & TEST_BUSY) && (flags & FLAG_HANG)) {
>> +               val[1] = __pmu_read_single(fd, &ts[1]);
>> +               close(fd);
>> +               igt_info("sampled with hang %.3fms / %.3fms\n",
>> +                        (val[1] - val[0]) / 1e6, (ts[1] - ts[0]) / 1e6);
>> +               /* Check that some busyness was reported. */
>> +               igt_assert(val[1] - val[0] > 0);
>> +               /*
>> +                * But not more than some reasonable value before which we
>> +                * expected the spinner to be kicked out.
>> +                */
> 
> So 120s? And even that carries internal knowledge from across the ages.
> 
> I don't think this is a sensible test. What would be reasonable is
> something like
> 
> 	spinner()
> 	val[0] = pmu()
> 	sleep()
> 	igt_force_gpu_reset()
> 	val[1] = pmu();
> 	d_busy = val[1] - val[0]
> 	sleep()
> 	val[2] = pmu()
> 	d_idle = val[2] - val[1];
> 
> Then d_busy should be d_ts, and d_idle should be 0. i.e. the
> igt_force_gpu_reset() is just an indirect igt_spin_batch_end().

Yeah, I am not claiming the test is great. I threw it together quickly
when I suspected something was going wrong. I just want to get some results
overnight so I can despair tomorrow.

Regards,

Tvrtko
diff mbox

Patch

diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
index 7fab73e22c2d..90b6ec4db32d 100644
--- a/tests/perf_pmu.c
+++ b/tests/perf_pmu.c
@@ -168,6 +168,7 @@  static unsigned int e2ring(int gem_fd, const struct intel_execution_engine2 *e)
 #define TEST_TRAILING_IDLE (4)
 #define TEST_RUNTIME_PM (8)
 #define FLAG_LONG (16)
+#define FLAG_HANG (32)
 
 static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
 {
@@ -186,11 +187,15 @@  static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
 static void
 single(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 {
+	const unsigned int hang_us = 10e6;
 	unsigned long slept;
 	igt_spin_t *spin;
-	uint64_t val;
+	uint64_t val[2], ts[2];
 	int fd;
 
+	if (flags & FLAG_HANG)
+		gem_quiescent_gpu(gem_fd);
+
 	fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
 
 	if (flags & TEST_BUSY)
@@ -198,17 +203,36 @@  single(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags)
 	else
 		spin = NULL;
 
-	val = pmu_read_single(fd);
-	slept = measured_usleep(batch_duration_ns / 1000);
+	val[0] = __pmu_read_single(fd, &ts[0]);
+	slept = measured_usleep(flags & FLAG_HANG ?
+				hang_us : batch_duration_ns / 1000);
 	if (flags & TEST_TRAILING_IDLE)
 		end_spin(gem_fd, spin, flags);
-	val = pmu_read_single(fd) - val;
+	val[1] = pmu_read_single(fd);
 
 	end_spin(gem_fd, spin, FLAG_SYNC);
 	igt_spin_batch_free(gem_fd, spin);
-	close(fd);
 
-	assert_within_epsilon(val, flags & TEST_BUSY ? slept : 0.f, tolerance);
+	if ((flags & TEST_BUSY) && (flags & FLAG_HANG)) {
+		val[1] = __pmu_read_single(fd, &ts[1]);
+		close(fd);
+		igt_info("sampled with hang %.3fms / %.3fms\n",
+			 (val[1] - val[0]) / 1e6, (ts[1] - ts[0]) / 1e6);
+		/* Check that some busyness was reported. */
+		igt_assert(val[1] - val[0] > 0);
+		/*
+		 * But not more than some reasonable value before which we
+		 * expected the spinner to be kicked out.
+		 */
+		igt_assert((val[1] - val[0]) / 1e3 < (double)hang_us * 0.75);
+		__assert_within_epsilon(val[1] - val[0], hang_us * 1e3,
+					0.02f, 10.0f);
+	} else {
+		close(fd);
+		assert_within_epsilon(val[1] - val[0],
+				      flags & TEST_BUSY ?
+				      slept : 0.f, tolerance);
+	}
 	gem_quiescent_gpu(gem_fd);
 }
 
@@ -1695,6 +1719,11 @@  igt_main
 					      pct[i], e->name)
 					accuracy(fd, e, pct[i]);
 			}
+
+			igt_subtest_f("busy-hang-%s", e->name) {
+				single(fd, e, TEST_BUSY | FLAG_HANG);
+				single(fd, e, TEST_BUSY | FLAG_HANG);
+			}
 		}
 
 		/**