diff mbox series

[libdrm] amdgpu/test: Add illegal register and memory access test.

Message ID 1541010808-1293-1-git-send-email-andrey.grodzovsky@amd.com (mailing list archive)
State New, archived
Headers show
Series [libdrm] amdgpu/test: Add illegal register and memory access test. | expand

Commit Message

Andrey Grodzovsky Oct. 31, 2018, 6:33 p.m. UTC
An illegal access will cause a CP hang, followed by a job timeout and
recovery kicking in.
Also, disable the suite for all APU ASICs until the GPU
reset issues for them are resolved and GPU reset recovery
is enabled by default.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 tests/amdgpu/deadlock_tests.c | 118 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 117 insertions(+), 1 deletion(-)

Comments

Alex Deucher Oct. 31, 2018, 7:49 p.m. UTC | #1
On Wed, Oct 31, 2018 at 2:33 PM Andrey Grodzovsky
<andrey.grodzovsky@amd.com> wrote:
>
> Illegal access will cause CP hang followed by job timeout and
> recovery kicking in.
> Also, disable the suite for all APU ASICs until GPU
> reset issues for them will be resolved and GPU reset recovery
> will be enabled by default.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> ---
>  tests/amdgpu/deadlock_tests.c | 118 +++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 117 insertions(+), 1 deletion(-)
>
> diff --git a/tests/amdgpu/deadlock_tests.c b/tests/amdgpu/deadlock_tests.c
> index 292ec4e..c565f7a 100644
> --- a/tests/amdgpu/deadlock_tests.c
> +++ b/tests/amdgpu/deadlock_tests.c
> @@ -73,6 +73,29 @@
>                  * 1 - pfp
>                  */
>
> +#define        PACKET3_WRITE_DATA                              0x37
> +#define                WRITE_DATA_DST_SEL(x)                   ((x) << 8)
> +               /* 0 - register
> +                * 1 - memory (sync - via GRBM)
> +                * 2 - gl2
> +                * 3 - gds
> +                * 4 - reserved
> +                * 5 - memory (async - direct)
> +                */
> +#define                WR_ONE_ADDR                             (1 << 16)
> +#define                WR_CONFIRM                              (1 << 20)
> +#define                WRITE_DATA_CACHE_POLICY(x)              ((x) << 25)
> +               /* 0 - LRU
> +                * 1 - Stream
> +                */
> +#define                WRITE_DATA_ENGINE_SEL(x)                ((x) << 30)
> +               /* 0 - me
> +                * 1 - pfp
> +                * 2 - ce
> +                */
> +
> +#define mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR                                      0x54f
> +
>  static  amdgpu_device_handle device_handle;
>  static  uint32_t  major_version;
>  static  uint32_t  minor_version;
> @@ -85,6 +108,8 @@ int use_uc_mtype = 0;
>  static void amdgpu_deadlock_helper(unsigned ip_type);
>  static void amdgpu_deadlock_gfx(void);
>  static void amdgpu_deadlock_compute(void);
> +static void amdgpu_illegal_reg_access();
> +static void amdgpu_illegal_mem_access();
>
>  CU_BOOL suite_deadlock_tests_enable(void)
>  {
> @@ -94,7 +119,9 @@ CU_BOOL suite_deadlock_tests_enable(void)
>                                              &minor_version, &device_handle))
>                 return CU_FALSE;
>
> -       if (device_handle->info.family_id == AMDGPU_FAMILY_SI) {
> +       if (device_handle->info.family_id == AMDGPU_FAMILY_SI ||

Add AMDGPU_FAMILY_KV for CI based APUs as well.


> +                       device_handle->info.family_id == AMDGPU_FAMILY_CZ ||
> +                       device_handle->info.family_id == AMDGPU_FAMILY_RV) {
>                 printf("\n\nCurrently hangs the CP on this ASIC, deadlock suite disabled\n");
>                 enable = CU_FALSE;
>         }
> @@ -140,6 +167,8 @@ int suite_deadlock_tests_clean(void)
>  CU_TestInfo deadlock_tests[] = {
>         { "gfx ring block test",  amdgpu_deadlock_gfx },
>         { "compute ring block test",  amdgpu_deadlock_compute },
> +       { "illegal reg access test",  amdgpu_illegal_reg_access },
> +       { "illegal mem access test",  amdgpu_illegal_mem_access },

Won't this illegal mem access just result in a page fault?  Is the
idea to set vm_debug to force an MC halt to test reset?

Alex

>         CU_TEST_INFO_NULL,
>  };
>
> @@ -257,3 +286,90 @@ static void amdgpu_deadlock_helper(unsigned ip_type)
>         r = amdgpu_cs_ctx_free(context_handle);
>         CU_ASSERT_EQUAL(r, 0);
>  }
> +
> +static void bad_access_helper(int reg_access)
> +{
> +       amdgpu_context_handle context_handle;
> +       amdgpu_bo_handle ib_result_handle;
> +       void *ib_result_cpu;
> +       uint64_t ib_result_mc_address;
> +       struct amdgpu_cs_request ibs_request;
> +       struct amdgpu_cs_ib_info ib_info;
> +       struct amdgpu_cs_fence fence_status;
> +       uint32_t expired;
> +       int i, r;
> +       amdgpu_bo_list_handle bo_list;
> +       amdgpu_va_handle va_handle;
> +
> +       r = amdgpu_cs_ctx_create(device_handle, &context_handle);
> +       CU_ASSERT_EQUAL(r, 0);
> +
> +       r = amdgpu_bo_alloc_and_map_raw(device_handle, 4096, 4096,
> +                       AMDGPU_GEM_DOMAIN_GTT, 0, 0,
> +                                                       &ib_result_handle, &ib_result_cpu,
> +                                                       &ib_result_mc_address, &va_handle);
> +       CU_ASSERT_EQUAL(r, 0);
> +
> +       r = amdgpu_get_bo_list(device_handle, ib_result_handle, NULL,
> +                                  &bo_list);
> +       CU_ASSERT_EQUAL(r, 0);
> +
> +       ptr = ib_result_cpu;
> +       i = 0;
> +
> +       ptr[i++] = PACKET3(PACKET3_WRITE_DATA, 3);
> +       ptr[i++] = (reg_access ? WRITE_DATA_DST_SEL(0) : WRITE_DATA_DST_SEL(5))| WR_CONFIRM;
> +       ptr[i++] = reg_access ? mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR : 0xdeadbee0;
> +       ptr[i++] = 0;
> +       ptr[i++] = 0xdeadbeef;
> +
> +       for (; i < 16; ++i)
> +               ptr[i] = 0xffff1000;
> +
> +       memset(&ib_info, 0, sizeof(struct amdgpu_cs_ib_info));
> +       ib_info.ib_mc_address = ib_result_mc_address;
> +       ib_info.size = 16;
> +
> +       memset(&ibs_request, 0, sizeof(struct amdgpu_cs_request));
> +       ibs_request.ip_type = AMDGPU_HW_IP_GFX;
> +       ibs_request.ring = 0;
> +       ibs_request.number_of_ibs = 1;
> +       ibs_request.ibs = &ib_info;
> +       ibs_request.resources = bo_list;
> +       ibs_request.fence_info.handle = NULL;
> +
> +       r = amdgpu_cs_submit(context_handle, 0,&ibs_request, 1);
> +       CU_ASSERT_EQUAL((r == 0 || r == -ECANCELED), 1);
> +
> +
> +       memset(&fence_status, 0, sizeof(struct amdgpu_cs_fence));
> +       fence_status.context = context_handle;
> +       fence_status.ip_type = AMDGPU_HW_IP_GFX;
> +       fence_status.ip_instance = 0;
> +       fence_status.ring = 0;
> +       fence_status.fence = ibs_request.seq_no;
> +
> +       r = amdgpu_cs_query_fence_status(&fence_status,
> +                       AMDGPU_TIMEOUT_INFINITE,0, &expired);
> +       CU_ASSERT_EQUAL((r == 0 || r == -ECANCELED), 1);
> +
> +       r = amdgpu_bo_list_destroy(bo_list);
> +       CU_ASSERT_EQUAL(r, 0);
> +
> +       r = amdgpu_bo_unmap_and_free(ib_result_handle, va_handle,
> +                                        ib_result_mc_address, 4096);
> +       CU_ASSERT_EQUAL(r, 0);
> +
> +       r = amdgpu_cs_ctx_free(context_handle);
> +       CU_ASSERT_EQUAL(r, 0);
> +}
> +
> +static void amdgpu_illegal_reg_access()
> +{
> +       bad_access_helper(1);
> +}
> +
> +static void amdgpu_illegal_mem_access()
> +{
> +       bad_access_helper(0);
> +}
> --
> 2.7.4
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Andrey Grodzovsky Oct. 31, 2018, 8:04 p.m. UTC | #2
On 10/31/2018 03:49 PM, Alex Deucher wrote:
> On Wed, Oct 31, 2018 at 2:33 PM Andrey Grodzovsky
> <andrey.grodzovsky@amd.com> wrote:
>> Illegal access will cause CP hang followed by job timeout and
>> recovery kicking in.
>> Also, disable the suite for all APU ASICs until GPU
>> reset issues for them will be resolved and GPU reset recovery
>> will be enabled by default.
>>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>> ---
>>   tests/amdgpu/deadlock_tests.c | 118 +++++++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 117 insertions(+), 1 deletion(-)
>>
>> diff --git a/tests/amdgpu/deadlock_tests.c b/tests/amdgpu/deadlock_tests.c
>> index 292ec4e..c565f7a 100644
>> --- a/tests/amdgpu/deadlock_tests.c
>> +++ b/tests/amdgpu/deadlock_tests.c
>> @@ -73,6 +73,29 @@
>>                   * 1 - pfp
>>                   */
>>
>> +#define        PACKET3_WRITE_DATA                              0x37
>> +#define                WRITE_DATA_DST_SEL(x)                   ((x) << 8)
>> +               /* 0 - register
>> +                * 1 - memory (sync - via GRBM)
>> +                * 2 - gl2
>> +                * 3 - gds
>> +                * 4 - reserved
>> +                * 5 - memory (async - direct)
>> +                */
>> +#define                WR_ONE_ADDR                             (1 << 16)
>> +#define                WR_CONFIRM                              (1 << 20)
>> +#define                WRITE_DATA_CACHE_POLICY(x)              ((x) << 25)
>> +               /* 0 - LRU
>> +                * 1 - Stream
>> +                */
>> +#define                WRITE_DATA_ENGINE_SEL(x)                ((x) << 30)
>> +               /* 0 - me
>> +                * 1 - pfp
>> +                * 2 - ce
>> +                */
>> +
>> +#define mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR                                      0x54f
>> +
>>   static  amdgpu_device_handle device_handle;
>>   static  uint32_t  major_version;
>>   static  uint32_t  minor_version;
>> @@ -85,6 +108,8 @@ int use_uc_mtype = 0;
>>   static void amdgpu_deadlock_helper(unsigned ip_type);
>>   static void amdgpu_deadlock_gfx(void);
>>   static void amdgpu_deadlock_compute(void);
>> +static void amdgpu_illegal_reg_access();
>> +static void amdgpu_illegal_mem_access();
>>
>>   CU_BOOL suite_deadlock_tests_enable(void)
>>   {
>> @@ -94,7 +119,9 @@ CU_BOOL suite_deadlock_tests_enable(void)
>>                                               &minor_version, &device_handle))
>>                  return CU_FALSE;
>>
>> -       if (device_handle->info.family_id == AMDGPU_FAMILY_SI) {
>> +       if (device_handle->info.family_id == AMDGPU_FAMILY_SI ||
> Add AMDGPU_FAMILY_KV for CI based APUs as well.
>
>
>> +                       device_handle->info.family_id == AMDGPU_FAMILY_CZ ||
>> +                       device_handle->info.family_id == AMDGPU_FAMILY_RV) {
>>                  printf("\n\nCurrently hangs the CP on this ASIC, deadlock suite disabled\n");
>>                  enable = CU_FALSE;
>>          }
>> @@ -140,6 +167,8 @@ int suite_deadlock_tests_clean(void)
>>   CU_TestInfo deadlock_tests[] = {
>>          { "gfx ring block test",  amdgpu_deadlock_gfx },
>>          { "compute ring block test",  amdgpu_deadlock_compute },
>> +       { "illegal reg access test",  amdgpu_illegal_reg_access },
>> +       { "illegal mem access test",  amdgpu_illegal_mem_access },
> Won't this illegal mem access just result in a page fault?  Is the
> idea to set vm_debug to force an MC halt to test reset?
>
> Alex

For this one to hang the CP, amdgpu.vm_fault_stop=2 needs to be set.

Andrey

>
>>          CU_TEST_INFO_NULL,
>>   };
>>
>> @@ -257,3 +286,90 @@ static void amdgpu_deadlock_helper(unsigned ip_type)
>>          r = amdgpu_cs_ctx_free(context_handle);
>>          CU_ASSERT_EQUAL(r, 0);
>>   }
>> +
>> +static void bad_access_helper(int reg_access)
>> +{
>> +       amdgpu_context_handle context_handle;
>> +       amdgpu_bo_handle ib_result_handle;
>> +       void *ib_result_cpu;
>> +       uint64_t ib_result_mc_address;
>> +       struct amdgpu_cs_request ibs_request;
>> +       struct amdgpu_cs_ib_info ib_info;
>> +       struct amdgpu_cs_fence fence_status;
>> +       uint32_t expired;
>> +       int i, r;
>> +       amdgpu_bo_list_handle bo_list;
>> +       amdgpu_va_handle va_handle;
>> +
>> +       r = amdgpu_cs_ctx_create(device_handle, &context_handle);
>> +       CU_ASSERT_EQUAL(r, 0);
>> +
>> +       r = amdgpu_bo_alloc_and_map_raw(device_handle, 4096, 4096,
>> +                       AMDGPU_GEM_DOMAIN_GTT, 0, 0,
>> +                                                       &ib_result_handle, &ib_result_cpu,
>> +                                                       &ib_result_mc_address, &va_handle);
>> +       CU_ASSERT_EQUAL(r, 0);
>> +
>> +       r = amdgpu_get_bo_list(device_handle, ib_result_handle, NULL,
>> +                                  &bo_list);
>> +       CU_ASSERT_EQUAL(r, 0);
>> +
>> +       ptr = ib_result_cpu;
>> +       i = 0;
>> +
>> +       ptr[i++] = PACKET3(PACKET3_WRITE_DATA, 3);
>> +       ptr[i++] = (reg_access ? WRITE_DATA_DST_SEL(0) : WRITE_DATA_DST_SEL(5))| WR_CONFIRM;
>> +       ptr[i++] = reg_access ? mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR : 0xdeadbee0;
>> +       ptr[i++] = 0;
>> +       ptr[i++] = 0xdeadbeef;
>> +
>> +       for (; i < 16; ++i)
>> +               ptr[i] = 0xffff1000;
>> +
>> +       memset(&ib_info, 0, sizeof(struct amdgpu_cs_ib_info));
>> +       ib_info.ib_mc_address = ib_result_mc_address;
>> +       ib_info.size = 16;
>> +
>> +       memset(&ibs_request, 0, sizeof(struct amdgpu_cs_request));
>> +       ibs_request.ip_type = AMDGPU_HW_IP_GFX;
>> +       ibs_request.ring = 0;
>> +       ibs_request.number_of_ibs = 1;
>> +       ibs_request.ibs = &ib_info;
>> +       ibs_request.resources = bo_list;
>> +       ibs_request.fence_info.handle = NULL;
>> +
>> +       r = amdgpu_cs_submit(context_handle, 0,&ibs_request, 1);
>> +       CU_ASSERT_EQUAL((r == 0 || r == -ECANCELED), 1);
>> +
>> +
>> +       memset(&fence_status, 0, sizeof(struct amdgpu_cs_fence));
>> +       fence_status.context = context_handle;
>> +       fence_status.ip_type = AMDGPU_HW_IP_GFX;
>> +       fence_status.ip_instance = 0;
>> +       fence_status.ring = 0;
>> +       fence_status.fence = ibs_request.seq_no;
>> +
>> +       r = amdgpu_cs_query_fence_status(&fence_status,
>> +                       AMDGPU_TIMEOUT_INFINITE,0, &expired);
>> +       CU_ASSERT_EQUAL((r == 0 || r == -ECANCELED), 1);
>> +
>> +       r = amdgpu_bo_list_destroy(bo_list);
>> +       CU_ASSERT_EQUAL(r, 0);
>> +
>> +       r = amdgpu_bo_unmap_and_free(ib_result_handle, va_handle,
>> +                                        ib_result_mc_address, 4096);
>> +       CU_ASSERT_EQUAL(r, 0);
>> +
>> +       r = amdgpu_cs_ctx_free(context_handle);
>> +       CU_ASSERT_EQUAL(r, 0);
>> +}
>> +
>> +static void amdgpu_illegal_reg_access()
>> +{
>> +       bad_access_helper(1);
>> +}
>> +
>> +static void amdgpu_illegal_mem_access()
>> +{
>> +       bad_access_helper(0);
>> +}
>> --
>> 2.7.4
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Andrey Grodzovsky Oct. 31, 2018, 8:05 p.m. UTC | #3
On 10/31/2018 03:49 PM, Alex Deucher wrote:
> On Wed, Oct 31, 2018 at 2:33 PM Andrey Grodzovsky
> <andrey.grodzovsky@amd.com> wrote:
>> Illegal access will cause CP hang followed by job timeout and
>> recovery kicking in.
>> Also, disable the suite for all APU ASICs until GPU
>> reset issues for them will be resolved and GPU reset recovery
>> will be enabled by default.
>>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>> ---
>>   tests/amdgpu/deadlock_tests.c | 118 +++++++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 117 insertions(+), 1 deletion(-)
>>
>> diff --git a/tests/amdgpu/deadlock_tests.c b/tests/amdgpu/deadlock_tests.c
>> index 292ec4e..c565f7a 100644
>> --- a/tests/amdgpu/deadlock_tests.c
>> +++ b/tests/amdgpu/deadlock_tests.c
>> @@ -73,6 +73,29 @@
>>                   * 1 - pfp
>>                   */
>>
>> +#define        PACKET3_WRITE_DATA                              0x37
>> +#define                WRITE_DATA_DST_SEL(x)                   ((x) << 8)
>> +               /* 0 - register
>> +                * 1 - memory (sync - via GRBM)
>> +                * 2 - gl2
>> +                * 3 - gds
>> +                * 4 - reserved
>> +                * 5 - memory (async - direct)
>> +                */
>> +#define                WR_ONE_ADDR                             (1 << 16)
>> +#define                WR_CONFIRM                              (1 << 20)
>> +#define                WRITE_DATA_CACHE_POLICY(x)              ((x) << 25)
>> +               /* 0 - LRU
>> +                * 1 - Stream
>> +                */
>> +#define                WRITE_DATA_ENGINE_SEL(x)                ((x) << 30)
>> +               /* 0 - me
>> +                * 1 - pfp
>> +                * 2 - ce
>> +                */
>> +
>> +#define mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR                                      0x54f
>> +
>>   static  amdgpu_device_handle device_handle;
>>   static  uint32_t  major_version;
>>   static  uint32_t  minor_version;
>> @@ -85,6 +108,8 @@ int use_uc_mtype = 0;
>>   static void amdgpu_deadlock_helper(unsigned ip_type);
>>   static void amdgpu_deadlock_gfx(void);
>>   static void amdgpu_deadlock_compute(void);
>> +static void amdgpu_illegal_reg_access();
>> +static void amdgpu_illegal_mem_access();
>>
>>   CU_BOOL suite_deadlock_tests_enable(void)
>>   {
>> @@ -94,7 +119,9 @@ CU_BOOL suite_deadlock_tests_enable(void)
>>                                               &minor_version, &device_handle))
>>                  return CU_FALSE;
>>
>> -       if (device_handle->info.family_id == AMDGPU_FAMILY_SI) {
>> +       if (device_handle->info.family_id == AMDGPU_FAMILY_SI ||
> Add AMDGPU_FAMILY_KV for CI based APUs as well.
>
>
>> +                       device_handle->info.family_id == AMDGPU_FAMILY_CZ ||
>> +                       device_handle->info.family_id == AMDGPU_FAMILY_RV) {
>>                  printf("\n\nCurrently hangs the CP on this ASIC, deadlock suite disabled\n");
>>                  enable = CU_FALSE;
>>          }
>> @@ -140,6 +167,8 @@ int suite_deadlock_tests_clean(void)
>>   CU_TestInfo deadlock_tests[] = {
>>          { "gfx ring block test",  amdgpu_deadlock_gfx },
>>          { "compute ring block test",  amdgpu_deadlock_compute },
>> +       { "illegal reg access test",  amdgpu_illegal_reg_access },
>> +       { "illegal mem access test",  amdgpu_illegal_mem_access },
> Won't this illegal mem access just result in a page fault?  Is the
> idea to set vm_debug to force an MC halt to test reset?
>
> Alex

For this test to hang the CP amdgpu.vm_fault_stop=2 needs to be set.

Andrey

>
>>          CU_TEST_INFO_NULL,
>>   };
>>
>> @@ -257,3 +286,90 @@ static void amdgpu_deadlock_helper(unsigned ip_type)
>>          r = amdgpu_cs_ctx_free(context_handle);
>>          CU_ASSERT_EQUAL(r, 0);
>>   }
>> +
>> +static void bad_access_helper(int reg_access)
>> +{
>> +       amdgpu_context_handle context_handle;
>> +       amdgpu_bo_handle ib_result_handle;
>> +       void *ib_result_cpu;
>> +       uint64_t ib_result_mc_address;
>> +       struct amdgpu_cs_request ibs_request;
>> +       struct amdgpu_cs_ib_info ib_info;
>> +       struct amdgpu_cs_fence fence_status;
>> +       uint32_t expired;
>> +       int i, r;
>> +       amdgpu_bo_list_handle bo_list;
>> +       amdgpu_va_handle va_handle;
>> +
>> +       r = amdgpu_cs_ctx_create(device_handle, &context_handle);
>> +       CU_ASSERT_EQUAL(r, 0);
>> +
>> +       r = amdgpu_bo_alloc_and_map_raw(device_handle, 4096, 4096,
>> +                       AMDGPU_GEM_DOMAIN_GTT, 0, 0,
>> +                                                       &ib_result_handle, &ib_result_cpu,
>> +                                                       &ib_result_mc_address, &va_handle);
>> +       CU_ASSERT_EQUAL(r, 0);
>> +
>> +       r = amdgpu_get_bo_list(device_handle, ib_result_handle, NULL,
>> +                                  &bo_list);
>> +       CU_ASSERT_EQUAL(r, 0);
>> +
>> +       ptr = ib_result_cpu;
>> +       i = 0;
>> +
>> +       ptr[i++] = PACKET3(PACKET3_WRITE_DATA, 3);
>> +       ptr[i++] = (reg_access ? WRITE_DATA_DST_SEL(0) : WRITE_DATA_DST_SEL(5))| WR_CONFIRM;
>> +       ptr[i++] = reg_access ? mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR : 0xdeadbee0;
>> +       ptr[i++] = 0;
>> +       ptr[i++] = 0xdeadbeef;
>> +
>> +       for (; i < 16; ++i)
>> +               ptr[i] = 0xffff1000;
>> +
>> +       memset(&ib_info, 0, sizeof(struct amdgpu_cs_ib_info));
>> +       ib_info.ib_mc_address = ib_result_mc_address;
>> +       ib_info.size = 16;
>> +
>> +       memset(&ibs_request, 0, sizeof(struct amdgpu_cs_request));
>> +       ibs_request.ip_type = AMDGPU_HW_IP_GFX;
>> +       ibs_request.ring = 0;
>> +       ibs_request.number_of_ibs = 1;
>> +       ibs_request.ibs = &ib_info;
>> +       ibs_request.resources = bo_list;
>> +       ibs_request.fence_info.handle = NULL;
>> +
>> +       r = amdgpu_cs_submit(context_handle, 0,&ibs_request, 1);
>> +       CU_ASSERT_EQUAL((r == 0 || r == -ECANCELED), 1);
>> +
>> +
>> +       memset(&fence_status, 0, sizeof(struct amdgpu_cs_fence));
>> +       fence_status.context = context_handle;
>> +       fence_status.ip_type = AMDGPU_HW_IP_GFX;
>> +       fence_status.ip_instance = 0;
>> +       fence_status.ring = 0;
>> +       fence_status.fence = ibs_request.seq_no;
>> +
>> +       r = amdgpu_cs_query_fence_status(&fence_status,
>> +                       AMDGPU_TIMEOUT_INFINITE,0, &expired);
>> +       CU_ASSERT_EQUAL((r == 0 || r == -ECANCELED), 1);
>> +
>> +       r = amdgpu_bo_list_destroy(bo_list);
>> +       CU_ASSERT_EQUAL(r, 0);
>> +
>> +       r = amdgpu_bo_unmap_and_free(ib_result_handle, va_handle,
>> +                                        ib_result_mc_address, 4096);
>> +       CU_ASSERT_EQUAL(r, 0);
>> +
>> +       r = amdgpu_cs_ctx_free(context_handle);
>> +       CU_ASSERT_EQUAL(r, 0);
>> +}
>> +
>> +static void amdgpu_illegal_reg_access()
>> +{
>> +       bad_access_helper(1);
>> +}
>> +
>> +static void amdgpu_illegal_mem_access()
>> +{
>> +       bad_access_helper(0);
>> +}
>> --
>> 2.7.4
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Alex Deucher Nov. 1, 2018, 1:44 a.m. UTC | #4
On Wed, Oct 31, 2018 at 4:05 PM Grodzovsky, Andrey
<Andrey.Grodzovsky@amd.com> wrote:
>
>
>
> On 10/31/2018 03:49 PM, Alex Deucher wrote:
> > On Wed, Oct 31, 2018 at 2:33 PM Andrey Grodzovsky
> > <andrey.grodzovsky@amd.com> wrote:
> >> Illegal access will cause CP hang followed by job timeout and
> >> recovery kicking in.
> >> Also, disable the suite for all APU ASICs until GPU
> >> reset issues for them will be resolved and GPU reset recovery
> >> will be enabled by default.
> >>
> >> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> >> ---
> >>   tests/amdgpu/deadlock_tests.c | 118 +++++++++++++++++++++++++++++++++++++++++-
> >>   1 file changed, 117 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/tests/amdgpu/deadlock_tests.c b/tests/amdgpu/deadlock_tests.c
> >> index 292ec4e..c565f7a 100644
> >> --- a/tests/amdgpu/deadlock_tests.c
> >> +++ b/tests/amdgpu/deadlock_tests.c
> >> @@ -73,6 +73,29 @@
> >>                   * 1 - pfp
> >>                   */
> >>
> >> +#define        PACKET3_WRITE_DATA                              0x37
> >> +#define                WRITE_DATA_DST_SEL(x)                   ((x) << 8)
> >> +               /* 0 - register
> >> +                * 1 - memory (sync - via GRBM)
> >> +                * 2 - gl2
> >> +                * 3 - gds
> >> +                * 4 - reserved
> >> +                * 5 - memory (async - direct)
> >> +                */
> >> +#define                WR_ONE_ADDR                             (1 << 16)
> >> +#define                WR_CONFIRM                              (1 << 20)
> >> +#define                WRITE_DATA_CACHE_POLICY(x)              ((x) << 25)
> >> +               /* 0 - LRU
> >> +                * 1 - Stream
> >> +                */
> >> +#define                WRITE_DATA_ENGINE_SEL(x)                ((x) << 30)
> >> +               /* 0 - me
> >> +                * 1 - pfp
> >> +                * 2 - ce
> >> +                */
> >> +
> >> +#define mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR                                      0x54f
> >> +
> >>   static  amdgpu_device_handle device_handle;
> >>   static  uint32_t  major_version;
> >>   static  uint32_t  minor_version;
> >> @@ -85,6 +108,8 @@ int use_uc_mtype = 0;
> >>   static void amdgpu_deadlock_helper(unsigned ip_type);
> >>   static void amdgpu_deadlock_gfx(void);
> >>   static void amdgpu_deadlock_compute(void);
> >> +static void amdgpu_illegal_reg_access();
> >> +static void amdgpu_illegal_mem_access();
> >>
> >>   CU_BOOL suite_deadlock_tests_enable(void)
> >>   {
> >> @@ -94,7 +119,9 @@ CU_BOOL suite_deadlock_tests_enable(void)
> >>                                               &minor_version, &device_handle))
> >>                  return CU_FALSE;
> >>
> >> -       if (device_handle->info.family_id == AMDGPU_FAMILY_SI) {
> >> +       if (device_handle->info.family_id == AMDGPU_FAMILY_SI ||
> > Add AMDGPU_FAMILY_KV for CI based APUs as well.
> >
> >
> >> +                       device_handle->info.family_id == AMDGPU_FAMILY_CZ ||
> >> +                       device_handle->info.family_id == AMDGPU_FAMILY_RV) {
> >>                  printf("\n\nCurrently hangs the CP on this ASIC, deadlock suite disabled\n");
> >>                  enable = CU_FALSE;
> >>          }
> >> @@ -140,6 +167,8 @@ int suite_deadlock_tests_clean(void)
> >>   CU_TestInfo deadlock_tests[] = {
> >>          { "gfx ring block test",  amdgpu_deadlock_gfx },
> >>          { "compute ring block test",  amdgpu_deadlock_compute },
> >> +       { "illegal reg access test",  amdgpu_illegal_reg_access },
> >> +       { "illegal mem access test",  amdgpu_illegal_mem_access },
> > Won't this illegal mem access just result in a page fault?  Is the
> > idea to set vm_debug to force an MC halt to test reset?
> >
> > Alex
>
> For this test to hang the CP amdgpu.vm_fault_stop=2 needs to be set.

With the KV added above and a comment about vm_fault_stop added, this patch is:
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

>
> Andrey
>
> >
> >>          CU_TEST_INFO_NULL,
> >>   };
> >>
> >> @@ -257,3 +286,90 @@ static void amdgpu_deadlock_helper(unsigned ip_type)
> >>          r = amdgpu_cs_ctx_free(context_handle);
> >>          CU_ASSERT_EQUAL(r, 0);
> >>   }
> >> +
> >> +static void bad_access_helper(int reg_access)
> >> +{
> >> +       amdgpu_context_handle context_handle;
> >> +       amdgpu_bo_handle ib_result_handle;
> >> +       void *ib_result_cpu;
> >> +       uint64_t ib_result_mc_address;
> >> +       struct amdgpu_cs_request ibs_request;
> >> +       struct amdgpu_cs_ib_info ib_info;
> >> +       struct amdgpu_cs_fence fence_status;
> >> +       uint32_t expired;
> >> +       int i, r;
> >> +       amdgpu_bo_list_handle bo_list;
> >> +       amdgpu_va_handle va_handle;
> >> +
> >> +       r = amdgpu_cs_ctx_create(device_handle, &context_handle);
> >> +       CU_ASSERT_EQUAL(r, 0);
> >> +
> >> +       r = amdgpu_bo_alloc_and_map_raw(device_handle, 4096, 4096,
> >> +                       AMDGPU_GEM_DOMAIN_GTT, 0, 0,
> >> +                                                       &ib_result_handle, &ib_result_cpu,
> >> +                                                       &ib_result_mc_address, &va_handle);
> >> +       CU_ASSERT_EQUAL(r, 0);
> >> +
> >> +       r = amdgpu_get_bo_list(device_handle, ib_result_handle, NULL,
> >> +                                  &bo_list);
> >> +       CU_ASSERT_EQUAL(r, 0);
> >> +
> >> +       ptr = ib_result_cpu;
> >> +       i = 0;
> >> +
> >> +       ptr[i++] = PACKET3(PACKET3_WRITE_DATA, 3);
> >> +       ptr[i++] = (reg_access ? WRITE_DATA_DST_SEL(0) : WRITE_DATA_DST_SEL(5))| WR_CONFIRM;
> >> +       ptr[i++] = reg_access ? mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR : 0xdeadbee0;
> >> +       ptr[i++] = 0;
> >> +       ptr[i++] = 0xdeadbeef;
> >> +
> >> +       for (; i < 16; ++i)
> >> +               ptr[i] = 0xffff1000;
> >> +
> >> +       memset(&ib_info, 0, sizeof(struct amdgpu_cs_ib_info));
> >> +       ib_info.ib_mc_address = ib_result_mc_address;
> >> +       ib_info.size = 16;
> >> +
> >> +       memset(&ibs_request, 0, sizeof(struct amdgpu_cs_request));
> >> +       ibs_request.ip_type = AMDGPU_HW_IP_GFX;
> >> +       ibs_request.ring = 0;
> >> +       ibs_request.number_of_ibs = 1;
> >> +       ibs_request.ibs = &ib_info;
> >> +       ibs_request.resources = bo_list;
> >> +       ibs_request.fence_info.handle = NULL;
> >> +
> >> +       r = amdgpu_cs_submit(context_handle, 0,&ibs_request, 1);
> >> +       CU_ASSERT_EQUAL((r == 0 || r == -ECANCELED), 1);
> >> +
> >> +
> >> +       memset(&fence_status, 0, sizeof(struct amdgpu_cs_fence));
> >> +       fence_status.context = context_handle;
> >> +       fence_status.ip_type = AMDGPU_HW_IP_GFX;
> >> +       fence_status.ip_instance = 0;
> >> +       fence_status.ring = 0;
> >> +       fence_status.fence = ibs_request.seq_no;
> >> +
> >> +       r = amdgpu_cs_query_fence_status(&fence_status,
> >> +                       AMDGPU_TIMEOUT_INFINITE,0, &expired);
> >> +       CU_ASSERT_EQUAL((r == 0 || r == -ECANCELED), 1);
> >> +
> >> +       r = amdgpu_bo_list_destroy(bo_list);
> >> +       CU_ASSERT_EQUAL(r, 0);
> >> +
> >> +       r = amdgpu_bo_unmap_and_free(ib_result_handle, va_handle,
> >> +                                        ib_result_mc_address, 4096);
> >> +       CU_ASSERT_EQUAL(r, 0);
> >> +
> >> +       r = amdgpu_cs_ctx_free(context_handle);
> >> +       CU_ASSERT_EQUAL(r, 0);
> >> +}
> >> +
> >> +static void amdgpu_illegal_reg_access()
> >> +{
> >> +       bad_access_helper(1);
> >> +}
> >> +
> >> +static void amdgpu_illegal_mem_access()
> >> +{
> >> +       bad_access_helper(0);
> >> +}
> >> --
> >> 2.7.4
> >>
> >> _______________________________________________
> >> amd-gfx mailing list
> >> amd-gfx@lists.freedesktop.org
> >> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> > _______________________________________________
> > amd-gfx mailing list
> > amd-gfx@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
Christian König Nov. 2, 2018, 7:35 a.m. UTC | #5
Am 01.11.18 um 02:44 schrieb Alex Deucher:
> On Wed, Oct 31, 2018 at 4:05 PM Grodzovsky, Andrey
> <Andrey.Grodzovsky@amd.com> wrote:
>>
>>
>> On 10/31/2018 03:49 PM, Alex Deucher wrote:
>>> On Wed, Oct 31, 2018 at 2:33 PM Andrey Grodzovsky
>>> <andrey.grodzovsky@amd.com> wrote:
>>>> Illegal access will cause CP hang followed by job timeout and
>>>> recovery kicking in.
>>>> Also, disable the suite for all APU ASICs until GPU
>>>> reset issues for them will be resolved and GPU reset recovery
>>>> will be enabled by default.
>>>>
>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>> ---
>>>>    tests/amdgpu/deadlock_tests.c | 118 +++++++++++++++++++++++++++++++++++++++++-
>>>>    1 file changed, 117 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/tests/amdgpu/deadlock_tests.c b/tests/amdgpu/deadlock_tests.c
>>>> index 292ec4e..c565f7a 100644
>>>> --- a/tests/amdgpu/deadlock_tests.c
>>>> +++ b/tests/amdgpu/deadlock_tests.c
>>>> @@ -73,6 +73,29 @@
>>>>                    * 1 - pfp
>>>>                    */
>>>>
>>>> +#define        PACKET3_WRITE_DATA                              0x37
>>>> +#define                WRITE_DATA_DST_SEL(x)                   ((x) << 8)
>>>> +               /* 0 - register
>>>> +                * 1 - memory (sync - via GRBM)
>>>> +                * 2 - gl2
>>>> +                * 3 - gds
>>>> +                * 4 - reserved
>>>> +                * 5 - memory (async - direct)
>>>> +                */
>>>> +#define                WR_ONE_ADDR                             (1 << 16)
>>>> +#define                WR_CONFIRM                              (1 << 20)
>>>> +#define                WRITE_DATA_CACHE_POLICY(x)              ((x) << 25)
>>>> +               /* 0 - LRU
>>>> +                * 1 - Stream
>>>> +                */
>>>> +#define                WRITE_DATA_ENGINE_SEL(x)                ((x) << 30)
>>>> +               /* 0 - me
>>>> +                * 1 - pfp
>>>> +                * 2 - ce
>>>> +                */
>>>> +
>>>> +#define mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR                                      0x54f
>>>> +
>>>>    static  amdgpu_device_handle device_handle;
>>>>    static  uint32_t  major_version;
>>>>    static  uint32_t  minor_version;
>>>> @@ -85,6 +108,8 @@ int use_uc_mtype = 0;
>>>>    static void amdgpu_deadlock_helper(unsigned ip_type);
>>>>    static void amdgpu_deadlock_gfx(void);
>>>>    static void amdgpu_deadlock_compute(void);
>>>> +static void amdgpu_illegal_reg_access();
>>>> +static void amdgpu_illegal_mem_access();
>>>>
>>>>    CU_BOOL suite_deadlock_tests_enable(void)
>>>>    {
>>>> @@ -94,7 +119,9 @@ CU_BOOL suite_deadlock_tests_enable(void)
>>>>                                                &minor_version, &device_handle))
>>>>                   return CU_FALSE;
>>>>
>>>> -       if (device_handle->info.family_id == AMDGPU_FAMILY_SI) {
>>>> +       if (device_handle->info.family_id == AMDGPU_FAMILY_SI ||
>>> Add AMDGPU_FAMILY_KV for CI based APUs as well.
>>>
>>>
>>>> +                       device_handle->info.family_id == AMDGPU_FAMILY_CZ ||
>>>> +                       device_handle->info.family_id == AMDGPU_FAMILY_RV) {
>>>>                   printf("\n\nCurrently hangs the CP on this ASIC, deadlock suite disabled\n");
>>>>                   enable = CU_FALSE;
>>>>           }
>>>> @@ -140,6 +167,8 @@ int suite_deadlock_tests_clean(void)
>>>>    CU_TestInfo deadlock_tests[] = {
>>>>           { "gfx ring block test",  amdgpu_deadlock_gfx },
>>>>           { "compute ring block test",  amdgpu_deadlock_compute },
>>>> +       { "illegal reg access test",  amdgpu_illegal_reg_access },
>>>> +       { "illegal mem access test",  amdgpu_illegal_mem_access },
>>> Won't this illegal mem access just result in a page fault?  Is the
>>> idea to set vm_debug to force an MC halt to test reset?
>>>
>>> Alex
>> For this test to hang the CP amdgpu.vm_fault_stop=2 needs to be set.
> With the KV added above and a comment about vm_fault_stop added, this patch is:
> Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

Seconded, patch is Reviewed-by: Christian König 
<christian.koenig@amd.com> as well.

Christian.

>
>> Andrey
>>
>>>>           CU_TEST_INFO_NULL,
>>>>    };
>>>>
>>>> @@ -257,3 +286,90 @@ static void amdgpu_deadlock_helper(unsigned ip_type)
>>>>           r = amdgpu_cs_ctx_free(context_handle);
>>>>           CU_ASSERT_EQUAL(r, 0);
>>>>    }
>>>> +
>>>> +static void bad_access_helper(int reg_access)
>>>> +{
>>>> +       amdgpu_context_handle context_handle;
>>>> +       amdgpu_bo_handle ib_result_handle;
>>>> +       void *ib_result_cpu;
>>>> +       uint64_t ib_result_mc_address;
>>>> +       struct amdgpu_cs_request ibs_request;
>>>> +       struct amdgpu_cs_ib_info ib_info;
>>>> +       struct amdgpu_cs_fence fence_status;
>>>> +       uint32_t expired;
>>>> +       int i, r;
>>>> +       amdgpu_bo_list_handle bo_list;
>>>> +       amdgpu_va_handle va_handle;
>>>> +
>>>> +       r = amdgpu_cs_ctx_create(device_handle, &context_handle);
>>>> +       CU_ASSERT_EQUAL(r, 0);
>>>> +
>>>> +       r = amdgpu_bo_alloc_and_map_raw(device_handle, 4096, 4096,
>>>> +                       AMDGPU_GEM_DOMAIN_GTT, 0, 0,
>>>> +                                                       &ib_result_handle, &ib_result_cpu,
>>>> +                                                       &ib_result_mc_address, &va_handle);
>>>> +       CU_ASSERT_EQUAL(r, 0);
>>>> +
>>>> +       r = amdgpu_get_bo_list(device_handle, ib_result_handle, NULL,
>>>> +                                  &bo_list);
>>>> +       CU_ASSERT_EQUAL(r, 0);
>>>> +
>>>> +       ptr = ib_result_cpu;
>>>> +       i = 0;
>>>> +
>>>> +       ptr[i++] = PACKET3(PACKET3_WRITE_DATA, 3);
>>>> +       ptr[i++] = (reg_access ? WRITE_DATA_DST_SEL(0) : WRITE_DATA_DST_SEL(5))| WR_CONFIRM;
>>>> +       ptr[i++] = reg_access ? mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR : 0xdeadbee0;
>>>> +       ptr[i++] = 0;
>>>> +       ptr[i++] = 0xdeadbeef;
>>>> +
>>>> +       for (; i < 16; ++i)
>>>> +               ptr[i] = 0xffff1000;
>>>> +
>>>> +       memset(&ib_info, 0, sizeof(struct amdgpu_cs_ib_info));
>>>> +       ib_info.ib_mc_address = ib_result_mc_address;
>>>> +       ib_info.size = 16;
>>>> +
>>>> +       memset(&ibs_request, 0, sizeof(struct amdgpu_cs_request));
>>>> +       ibs_request.ip_type = AMDGPU_HW_IP_GFX;
>>>> +       ibs_request.ring = 0;
>>>> +       ibs_request.number_of_ibs = 1;
>>>> +       ibs_request.ibs = &ib_info;
>>>> +       ibs_request.resources = bo_list;
>>>> +       ibs_request.fence_info.handle = NULL;
>>>> +
>>>> +       r = amdgpu_cs_submit(context_handle, 0,&ibs_request, 1);
>>>> +       CU_ASSERT_EQUAL((r == 0 || r == -ECANCELED), 1);
>>>> +
>>>> +
>>>> +       memset(&fence_status, 0, sizeof(struct amdgpu_cs_fence));
>>>> +       fence_status.context = context_handle;
>>>> +       fence_status.ip_type = AMDGPU_HW_IP_GFX;
>>>> +       fence_status.ip_instance = 0;
>>>> +       fence_status.ring = 0;
>>>> +       fence_status.fence = ibs_request.seq_no;
>>>> +
>>>> +       r = amdgpu_cs_query_fence_status(&fence_status,
>>>> +                       AMDGPU_TIMEOUT_INFINITE,0, &expired);
>>>> +       CU_ASSERT_EQUAL((r == 0 || r == -ECANCELED), 1);
>>>> +
>>>> +       r = amdgpu_bo_list_destroy(bo_list);
>>>> +       CU_ASSERT_EQUAL(r, 0);
>>>> +
>>>> +       r = amdgpu_bo_unmap_and_free(ib_result_handle, va_handle,
>>>> +                                        ib_result_mc_address, 4096);
>>>> +       CU_ASSERT_EQUAL(r, 0);
>>>> +
>>>> +       r = amdgpu_cs_ctx_free(context_handle);
>>>> +       CU_ASSERT_EQUAL(r, 0);
>>>> +}
>>>> +
>>>> +static void amdgpu_illegal_reg_access()
>>>> +{
>>>> +       bad_access_helper(1);
>>>> +}
>>>> +
>>>> +static void amdgpu_illegal_mem_access()
>>>> +{
>>>> +       bad_access_helper(0);
>>>> +}
>>>> --
>>>> 2.7.4
>>>>
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Michel Dänzer Nov. 2, 2018, 2:24 p.m. UTC | #6
On 2018-10-31 7:33 p.m., Andrey Grodzovsky wrote:
> Illegal access will cause CP hang followed by job timeout and
> recovery kicking in.
> Also, disable the suite for all APU ASICs until GPU
> reset issues for them will be resolved and GPU reset recovery
> will be enabled by default.
> 
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> 
> [...]
>  
> @@ -94,7 +119,9 @@ CU_BOOL suite_deadlock_tests_enable(void)
>  					     &minor_version, &device_handle))
>  		return CU_FALSE;
>  
> -	if (device_handle->info.family_id == AMDGPU_FAMILY_SI) {
> +	if (device_handle->info.family_id == AMDGPU_FAMILY_SI ||
> +			device_handle->info.family_id == AMDGPU_FAMILY_CZ ||
> +			device_handle->info.family_id == AMDGPU_FAMILY_RV) {
>  		printf("\n\nCurrently hangs the CP on this ASIC, deadlock suite disabled\n");
>  		enable = CU_FALSE;
>  	}

Indentation is wrong here and in other places. The libdrm tree contains
configuration files for EditorConfig (https://editorconfig.org/); since
you're using Eclipse, https://github.com/ncjones/editorconfig-eclipse
should help.


I run amdgpu_test as part of my daily build/test script during lunch
break; when I came back today, I was greeted by a GFX hang of the
Bonaire in my development box due to this test. Please disable it for
all pre-GFX8 ASICs. Ideally, it should also check at runtime that GPU
recovery is actually enabled, as that still isn't the case by default
except with bleeding edge amdgpu kernel code.
Andrey Grodzovsky Nov. 2, 2018, 3:59 p.m. UTC | #7
On 11/02/2018 10:24 AM, Michel Dänzer wrote:
> On 2018-10-31 7:33 p.m., Andrey Grodzovsky wrote:
>> Illegal access will cause CP hang followed by job timeout and
>> recovery kicking in.
>> Also, disable the suite for all APU ASICs until GPU
>> reset issues for them will be resolved and GPU reset recovery
>> will be enabled by default.
>>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>
>> [...]
>>   
>> @@ -94,7 +119,9 @@ CU_BOOL suite_deadlock_tests_enable(void)
>>   					     &minor_version, &device_handle))
>>   		return CU_FALSE;
>>   
>> -	if (device_handle->info.family_id == AMDGPU_FAMILY_SI) {
>> +	if (device_handle->info.family_id == AMDGPU_FAMILY_SI ||
>> +			device_handle->info.family_id == AMDGPU_FAMILY_CZ ||
>> +			device_handle->info.family_id == AMDGPU_FAMILY_RV) {
>>   		printf("\n\nCurrently hangs the CP on this ASIC, deadlock suite disabled\n");
>>   		enable = CU_FALSE;
>>   	}
> Indentation is wrong here and in other places. The libdrm tree contains
> configuration files for EditorConfig (https://editorconfig.org/); since
> you're using Eclipse, https://github.com/ncjones/editorconfig-eclipse
> should help.

I installed the eclipse plugin.
>
>
> I run amdgpu_test as part of my daily build/test script during lunch
> break; when I came back today, I was greeted by a GFX hang of the
> Bonaire in my development box due to this test. Please disable it for
> all pre-GFX8 ASICs. Ideally, it should also check at runtime that GPU
> recovery is actually enabled, as that still isn't the case by default
> except with bleeding edge amdgpu kernel code.
Thanks for testing - I will send a fix.

Andrey
>
>
Alex Deucher Nov. 2, 2018, 6:12 p.m. UTC | #8
On Fri, Nov 2, 2018 at 11:59 AM Grodzovsky, Andrey
<Andrey.Grodzovsky@amd.com> wrote:
>
>
>
> On 11/02/2018 10:24 AM, Michel Dänzer wrote:
> > On 2018-10-31 7:33 p.m., Andrey Grodzovsky wrote:
> >> Illegal access will cause CP hang followed by job timeout and
> >> recovery kicking in.
> >> Also, disable the suite for all APU ASICs until GPU
> >> reset issues for them will be resolved and GPU reset recovery
> >> will be enabled by default.
> >>
> >> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> >>
> >> [...]
> >>
> >> @@ -94,7 +119,9 @@ CU_BOOL suite_deadlock_tests_enable(void)
> >>                                           &minor_version, &device_handle))
> >>              return CU_FALSE;
> >>
> >> -    if (device_handle->info.family_id == AMDGPU_FAMILY_SI) {
> >> +    if (device_handle->info.family_id == AMDGPU_FAMILY_SI ||
> >> +                    device_handle->info.family_id == AMDGPU_FAMILY_CZ ||
> >> +                    device_handle->info.family_id == AMDGPU_FAMILY_RV) {
> >>              printf("\n\nCurrently hangs the CP on this ASIC, deadlock suite disabled\n");
> >>              enable = CU_FALSE;
> >>      }
> > Indentation is wrong here and in other places. The libdrm tree contains
> > configuration files for EditorConfig (https://editorconfig.org/); since
> > you're using Eclipse, https://github.com/ncjones/editorconfig-eclipse
> > should help.
>
> I installed the eclipse plugin.
> >
> >
> > I run amdgpu_test as part of my daily build/test script during lunch
> > break; when I came back today, I was greeted by a GFX hang of the
> > Bonaire in my development box due to this test. Please disable it for
> > all pre-GFX8 ASICs. Ideally, it should also check at runtime that GPU
> > recovery is actually enabled, as that still isn't the case by default
> > except with bleeding edge amdgpu kernel code.
> Thanks for testing - I will send a fix.
>

Have you tried enabling reset on gfx7 dGPUs?  It uses pretty much the
same sequence as gfx8 so it might just work.

Alex

> Andrey
> >
> >
>
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel
Andrey Grodzovsky Nov. 2, 2018, 6:14 p.m. UTC | #9
On 11/02/2018 02:12 PM, Alex Deucher wrote:
> On Fri, Nov 2, 2018 at 11:59 AM Grodzovsky, Andrey
> <Andrey.Grodzovsky@amd.com> wrote:
>>
>>
>> On 11/02/2018 10:24 AM, Michel Dänzer wrote:
>>> On 2018-10-31 7:33 p.m., Andrey Grodzovsky wrote:
>>>> Illegal access will cause CP hang followed by job timeout and
>>>> recovery kicking in.
>>>> Also, disable the suite for all APU ASICs until GPU
>>>> reset issues for them will be resolved and GPU reset recovery
>>>> will be enabled by default.
>>>>
>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>
>>>> [...]
>>>>
>>>> @@ -94,7 +119,9 @@ CU_BOOL suite_deadlock_tests_enable(void)
>>>>                                            &minor_version, &device_handle))
>>>>               return CU_FALSE;
>>>>
>>>> -    if (device_handle->info.family_id == AMDGPU_FAMILY_SI) {
>>>> +    if (device_handle->info.family_id == AMDGPU_FAMILY_SI ||
>>>> +                    device_handle->info.family_id == AMDGPU_FAMILY_CZ ||
>>>> +                    device_handle->info.family_id == AMDGPU_FAMILY_RV) {
>>>>               printf("\n\nCurrently hangs the CP on this ASIC, deadlock suite disabled\n");
>>>>               enable = CU_FALSE;
>>>>       }
>>> Indentation is wrong here and in other places. The libdrm tree contains
>>> configuration files for EditorConfig (https://editorconfig.org/); since
>>> you're using Eclipse, https://github.com/ncjones/editorconfig-eclipse
>>> should help.
>> I installed the eclipse plugin.
>>>
>>> I run amdgpu_test as part of my daily build/test script during lunch
>>> break; when I came back today, I was greeted by a GFX hang of the
>>> Bonaire in my development box due to this test. Please disable it for
>>> all pre-GFX8 ASICs. Ideally, it should also check at runtime that GPU
>>> recovery is actually enabled, as that still isn't the case by default
>>> except with bleeding edge amdgpu kernel code.
>> Thanks for testing - I will send a fix.
>>
> Have you tried enabling reset on gfx7 dGPUs?  It uses pretty much the
> same sequence as gfx8 so it might just work.
>
> Alex

I haven't but I can give it a try.

Andrey

>
>> Andrey
>>>
>> _______________________________________________
>> dri-devel mailing list
>> dri-devel@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/dri-devel
Andrey Grodzovsky Nov. 2, 2018, 7:17 p.m. UTC | #10
There is currently a pplib messaging-related failure during GPU reset. I will put this issue on my TODO list for a later time, after handling higher-priority work, and will disable the deadlock test suite for all ASICs other than gfx8/9 dGPUs until then.

Andrey

On 11/02/2018 02:14 PM, Grodzovsky, Andrey wrote:
>> Have you tried enabling reset on gfx7 dGPUs?  It uses pretty much the
>> same sequence as gfx8 so it might just work.
>>
>> Alex
>
> I haven't but I can give it a try.
>
> Andrey
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
</head>
<body text="#000000" bgcolor="#FFFFFF">
<p>There is currently a pplib messaging-related failure during GPU reset. I will put this issue on my TODO list for a later time, after handling higher-priority work, and will disable the deadlock test suite for all ASICs other than gfx8/9 dGPUs until then.</p>
<p>Andrey<br>
</p>
<br>
<div class="moz-cite-prefix">On 11/02/2018 02:14 PM, Grodzovsky, Andrey wrote:<br>
</div>
<blockquote type="cite" cite="mid:40e6d884-f03b-58f3-88b0-d22560812f50@amd.com">
<blockquote type="cite" style="color: #000000;">
<pre wrap="">Have you tried enabling reset on gfx7 dGPUs?  It uses pretty much the
same sequence as gfx8 so it might just work.

Alex
</pre>
</blockquote>
<pre wrap="">I haven't but I can give it a try.

Andrey

</pre>
</blockquote>
<br>
</body>
</html>
diff mbox series

Patch

diff --git a/tests/amdgpu/deadlock_tests.c b/tests/amdgpu/deadlock_tests.c
index 292ec4e..c565f7a 100644
--- a/tests/amdgpu/deadlock_tests.c
+++ b/tests/amdgpu/deadlock_tests.c
@@ -73,6 +73,29 @@ 
 		 * 1 - pfp
 		 */
 
+#define	PACKET3_WRITE_DATA				0x37
+#define		WRITE_DATA_DST_SEL(x)                   ((x) << 8)
+		/* 0 - register
+		 * 1 - memory (sync - via GRBM)
+		 * 2 - gl2
+		 * 3 - gds
+		 * 4 - reserved
+		 * 5 - memory (async - direct)
+		 */
+#define		WR_ONE_ADDR                             (1 << 16)
+#define		WR_CONFIRM                              (1 << 20)
+#define		WRITE_DATA_CACHE_POLICY(x)              ((x) << 25)
+		/* 0 - LRU
+		 * 1 - Stream
+		 */
+#define		WRITE_DATA_ENGINE_SEL(x)                ((x) << 30)
+		/* 0 - me
+		 * 1 - pfp
+		 * 2 - ce
+		 */
+
+#define mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR                                      0x54f
+
 static  amdgpu_device_handle device_handle;
 static  uint32_t  major_version;
 static  uint32_t  minor_version;
@@ -85,6 +108,8 @@  int use_uc_mtype = 0;
 static void amdgpu_deadlock_helper(unsigned ip_type);
 static void amdgpu_deadlock_gfx(void);
 static void amdgpu_deadlock_compute(void);
+static void amdgpu_illegal_reg_access();
+static void amdgpu_illegal_mem_access();
 
 CU_BOOL suite_deadlock_tests_enable(void)
 {
@@ -94,7 +119,9 @@  CU_BOOL suite_deadlock_tests_enable(void)
 					     &minor_version, &device_handle))
 		return CU_FALSE;
 
-	if (device_handle->info.family_id == AMDGPU_FAMILY_SI) {
+	if (device_handle->info.family_id == AMDGPU_FAMILY_SI ||
+			device_handle->info.family_id == AMDGPU_FAMILY_CZ ||
+			device_handle->info.family_id == AMDGPU_FAMILY_RV) {
 		printf("\n\nCurrently hangs the CP on this ASIC, deadlock suite disabled\n");
 		enable = CU_FALSE;
 	}
@@ -140,6 +167,8 @@  int suite_deadlock_tests_clean(void)
 CU_TestInfo deadlock_tests[] = {
 	{ "gfx ring block test",  amdgpu_deadlock_gfx },
 	{ "compute ring block test",  amdgpu_deadlock_compute },
+	{ "illegal reg access test",  amdgpu_illegal_reg_access },
+	{ "illegal mem access test",  amdgpu_illegal_mem_access },
 	CU_TEST_INFO_NULL,
 };
 
@@ -257,3 +286,90 @@  static void amdgpu_deadlock_helper(unsigned ip_type)
 	r = amdgpu_cs_ctx_free(context_handle);
 	CU_ASSERT_EQUAL(r, 0);
 }
+
+static void bad_access_helper(int reg_access)
+{
+	amdgpu_context_handle context_handle;
+	amdgpu_bo_handle ib_result_handle;
+	void *ib_result_cpu;
+	uint64_t ib_result_mc_address;
+	struct amdgpu_cs_request ibs_request;
+	struct amdgpu_cs_ib_info ib_info;
+	struct amdgpu_cs_fence fence_status;
+	uint32_t expired;
+	int i, r;
+	amdgpu_bo_list_handle bo_list;
+	amdgpu_va_handle va_handle;
+
+	r = amdgpu_cs_ctx_create(device_handle, &context_handle);
+	CU_ASSERT_EQUAL(r, 0);
+
+	r = amdgpu_bo_alloc_and_map_raw(device_handle, 4096, 4096,
+			AMDGPU_GEM_DOMAIN_GTT, 0, 0,
+							&ib_result_handle, &ib_result_cpu,
+							&ib_result_mc_address, &va_handle);
+	CU_ASSERT_EQUAL(r, 0);
+
+	r = amdgpu_get_bo_list(device_handle, ib_result_handle, NULL,
+				   &bo_list);
+	CU_ASSERT_EQUAL(r, 0);
+
+	ptr = ib_result_cpu;
+	i = 0;
+
+	ptr[i++] = PACKET3(PACKET3_WRITE_DATA, 3);
+	ptr[i++] = (reg_access ? WRITE_DATA_DST_SEL(0) : WRITE_DATA_DST_SEL(5))| WR_CONFIRM;
+	ptr[i++] = reg_access ? mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR : 0xdeadbee0;
+	ptr[i++] = 0;
+	ptr[i++] = 0xdeadbeef;
+
+	for (; i < 16; ++i)
+		ptr[i] = 0xffff1000;
+
+	memset(&ib_info, 0, sizeof(struct amdgpu_cs_ib_info));
+	ib_info.ib_mc_address = ib_result_mc_address;
+	ib_info.size = 16;
+
+	memset(&ibs_request, 0, sizeof(struct amdgpu_cs_request));
+	ibs_request.ip_type = AMDGPU_HW_IP_GFX;
+	ibs_request.ring = 0;
+	ibs_request.number_of_ibs = 1;
+	ibs_request.ibs = &ib_info;
+	ibs_request.resources = bo_list;
+	ibs_request.fence_info.handle = NULL;
+
+	r = amdgpu_cs_submit(context_handle, 0,&ibs_request, 1);
+	CU_ASSERT_EQUAL((r == 0 || r == -ECANCELED), 1);
+
+
+	memset(&fence_status, 0, sizeof(struct amdgpu_cs_fence));
+	fence_status.context = context_handle;
+	fence_status.ip_type = AMDGPU_HW_IP_GFX;
+	fence_status.ip_instance = 0;
+	fence_status.ring = 0;
+	fence_status.fence = ibs_request.seq_no;
+
+	r = amdgpu_cs_query_fence_status(&fence_status,
+			AMDGPU_TIMEOUT_INFINITE,0, &expired);
+	CU_ASSERT_EQUAL((r == 0 || r == -ECANCELED), 1);
+
+	r = amdgpu_bo_list_destroy(bo_list);
+	CU_ASSERT_EQUAL(r, 0);
+
+	r = amdgpu_bo_unmap_and_free(ib_result_handle, va_handle,
+					 ib_result_mc_address, 4096);
+	CU_ASSERT_EQUAL(r, 0);
+
+	r = amdgpu_cs_ctx_free(context_handle);
+	CU_ASSERT_EQUAL(r, 0);
+}
+
+static void amdgpu_illegal_reg_access()
+{
+	bad_access_helper(1);
+}
+
+static void amdgpu_illegal_mem_access()
+{
+	bad_access_helper(0);
+}