diff mbox series

[PULL,28/30] Hexagon HVX (tests/tcg/hexagon) hvx_misc test

Message ID 1635698589-31849-29-git-send-email-tsimpson@quicinc.com (mailing list archive)
State New, archived
Headers show
Series [PULL,01/30] Hexagon HVX (target/hexagon) README | expand

Commit Message

Taylor Simpson Oct. 31, 2021, 4:43 p.m. UTC
Tests for
    packet semantics
    vector loads (aligned and unaligned)
    vector stores (aligned and unaligned)
    vector masked stores
    vector new value store
    maximum HVX temps in a packet
    vector operations

Acked-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
---
 tests/tcg/hexagon/hvx_misc.c      | 469 ++++++++++++++++++++++++++++++++++++++
 tests/tcg/hexagon/Makefile.target |   2 +
 2 files changed, 471 insertions(+)
 create mode 100644 tests/tcg/hexagon/hvx_misc.c

Comments

Philippe Mathieu-Daudé Nov. 1, 2021, 10:33 a.m. UTC | #1
On 10/31/21 17:43, Taylor Simpson wrote:
> Tests for
>     packet semantics
>     vector loads (aligned and unaligned)
>     vector stores (aligned and unaligned)
>     vector masked stores
>     vector new value store
>     maximum HVX temps in a packet
>     vector operations
> 
> Acked-by: Richard Henderson <richard.henderson@linaro.org>
> Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
> ---
>  tests/tcg/hexagon/hvx_misc.c      | 469 ++++++++++++++++++++++++++++++++++++++
>  tests/tcg/hexagon/Makefile.target |   2 +
>  2 files changed, 471 insertions(+)
>  create mode 100644 tests/tcg/hexagon/hvx_misc.c

> +static void test_load_tmp(void)
> +{
> +    void *p0 = buffer0;
> +    void *p1 = buffer1;
> +    void *pout = output;
> +
> +    for (int i = 0; i < BUFSIZE; i++) {
> +        /*
> +         * Load into v12 as .tmp, then use it in the next packet
> +         * Should get the new value within the same packet and
> +         * the old value in the next packet
> +         */
> +        asm("v3 = vmem(%0 + #0)\n\t"
> +            "r1 = #1\n\t"
> +            "v12 = vsplat(r1)\n\t"
> +            "{\n\t"
> +            "    v12.tmp = vmem(%1 + #0)\n\t"
> +            "    v4.w = vadd(v12.w, v3.w)\n\t"
> +            "}\n\t"
> +            "v4.w = vadd(v4.w, v12.w)\n\t"
> +            "vmem(%2 + #0) = v4\n\t"
> +            : : "r"(p0), "r"(p1), "r"(pout)
> +            : "r1", "v12", "v3", "v4", "v6", "memory");
> +        p0 += sizeof(MMVector);
> +        p1 += sizeof(MMVector);
> +        pout += sizeof(MMVector);
> +
> +        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
> +            expect[i].w[j] = buffer0[i].w[j] + buffer1[i].w[j] + 1;
> +        }
> +    }
> +
> +    check_output_w(__LINE__, BUFSIZE);
> +}

This test fails guest-tests:

tests/tcg/hexagon/hvx_misc.c:111:21: error: unknown register name 'v12'
in asm
            : "r1", "v12", "v3", "v4", "v6", "memory");
                    ^
tests/tcg/hexagon/hvx_misc.c:362:9: note: expanded from macro 'TEST_VEC_OP2'
        VEC_OP2(ASM, EL, p0, p1, pout); \
        ^
Richard Henderson Nov. 1, 2021, 1:43 p.m. UTC | #2
On 11/1/21 6:33 AM, Philippe Mathieu-Daudé wrote:
> On 10/31/21 17:43, Taylor Simpson wrote:
>> Tests for
>>      packet semantics
>>      vector loads (aligned and unaligned)
>>      vector stores (aligned and unaligned)
>>      vector masked stores
>>      vector new value store
>>      maximum HVX temps in a packet
>>      vector operations
>>
>> Acked-by: Richard Henderson <richard.henderson@linaro.org>
>> Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
>> ---
>>   tests/tcg/hexagon/hvx_misc.c      | 469 ++++++++++++++++++++++++++++++++++++++
>>   tests/tcg/hexagon/Makefile.target |   2 +
>>   2 files changed, 471 insertions(+)
>>   create mode 100644 tests/tcg/hexagon/hvx_misc.c
> 
>> +static void test_load_tmp(void)
>> +{
>> +    void *p0 = buffer0;
>> +    void *p1 = buffer1;
>> +    void *pout = output;
>> +
>> +    for (int i = 0; i < BUFSIZE; i++) {
>> +        /*
>> +         * Load into v12 as .tmp, then use it in the next packet
>> +         * Should get the new value within the same packet and
>> +         * the old value in the next packet
>> +         */
>> +        asm("v3 = vmem(%0 + #0)\n\t"
>> +            "r1 = #1\n\t"
>> +            "v12 = vsplat(r1)\n\t"
>> +            "{\n\t"
>> +            "    v12.tmp = vmem(%1 + #0)\n\t"
>> +            "    v4.w = vadd(v12.w, v3.w)\n\t"
>> +            "}\n\t"
>> +            "v4.w = vadd(v4.w, v12.w)\n\t"
>> +            "vmem(%2 + #0) = v4\n\t"
>> +            : : "r"(p0), "r"(p1), "r"(pout)
>> +            : "r1", "v12", "v3", "v4", "v6", "memory");
>> +        p0 += sizeof(MMVector);
>> +        p1 += sizeof(MMVector);
>> +        pout += sizeof(MMVector);
>> +
>> +        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
>> +            expect[i].w[j] = buffer0[i].w[j] + buffer1[i].w[j] + 1;
>> +        }
>> +    }
>> +
>> +    check_output_w(__LINE__, BUFSIZE);
>> +}
> 
> This test fails guest-tests:
> 
> tests/tcg/hexagon/hvx_misc.c:111:21: error: unknown register name 'v12'
> in asm
>              : "r1", "v12", "v3", "v4", "v6", "memory");
>                      ^
> tests/tcg/hexagon/hvx_misc.c:362:9: note: expanded from macro 'TEST_VEC_OP2'
>          VEC_OP2(ASM, EL, p0, p1, pout); \
>          ^
> 

Yep, this PR depends on the toolchain update that's pending.
I'll ask Alex to hurry up the docker patch queue.


r~
Taylor Simpson Nov. 1, 2021, 2:09 p.m. UTC | #3
Plan-A should be to update the container with the new toolchain.

Plan-B would be to remove the vector registers from the inline asm.


Thanks!
Taylor


> -----Original Message-----
> From: Richard Henderson <richard.henderson@linaro.org>
> Sent: Monday, November 1, 2021 8:44 AM
> To: Philippe Mathieu-Daudé <f4bug@amsat.org>; Taylor Simpson
> <tsimpson@quicinc.com>; qemu-devel@nongnu.org; Alex Bennée
> <alex.bennee@linaro.org>
> Cc: peter.maydell@linaro.org
> Subject: Re: [PULL 28/30] Hexagon HVX (tests/tcg/hexagon) hvx_misc test
> 
> WARNING: This email originated from outside of Qualcomm. Please be wary
> of any links or attachments, and do not enable macros.
> 
> On 11/1/21 6:33 AM, Philippe Mathieu-Daudé wrote:
> > On 10/31/21 17:43, Taylor Simpson wrote:
> >> Tests for
> >>      packet semantics
> >>      vector loads (aligned and unaligned)
> >>      vector stores (aligned and unaligned)
> >>      vector masked stores
> >>      vector new value store
> >>      maximum HVX temps in a packet
> >>      vector operations
> >>
> >> Acked-by: Richard Henderson <richard.henderson@linaro.org>
> >> Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
> >> ---
> >>   tests/tcg/hexagon/hvx_misc.c      | 469
> ++++++++++++++++++++++++++++++++++++++
> >>   tests/tcg/hexagon/Makefile.target |   2 +
> >>   2 files changed, 471 insertions(+)
> >>   create mode 100644 tests/tcg/hexagon/hvx_misc.c
> >
> >> +static void test_load_tmp(void)
> >> +{
> >> +    void *p0 = buffer0;
> >> +    void *p1 = buffer1;
> >> +    void *pout = output;
> >> +
> >> +    for (int i = 0; i < BUFSIZE; i++) {
> >> +        /*
> >> +         * Load into v12 as .tmp, then use it in the next packet
> >> +         * Should get the new value within the same packet and
> >> +         * the old value in the next packet
> >> +         */
> >> +        asm("v3 = vmem(%0 + #0)\n\t"
> >> +            "r1 = #1\n\t"
> >> +            "v12 = vsplat(r1)\n\t"
> >> +            "{\n\t"
> >> +            "    v12.tmp = vmem(%1 + #0)\n\t"
> >> +            "    v4.w = vadd(v12.w, v3.w)\n\t"
> >> +            "}\n\t"
> >> +            "v4.w = vadd(v4.w, v12.w)\n\t"
> >> +            "vmem(%2 + #0) = v4\n\t"
> >> +            : : "r"(p0), "r"(p1), "r"(pout)
> >> +            : "r1", "v12", "v3", "v4", "v6", "memory");
> >> +        p0 += sizeof(MMVector);
> >> +        p1 += sizeof(MMVector);
> >> +        pout += sizeof(MMVector);
> >> +
> >> +        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
> >> +            expect[i].w[j] = buffer0[i].w[j] + buffer1[i].w[j] + 1;
> >> +        }
> >> +    }
> >> +
> >> +    check_output_w(__LINE__, BUFSIZE); }
> >
> > This test fails guest-tests:
> >
> > tests/tcg/hexagon/hvx_misc.c:111:21: error: unknown register name 'v12'
> > in asm
> >              : "r1", "v12", "v3", "v4", "v6", "memory");
> >                      ^
> > tests/tcg/hexagon/hvx_misc.c:362:9: note: expanded from macro
> 'TEST_VEC_OP2'
> >          VEC_OP2(ASM, EL, p0, p1, pout); \
> >          ^
> >
> 
> Yep, this PR depends on the toolchain update that's pending.
> I'll ask Alex to hurry up the docker patch queue.
> 
> 
> r~
Philippe Mathieu-Daudé Nov. 1, 2021, 2:17 p.m. UTC | #4
On 11/1/21 15:09, Taylor Simpson wrote:
> Plan-A should be to update the container with the new toolchain.

IIUC Richard is going with Plan-A: wait Alex queue get merged,
then retry merging this pull request.

> Plan-B would be to remove the vector registers from the inline asm.
> 
> 
> Thanks!
> Taylor
> 
> 
>> -----Original Message-----
>> From: Richard Henderson <richard.henderson@linaro.org>
>> Sent: Monday, November 1, 2021 8:44 AM
>> To: Philippe Mathieu-Daudé <f4bug@amsat.org>; Taylor Simpson
>> <tsimpson@quicinc.com>; qemu-devel@nongnu.org; Alex Bennée
>> <alex.bennee@linaro.org>
>> Cc: peter.maydell@linaro.org
>> Subject: Re: [PULL 28/30] Hexagon HVX (tests/tcg/hexagon) hvx_misc test
>>
>> WARNING: This email originated from outside of Qualcomm. Please be wary
>> of any links or attachments, and do not enable macros.
>>
>> On 11/1/21 6:33 AM, Philippe Mathieu-Daudé wrote:
>>> On 10/31/21 17:43, Taylor Simpson wrote:
>>>> Tests for
>>>>      packet semantics
>>>>      vector loads (aligned and unaligned)
>>>>      vector stores (aligned and unaligned)
>>>>      vector masked stores
>>>>      vector new value store
>>>>      maximum HVX temps in a packet
>>>>      vector operations
>>>>
>>>> Acked-by: Richard Henderson <richard.henderson@linaro.org>
>>>> Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
>>>> ---
>>>>   tests/tcg/hexagon/hvx_misc.c      | 469
>> ++++++++++++++++++++++++++++++++++++++
>>>>   tests/tcg/hexagon/Makefile.target |   2 +
>>>>   2 files changed, 471 insertions(+)
>>>>   create mode 100644 tests/tcg/hexagon/hvx_misc.c
>>>
>>>> +static void test_load_tmp(void)
>>>> +{
>>>> +    void *p0 = buffer0;
>>>> +    void *p1 = buffer1;
>>>> +    void *pout = output;
>>>> +
>>>> +    for (int i = 0; i < BUFSIZE; i++) {
>>>> +        /*
>>>> +         * Load into v12 as .tmp, then use it in the next packet
>>>> +         * Should get the new value within the same packet and
>>>> +         * the old value in the next packet
>>>> +         */
>>>> +        asm("v3 = vmem(%0 + #0)\n\t"
>>>> +            "r1 = #1\n\t"
>>>> +            "v12 = vsplat(r1)\n\t"
>>>> +            "{\n\t"
>>>> +            "    v12.tmp = vmem(%1 + #0)\n\t"
>>>> +            "    v4.w = vadd(v12.w, v3.w)\n\t"
>>>> +            "}\n\t"
>>>> +            "v4.w = vadd(v4.w, v12.w)\n\t"
>>>> +            "vmem(%2 + #0) = v4\n\t"
>>>> +            : : "r"(p0), "r"(p1), "r"(pout)
>>>> +            : "r1", "v12", "v3", "v4", "v6", "memory");
>>>> +        p0 += sizeof(MMVector);
>>>> +        p1 += sizeof(MMVector);
>>>> +        pout += sizeof(MMVector);
>>>> +
>>>> +        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
>>>> +            expect[i].w[j] = buffer0[i].w[j] + buffer1[i].w[j] + 1;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    check_output_w(__LINE__, BUFSIZE); }
>>>
>>> This test fails guest-tests:
>>>
>>> tests/tcg/hexagon/hvx_misc.c:111:21: error: unknown register name 'v12'
>>> in asm
>>>              : "r1", "v12", "v3", "v4", "v6", "memory");
>>>                      ^
>>> tests/tcg/hexagon/hvx_misc.c:362:9: note: expanded from macro
>> 'TEST_VEC_OP2'
>>>          VEC_OP2(ASM, EL, p0, p1, pout); \
>>>          ^
>>>
>>
>> Yep, this PR depends on the toolchain update that's pending.
>> I'll ask Alex to hurry up the docker patch queue.
>>
>>
>> r~
Richard Henderson Nov. 1, 2021, 3:02 p.m. UTC | #5
On 11/1/21 10:17 AM, Philippe Mathieu-Daudé wrote:
> On 11/1/21 15:09, Taylor Simpson wrote:
>> Plan-A should be to update the container with the new toolchain.
> 
> IIUC Richard is going with Plan-A: wait Alex queue get merged,
> then retry merging this pull request.

Correct.


r~
Taylor Simpson Nov. 2, 2021, 4:05 p.m. UTC | #6
> -----Original Message-----
> From: Richard Henderson <richard.henderson@linaro.org>
> Sent: Monday, November 1, 2021 10:03 AM
> To: Philippe Mathieu-Daudé <f4bug@amsat.org>; Taylor Simpson
> <tsimpson@quicinc.com>; qemu-devel@nongnu.org; Alex Bennée
> <alex.bennee@linaro.org>
> Cc: peter.maydell@linaro.org
> Subject: Re: [PULL 28/30] Hexagon HVX (tests/tcg/hexagon) hvx_misc test
> 
> On 11/1/21 10:17 AM, Philippe Mathieu-Daudé wrote:
> > On 11/1/21 15:09, Taylor Simpson wrote:
> >> Plan-A should be to update the container with the new toolchain.
> >
> > IIUC Richard is going with Plan-A: wait Alex queue get merged, then
> > retry merging this pull request.
> 
> Correct.

Agreed.  Just let me know if Alex isn't going to get the new toolchain merged in time, and I can go to plan B.

Thanks,
Taylor
Alex Bennée Nov. 2, 2021, 4:41 p.m. UTC | #7
Taylor Simpson <tsimpson@quicinc.com> writes:

>> -----Original Message-----
>> From: Richard Henderson <richard.henderson@linaro.org>
>> Sent: Monday, November 1, 2021 10:03 AM
>> To: Philippe Mathieu-Daudé <f4bug@amsat.org>; Taylor Simpson
>> <tsimpson@quicinc.com>; qemu-devel@nongnu.org; Alex Bennée
>> <alex.bennee@linaro.org>
>> Cc: peter.maydell@linaro.org
>> Subject: Re: [PULL 28/30] Hexagon HVX (tests/tcg/hexagon) hvx_misc test
>> 
>> On 11/1/21 10:17 AM, Philippe Mathieu-Daudé wrote:
>> > On 11/1/21 15:09, Taylor Simpson wrote:
>> >> Plan-A should be to update the container with the new toolchain.
>> >
>> > IIUC Richard is going with Plan-A: wait Alex queue get merged, then
>> > retry merging this pull request.
>> 
>> Correct.
>
> Agreed.  Just let me know if Alex isn't going to get the new toolchain
> merged in time, and I can go to plan B.

The PR has been a pain to get working but it should be in soon. Just
need to work out why the signals test has broken between master and my
PR:

 https://gitlab.com/qemu-project/qemu/-/jobs/1739288510#L1318

vs

 https://gitlab.com/stsquad/qemu/-/jobs/1740048034#L1316

Could it be a toolchain thing?

>
> Thanks,
> Taylor
Taylor Simpson Nov. 2, 2021, 4:53 p.m. UTC | #8
> -----Original Message-----
> From: Alex Bennée <alex.bennee@linaro.org>
> Sent: Tuesday, November 2, 2021 11:42 AM
> To: Taylor Simpson <tsimpson@quicinc.com>
> Cc: Richard Henderson <richard.henderson@linaro.org>; Philippe Mathieu-
> Daudé <f4bug@amsat.org>; qemu-devel@nongnu.org;
> peter.maydell@linaro.org
> Subject: Re: [PULL 28/30] Hexagon HVX (tests/tcg/hexagon) hvx_misc test
> 
> Taylor Simpson <tsimpson@quicinc.com> writes:
> 
> >> -----Original Message-----
> >> From: Richard Henderson <richard.henderson@linaro.org>
> >> Sent: Monday, November 1, 2021 10:03 AM
> >> To: Philippe Mathieu-Daudé <f4bug@amsat.org>; Taylor Simpson
> >> <tsimpson@quicinc.com>; qemu-devel@nongnu.org; Alex Bennée
> >> <alex.bennee@linaro.org>
> >> Cc: peter.maydell@linaro.org
> >> Subject: Re: [PULL 28/30] Hexagon HVX (tests/tcg/hexagon) hvx_misc
> >> test
> >>
> >> On 11/1/21 10:17 AM, Philippe Mathieu-Daudé wrote:
> >> > On 11/1/21 15:09, Taylor Simpson wrote:
> >> >> Plan-A should be to update the container with the new toolchain.
> >> >
> >> > IIUC Richard is going with Plan-A: wait Alex queue get merged, then
> >> > retry merging this pull request.
> >>
> >> Correct.
> >
> > Agreed.  Just let me know if Alex isn't going to get the new toolchain
> > merged in time, and I can go to plan B.
> 
> The PR has been a pain to get working but it should be in soon. Just need to
> work out why the signals test has broken between master and my
> PR:
> 
>  https://gitlab.com/qemu-project/qemu/-/jobs/1739288510#L1318
> 
> vs
> 
>  https://gitlab.com/stsquad/qemu/-/jobs/1740048034#L1316
> 
> Could it be a toolchain thing?

Not likely a toolchain problem.  If I can access both of the signals binaries, I can confirm.

Richard was doing some changes in qemu related to signals.  Are his changes available in both repos?


Taylor
Alex Bennée Nov. 3, 2021, 1:31 p.m. UTC | #9
Taylor Simpson <tsimpson@quicinc.com> writes:

>> -----Original Message-----
>> From: Alex Bennée <alex.bennee@linaro.org>
>> Sent: Tuesday, November 2, 2021 11:42 AM
>> To: Taylor Simpson <tsimpson@quicinc.com>
>> Cc: Richard Henderson <richard.henderson@linaro.org>; Philippe Mathieu-
>> Daudé <f4bug@amsat.org>; qemu-devel@nongnu.org;
>> peter.maydell@linaro.org
>> Subject: Re: [PULL 28/30] Hexagon HVX (tests/tcg/hexagon) hvx_misc test
>> 
>> Taylor Simpson <tsimpson@quicinc.com> writes:
>> 
>> >> -----Original Message-----
>> >> From: Richard Henderson <richard.henderson@linaro.org>
>> >> Sent: Monday, November 1, 2021 10:03 AM
>> >> To: Philippe Mathieu-Daudé <f4bug@amsat.org>; Taylor Simpson
>> >> <tsimpson@quicinc.com>; qemu-devel@nongnu.org; Alex Bennée
>> >> <alex.bennee@linaro.org>
>> >> Cc: peter.maydell@linaro.org
>> >> Subject: Re: [PULL 28/30] Hexagon HVX (tests/tcg/hexagon) hvx_misc
>> >> test
>> >>
>> >> On 11/1/21 10:17 AM, Philippe Mathieu-Daudé wrote:
>> >> > On 11/1/21 15:09, Taylor Simpson wrote:
>> >> >> Plan-A should be to update the container with the new toolchain.
>> >> >
>> >> > IIUC Richard is going with Plan-A: wait Alex queue get merged, then
>> >> > retry merging this pull request.
>> >>
>> >> Correct.
>> >
>> > Agreed.  Just let me know if Alex isn't going to get the new toolchain
>> > merged in time, and I can go to plan B.
>> 
>> The PR has been a pain to get working but it should be in soon. Just need to
>> work out why the signals test has broken between master and my
>> PR:
>> 
>>  https://gitlab.com/qemu-project/qemu/-/jobs/1739288510#L1318
>> 
>> vs
>> 
>>  https://gitlab.com/stsquad/qemu/-/jobs/1740048034#L1316
>> 
>> Could it be a toolchain thing?
>
> Not likely a toolchain problem.  If I can access both of the signals
> binaries, I can confirm.

Testing against two signals binaries I see a 4-7% failure rate against the
new binary versus the original pre-toolchain change one. That's not to
say the binary is broken - it could be a subtle change that exacerbated
our existing poor signals support. 

  https://transfer.sh/xA2ejk/signals.old (pre-toolchain change)
  https://transfer.sh/vSsn5s/signals

something in the CI ensures it fails much more reliably as U can't get
it to pass on a retry.

>
> Richard was doing some changes in qemu related to signals.  Are his
> changes available in both repos?

I've tested against master and rth/tgt-next (742f07628c0)

>
>
> Taylor
Richard Henderson Nov. 3, 2021, 3:22 p.m. UTC | #10
On 11/3/21 9:31 AM, Alex Bennée wrote:
>>> Could it be a toolchain thing?
>>
>> Not likely a toolchain problem.  If I can access both of the signals
>> binaries, I can confirm.
> 
> Testing against two signals binaries I see a 4-7% failure rate against the
> new binary versus the original pre-toolchain change one. That's not to
> say the binary is broken - it could be a subtle change that exacerbated
> our existing poor signals support.
> 
>    https://transfer.sh/xA2ejk/signals.old (pre-toolchain change)
>    https://transfer.sh/vSsn5s/signals
> 
> something in the CI ensures it fails much more reliably as U can't get
> it to pass on a retry.

I've had a closer look at the signals failure, and it really could be a toolchain problem.

The sigsegv is at

#0  0x00005555557a387f in stb_p (ptr=0x10000, v=0 '\000')
     at /home/richard.henderson/qemu/src/include/qemu/bswap.h:326
#1  0x00005555557a4bc5 in cpu_stb_mmu (env=0x555555e4eb50, addr=0,
     val=0 '\000', oi=0, ra=93824992935986) at ../src/accel/tcg/user-exec.c:359
#2  0x00005555557a5396 in cpu_stb_mmuidx_ra (env=0x555555e4eb50, addr=0,
     val=0, mmu_idx=0, ra=93824992935986)
     at ../src/accel/tcg/ldst_common.c.inc:83
#3  0x00005555557a57e6 in cpu_stb_data_ra (env=0x555555e4eb50, addr=0, val=0,
     ra=93824992935986) at ../src/accel/tcg/ldst_common.c.inc:183
#4  0x00005555555ff6f0 in helper_commit_store (env=0x555555e4eb50, slot_num=1)
     at ../src/target/hexagon/op_helper.c:151
#5  0x0000555555600032 in check_noshuf (env=0x555555e4eb50, slot=0)
     at ../src/target/hexagon/op_helper.c:407
#6  0x00005555556000e4 in mem_load4 (env=0x555555e4eb50, slot=0, vaddr=305000)
     at ../src/target/hexagon/op_helper.c:431
#7  0x00005555556063c0 in helper_L2_loadri_io (env=0x555555e4eb50, RsV=305000,
     siV=0, slot=0) at target/hexagon/helper_funcs_generated.c.inc:1013
#8  0x00007fffe8034f5a in code_gen_buffer ()

which is a store to address 0, which obviously should fail.

This comes from

IN: nontrivial_free
0x000224c4:  0x78004003 {       R3 = #0x0
0x000224c8:  0xf204d001         P1 = cmp.eq(R4,R16) }
0x000224cc:  0x5c00413e {       if (P1) jump:nt PC+124
0x000224d0:  0x38034000         if (P0) memb(R3+#0x0) = #0x0
0x000224d4:  0x9180c002         R2 = memw(R0+#0x0) }

which is part of the new toolchain's libc.  This is quite obviously a store to address 0 
if P0 is true.  Which looks pretty questionable.  Presumably P0 is not always set, which 
is why the program does not always crash.  But there doesn't appear to be anything wrong 
with the qemu translation.

I'm suspicious of the new compiler.  This looks like some sort of code scheduling bug, 
where R3=0 got moved ahead of the final use of the previous value in R3.

In the short term, I recommend dropping the hexagon toolchain update and that Taylor 
generate a new HVX pull request with the new tests present but disabled in the makefile.


r~
diff mbox series

Patch

diff --git a/tests/tcg/hexagon/hvx_misc.c b/tests/tcg/hexagon/hvx_misc.c
new file mode 100644
index 0000000..312bb98
--- /dev/null
+++ b/tests/tcg/hexagon/hvx_misc.c
@@ -0,0 +1,469 @@ 
+/*
+ *  Copyright(c) 2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+
+int err;
+
+static void __check(int line, int i, int j, uint64_t result, uint64_t expect)
+{
+    if (result != expect) {
+        printf("ERROR at line %d: [%d][%d] 0x%016llx != 0x%016llx\n",
+               line, i, j, result, expect);
+        err++;
+    }
+}
+
+#define check(RES, EXP) __check(__LINE__, RES, EXP)
+
+#define MAX_VEC_SIZE_BYTES         128
+
+typedef union {
+    uint64_t ud[MAX_VEC_SIZE_BYTES / 8];
+    int64_t   d[MAX_VEC_SIZE_BYTES / 8];
+    uint32_t uw[MAX_VEC_SIZE_BYTES / 4];
+    int32_t   w[MAX_VEC_SIZE_BYTES / 4];
+    uint16_t uh[MAX_VEC_SIZE_BYTES / 2];
+    int16_t   h[MAX_VEC_SIZE_BYTES / 2];
+    uint8_t  ub[MAX_VEC_SIZE_BYTES / 1];
+    int8_t    b[MAX_VEC_SIZE_BYTES / 1];
+} MMVector;
+
+#define BUFSIZE      16
+#define OUTSIZE      16
+#define MASKMOD      3
+
+MMVector buffer0[BUFSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
+MMVector buffer1[BUFSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
+MMVector mask[BUFSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
+MMVector output[OUTSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
+MMVector expect[OUTSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
+
+#define CHECK_OUTPUT_FUNC(FIELD, FIELDSZ) \
+static void check_output_##FIELD(int line, size_t num_vectors) \
+{ \
+    for (int i = 0; i < num_vectors; i++) { \
+        for (int j = 0; j < MAX_VEC_SIZE_BYTES / FIELDSZ; j++) { \
+            __check(line, i, j, output[i].FIELD[j], expect[i].FIELD[j]); \
+        } \
+    } \
+}
+
+CHECK_OUTPUT_FUNC(d,  8)
+CHECK_OUTPUT_FUNC(w,  4)
+CHECK_OUTPUT_FUNC(h,  2)
+CHECK_OUTPUT_FUNC(b,  1)
+
+static void init_buffers(void)
+{
+    int counter0 = 0;
+    int counter1 = 17;
+    for (int i = 0; i < BUFSIZE; i++) {
+        for (int j = 0; j < MAX_VEC_SIZE_BYTES; j++) {
+            buffer0[i].b[j] = counter0++;
+            buffer1[i].b[j] = counter1++;
+        }
+        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
+            mask[i].w[j] = (i + j % MASKMOD == 0) ? 0 : 1;
+        }
+    }
+}
+
+static void test_load_tmp(void)
+{
+    void *p0 = buffer0;
+    void *p1 = buffer1;
+    void *pout = output;
+
+    for (int i = 0; i < BUFSIZE; i++) {
+        /*
+         * Load into v12 as .tmp, then use it in the next packet
+         * Should get the new value within the same packet and
+         * the old value in the next packet
+         */
+        asm("v3 = vmem(%0 + #0)\n\t"
+            "r1 = #1\n\t"
+            "v12 = vsplat(r1)\n\t"
+            "{\n\t"
+            "    v12.tmp = vmem(%1 + #0)\n\t"
+            "    v4.w = vadd(v12.w, v3.w)\n\t"
+            "}\n\t"
+            "v4.w = vadd(v4.w, v12.w)\n\t"
+            "vmem(%2 + #0) = v4\n\t"
+            : : "r"(p0), "r"(p1), "r"(pout)
+            : "r1", "v12", "v3", "v4", "v6", "memory");
+        p0 += sizeof(MMVector);
+        p1 += sizeof(MMVector);
+        pout += sizeof(MMVector);
+
+        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
+            expect[i].w[j] = buffer0[i].w[j] + buffer1[i].w[j] + 1;
+        }
+    }
+
+    check_output_w(__LINE__, BUFSIZE);
+}
+
+static void test_load_cur(void)
+{
+    void *p0 = buffer0;
+    void *pout = output;
+
+    for (int i = 0; i < BUFSIZE; i++) {
+        asm("{\n\t"
+            "    v2.cur = vmem(%0 + #0)\n\t"
+            "    vmem(%1 + #0) = v2\n\t"
+            "}\n\t"
+            : : "r"(p0), "r"(pout) : "v2", "memory");
+        p0 += sizeof(MMVector);
+        pout += sizeof(MMVector);
+
+        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
+            expect[i].uw[j] = buffer0[i].uw[j];
+        }
+    }
+
+    check_output_w(__LINE__, BUFSIZE);
+}
+
+static void test_load_aligned(void)
+{
+    /* Aligned loads ignore the low bits of the address */
+    void *p0 = buffer0;
+    void *pout = output;
+    const size_t offset = 13;
+
+    p0 += offset;    /* Create an unaligned address */
+    asm("v2 = vmem(%0 + #0)\n\t"
+        "vmem(%1 + #0) = v2\n\t"
+        : : "r"(p0), "r"(pout) : "v2", "memory");
+
+    expect[0] = buffer0[0];
+
+    check_output_w(__LINE__, 1);
+}
+
+static void test_load_unaligned(void)
+{
+    void *p0 = buffer0;
+    void *pout = output;
+    const size_t offset = 12;
+
+    p0 += offset;    /* Create an unaligned address */
+    asm("v2 = vmemu(%0 + #0)\n\t"
+        "vmem(%1 + #0) = v2\n\t"
+        : : "r"(p0), "r"(pout) : "v2", "memory");
+
+    memcpy(expect, &buffer0[0].ub[offset], sizeof(MMVector));
+
+    check_output_w(__LINE__, 1);
+}
+
+static void test_store_aligned(void)
+{
+    /* Aligned stores ignore the low bits of the address */
+    void *p0 = buffer0;
+    void *pout = output;
+    const size_t offset = 13;
+
+    pout += offset;    /* Create an unaligned address */
+    asm("v2 = vmem(%0 + #0)\n\t"
+        "vmem(%1 + #0) = v2\n\t"
+        : : "r"(p0), "r"(pout) : "v2", "memory");
+
+    expect[0] = buffer0[0];
+
+    check_output_w(__LINE__, 1);
+}
+
+static void test_store_unaligned(void)
+{
+    void *p0 = buffer0;
+    void *pout = output;
+    const size_t offset = 12;
+
+    pout += offset;    /* Create an unaligned address */
+    asm("v2 = vmem(%0 + #0)\n\t"
+        "vmemu(%1 + #0) = v2\n\t"
+        : : "r"(p0), "r"(pout) : "v2", "memory");
+
+    memcpy(expect, buffer0, 2 * sizeof(MMVector));
+    memcpy(&expect[0].ub[offset], buffer0, sizeof(MMVector));
+
+    check_output_w(__LINE__, 2);
+}
+
+static void test_masked_store(bool invert)
+{
+    void *p0 = buffer0;
+    void *pmask = mask;
+    void *pout = output;
+
+    memset(expect, 0xff, sizeof(expect));
+    memset(output, 0xff, sizeof(expect));
+
+    for (int i = 0; i < BUFSIZE; i++) {
+        if (invert) {
+            asm("r4 = #0\n\t"
+                "v4 = vsplat(r4)\n\t"
+                "v5 = vmem(%0 + #0)\n\t"
+                "q0 = vcmp.eq(v4.w, v5.w)\n\t"
+                "v5 = vmem(%1)\n\t"
+                "if (!q0) vmem(%2) = v5\n\t"             /* Inverted test */
+                : : "r"(pmask), "r"(p0), "r"(pout)
+                : "r4", "v4", "v5", "q0", "memory");
+        } else {
+            asm("r4 = #0\n\t"
+                "v4 = vsplat(r4)\n\t"
+                "v5 = vmem(%0 + #0)\n\t"
+                "q0 = vcmp.eq(v4.w, v5.w)\n\t"
+                "v5 = vmem(%1)\n\t"
+                "if (q0) vmem(%2) = v5\n\t"             /* Non-inverted test */
+                : : "r"(pmask), "r"(p0), "r"(pout)
+                : "r4", "v4", "v5", "q0", "memory");
+        }
+        p0 += sizeof(MMVector);
+        pmask += sizeof(MMVector);
+        pout += sizeof(MMVector);
+
+        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
+            if (invert) {
+                if (i + j % MASKMOD != 0) {
+                    expect[i].w[j] = buffer0[i].w[j];
+                }
+            } else {
+                if (i + j % MASKMOD == 0) {
+                    expect[i].w[j] = buffer0[i].w[j];
+                }
+            }
+        }
+    }
+
+    check_output_w(__LINE__, BUFSIZE);
+}
+
+static void test_new_value_store(void)
+{
+    void *p0 = buffer0;
+    void *pout = output;
+
+    asm("{\n\t"
+        "    v2 = vmem(%0 + #0)\n\t"
+        "    vmem(%1 + #0) = v2.new\n\t"
+        "}\n\t"
+        : : "r"(p0), "r"(pout) : "v2", "memory");
+
+    expect[0] = buffer0[0];
+
+    check_output_w(__LINE__, 1);
+}
+
+static void test_max_temps()
+{
+    void *p0 = buffer0;
+    void *pout = output;
+
+    asm("v0 = vmem(%0 + #0)\n\t"
+        "v1 = vmem(%0 + #1)\n\t"
+        "v2 = vmem(%0 + #2)\n\t"
+        "v3 = vmem(%0 + #3)\n\t"
+        "v4 = vmem(%0 + #4)\n\t"
+        "{\n\t"
+        "    v1:0.w = vadd(v3:2.w, v1:0.w)\n\t"
+        "    v2.b = vshuffe(v3.b, v2.b)\n\t"
+        "    v3.w = vadd(v1.w, v4.w)\n\t"
+        "    v4.tmp = vmem(%0 + #5)\n\t"
+        "}\n\t"
+        "vmem(%1 + #0) = v0\n\t"
+        "vmem(%1 + #1) = v1\n\t"
+        "vmem(%1 + #2) = v2\n\t"
+        "vmem(%1 + #3) = v3\n\t"
+        "vmem(%1 + #4) = v4\n\t"
+        : : "r"(p0), "r"(pout) : "memory");
+
+        /* The first two vectors come from the vadd-pair instruction */
+        for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
+            expect[0].w[i] = buffer0[0].w[i] + buffer0[2].w[i];
+            expect[1].w[i] = buffer0[1].w[i] + buffer0[3].w[i];
+        }
+        /* The third vector comes from the vshuffe instruction */
+        for (int i = 0; i < MAX_VEC_SIZE_BYTES / 2; i++) {
+            expect[2].uh[i] = (buffer0[2].uh[i] & 0xff) |
+                              (buffer0[3].uh[i] & 0xff) << 8;
+        }
+        /* The fourth vector comes from the vadd-single instruction */
+        for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
+            expect[3].w[i] = buffer0[1].w[i] + buffer0[5].w[i];
+        }
+        /*
+         * The fifth vector comes from the load to v4
+         * make sure the .tmp is dropped
+         */
+        expect[4] = buffer0[4];
+
+        check_output_b(__LINE__, 5);
+}
+
+#define VEC_OP1(ASM, EL, IN, OUT) \
+    asm("v2 = vmem(%0 + #0)\n\t" \
+        "v2" #EL " = " #ASM "(v2" #EL ")\n\t" \
+        "vmem(%1 + #0) = v2\n\t" \
+        : : "r"(IN), "r"(OUT) : "v2", "memory")
+
+#define VEC_OP2(ASM, EL, IN0, IN1, OUT) \
+    asm("v2 = vmem(%0 + #0)\n\t" \
+        "v3 = vmem(%1 + #0)\n\t" \
+        "v2" #EL " = " #ASM "(v2" #EL ", v3" #EL ")\n\t" \
+        "vmem(%2 + #0) = v2\n\t" \
+        : : "r"(IN0), "r"(IN1), "r"(OUT) : "v2", "v3", "memory")
+
+#define TEST_VEC_OP1(NAME, ASM, EL, FIELD, FIELDSZ, OP) \
+static void test_##NAME(void) \
+{ \
+    void *pin = buffer0; \
+    void *pout = output; \
+    for (int i = 0; i < BUFSIZE; i++) { \
+        VEC_OP1(ASM, EL, pin, pout); \
+        pin += sizeof(MMVector); \
+        pout += sizeof(MMVector); \
+    } \
+    for (int i = 0; i < BUFSIZE; i++) { \
+        for (int j = 0; j < MAX_VEC_SIZE_BYTES / FIELDSZ; j++) { \
+            expect[i].FIELD[j] = OP buffer0[i].FIELD[j]; \
+        } \
+    } \
+    check_output_##FIELD(__LINE__, BUFSIZE); \
+}
+
+#define TEST_VEC_OP2(NAME, ASM, EL, FIELD, FIELDSZ, OP) \
+static void test_##NAME(void) \
+{ \
+    void *p0 = buffer0; \
+    void *p1 = buffer1; \
+    void *pout = output; \
+    for (int i = 0; i < BUFSIZE; i++) { \
+        VEC_OP2(ASM, EL, p0, p1, pout); \
+        p0 += sizeof(MMVector); \
+        p1 += sizeof(MMVector); \
+        pout += sizeof(MMVector); \
+    } \
+    for (int i = 0; i < BUFSIZE; i++) { \
+        for (int j = 0; j < MAX_VEC_SIZE_BYTES / FIELDSZ; j++) { \
+            expect[i].FIELD[j] = buffer0[i].FIELD[j] OP buffer1[i].FIELD[j]; \
+        } \
+    } \
+    check_output_##FIELD(__LINE__, BUFSIZE); \
+}
+
+#define THRESHOLD        31
+
+#define PRED_OP2(ASM, IN0, IN1, OUT, INV) \
+    asm("r4 = #%3\n\t" \
+        "v1.b = vsplat(r4)\n\t" \
+        "v2 = vmem(%0 + #0)\n\t" \
+        "q0 = vcmp.gt(v2.b, v1.b)\n\t" \
+        "v3 = vmem(%1 + #0)\n\t" \
+        "q1 = vcmp.gt(v3.b, v1.b)\n\t" \
+        "q2 = " #ASM "(q0, " INV "q1)\n\t" \
+        "r4 = #0xff\n\t" \
+        "v1.b = vsplat(r4)\n\t" \
+        "if (q2) vmem(%2 + #0) = v1\n\t" \
+        : : "r"(IN0), "r"(IN1), "r"(OUT), "i"(THRESHOLD) \
+        : "r4", "v1", "v2", "v3", "q0", "q1", "q2", "memory")
+
+#define TEST_PRED_OP2(NAME, ASM, OP, INV) \
+static void test_##NAME(bool invert) \
+{ \
+    void *p0 = buffer0; \
+    void *p1 = buffer1; \
+    void *pout = output; \
+    memset(output, 0, sizeof(expect)); \
+    for (int i = 0; i < BUFSIZE; i++) { \
+        PRED_OP2(ASM, p0, p1, pout, INV); \
+        p0 += sizeof(MMVector); \
+        p1 += sizeof(MMVector); \
+        pout += sizeof(MMVector); \
+    } \
+    for (int i = 0; i < BUFSIZE; i++) { \
+        for (int j = 0; j < MAX_VEC_SIZE_BYTES; j++) { \
+            bool p0 = (buffer0[i].b[j] > THRESHOLD); \
+            bool p1 = (buffer1[i].b[j] > THRESHOLD); \
+            if (invert) { \
+                expect[i].b[j] = (p0 OP !p1) ? 0xff : 0x00; \
+            } else { \
+                expect[i].b[j] = (p0 OP p1) ? 0xff : 0x00; \
+            } \
+        } \
+    } \
+    check_output_b(__LINE__, BUFSIZE); \
+}
+
+TEST_VEC_OP2(vadd_w, vadd, .w, w, 4, +)
+TEST_VEC_OP2(vadd_h, vadd, .h, h, 2, +)
+TEST_VEC_OP2(vadd_b, vadd, .b, b, 1, +)
+TEST_VEC_OP2(vsub_w, vsub, .w, w, 4, -)
+TEST_VEC_OP2(vsub_h, vsub, .h, h, 2, -)
+TEST_VEC_OP2(vsub_b, vsub, .b, b, 1, -)
+TEST_VEC_OP2(vxor, vxor, , d, 8, ^)
+TEST_VEC_OP2(vand, vand, , d, 8, &)
+TEST_VEC_OP2(vor, vor, , d, 8, |)
+TEST_VEC_OP1(vnot, vnot, , d, 8, ~)
+
+TEST_PRED_OP2(pred_or, or, |, "")
+TEST_PRED_OP2(pred_or_n, or, |, "!")
+TEST_PRED_OP2(pred_and, and, &, "")
+TEST_PRED_OP2(pred_and_n, and, &, "!")
+TEST_PRED_OP2(pred_xor, xor, ^, "")
+
+int main()
+{
+    init_buffers();
+
+    test_load_tmp();
+    test_load_cur();
+    test_load_aligned();
+    test_load_unaligned();
+    test_store_aligned();
+    test_store_unaligned();
+    test_masked_store(false);
+    test_masked_store(true);
+    test_new_value_store();
+    test_max_temps();
+
+    test_vadd_w();
+    test_vadd_h();
+    test_vadd_b();
+    test_vsub_w();
+    test_vsub_h();
+    test_vsub_b();
+    test_vxor();
+    test_vand();
+    test_vor();
+    test_vnot();
+
+    test_pred_or(false);
+    test_pred_or_n(true);
+    test_pred_and(false);
+    test_pred_and_n(true);
+    test_pred_xor(false);
+
+    puts(err ? "FAIL" : "PASS");
+    return err ? 1 : 0;
+}
diff --git a/tests/tcg/hexagon/Makefile.target b/tests/tcg/hexagon/Makefile.target
index c598ac0..59b5bb1 100644
--- a/tests/tcg/hexagon/Makefile.target
+++ b/tests/tcg/hexagon/Makefile.target
@@ -42,7 +42,9 @@  HEX_TESTS += vector_add_int
 HEX_TESTS += atomics
 HEX_TESTS += fpstuff
 HEX_TESTS += overflow
+HEX_TESTS += hvx_misc
 
 TESTS += $(HEX_TESTS)
 
 vector_add_int: CFLAGS += -mhvx -fvectorize
+hvx_misc: CFLAGS += -mhvx