diff mbox series

[1/2] drm/amdgpu: make duplicated EOP packet for GFX7/8 have real content

Message ID 20240617105846.1516006-2-uwu@icenowy.me (mailing list archive)
State New, archived
Headers show
Series Fixes of AMD GFX7/8 hang on Loongson platforms | expand

Commit Message

Icenowy Zheng June 17, 2024, 10:58 a.m. UTC
The duplication of EOP packets for GFX7/8, with the former one have
seq-1 written and the latter one have seq written, seems to confuse some
hardware platform (e.g. Loongson 7A series PCIe controllers).

Make the content of the duplicated EOP packet the same with the real
one, only masking any possible interrupts.

Fixes: bf26da927a1c ("drm/amdgpu: add cache flush workaround to gfx8 emit_fence")
Fixes: a2e73f56fa62 ("drm/amdgpu: Add support for CIK parts")
Signed-off-by: Icenowy Zheng <uwu@icenowy.me>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 12 +++++-------
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 12 ++++--------
 2 files changed, 9 insertions(+), 15 deletions(-)

Comments

Christian König June 17, 2024, 12:35 p.m. UTC | #1
Am 17.06.24 um 12:58 schrieb Icenowy Zheng:
> The duplication of EOP packets for GFX7/8, with the former one have
> seq-1 written and the latter one have seq written, seems to confuse some
> hardware platform (e.g. Loongson 7A series PCIe controllers).
>
> Make the content of the duplicated EOP packet the same with the real
> one, only masking any possible interrupts.

Well completely NAK to that, exactly that disables the workaround.

The CPU needs to see two different values written here.

Regards,
Christian.

>
> Fixes: bf26da927a1c ("drm/amdgpu: add cache flush workaround to gfx8 emit_fence")
> Fixes: a2e73f56fa62 ("drm/amdgpu: Add support for CIK parts")
> Signed-off-by: Icenowy Zheng <uwu@icenowy.me>
> ---
>   drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 12 +++++-------
>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 12 ++++--------
>   2 files changed, 9 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> index 541dbd70d8c75..778f27f1a34fe 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> @@ -2117,9 +2117,8 @@ static void gfx_v7_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64 addr,
>   {
>   	bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
>   	bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
> -	/* Workaround for cache flush problems. First send a dummy EOP
> -	 * event down the pipe with seq one below.
> -	 */
> +
> +	/* Workaround for cache flush problems, send EOP twice. */
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE_EOP, 4));
>   	amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
>   				 EOP_TC_ACTION_EN |
> @@ -2127,11 +2126,10 @@ static void gfx_v7_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64 addr,
>   				 EVENT_INDEX(5)));
>   	amdgpu_ring_write(ring, addr & 0xfffffffc);
>   	amdgpu_ring_write(ring, (upper_32_bits(addr) & 0xffff) |
> -				DATA_SEL(1) | INT_SEL(0));
> -	amdgpu_ring_write(ring, lower_32_bits(seq - 1));
> -	amdgpu_ring_write(ring, upper_32_bits(seq - 1));
> +				DATA_SEL(write64bit ? 2 : 1) | INT_SEL(0));
> +	amdgpu_ring_write(ring, lower_32_bits(seq));
> +	amdgpu_ring_write(ring, upper_32_bits(seq));
>   
> -	/* Then send the real EOP event down the pipe. */
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE_EOP, 4));
>   	amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
>   				 EOP_TC_ACTION_EN |
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index 2f0e72caee1af..39a7d60f1fd69 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -6153,9 +6153,7 @@ static void gfx_v8_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64 addr,
>   	bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
>   	bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
>   
> -	/* Workaround for cache flush problems. First send a dummy EOP
> -	 * event down the pipe with seq one below.
> -	 */
> +	/* Workaround for cache flush problems, send EOP twice. */
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE_EOP, 4));
>   	amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
>   				 EOP_TC_ACTION_EN |
> @@ -6164,12 +6162,10 @@ static void gfx_v8_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64 addr,
>   				 EVENT_INDEX(5)));
>   	amdgpu_ring_write(ring, addr & 0xfffffffc);
>   	amdgpu_ring_write(ring, (upper_32_bits(addr) & 0xffff) |
> -				DATA_SEL(1) | INT_SEL(0));
> -	amdgpu_ring_write(ring, lower_32_bits(seq - 1));
> -	amdgpu_ring_write(ring, upper_32_bits(seq - 1));
> +			  DATA_SEL(write64bit ? 2 : 1) | INT_SEL(0));
> +	amdgpu_ring_write(ring, lower_32_bits(seq));
> +	amdgpu_ring_write(ring, upper_32_bits(seq));
>   
> -	/* Then send the real EOP event down the pipe:
> -	 * EVENT_WRITE_EOP - flush caches, send int */
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE_EOP, 4));
>   	amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
>   				 EOP_TC_ACTION_EN |
Icenowy Zheng June 17, 2024, 1:03 p.m. UTC | #2
在 2024-06-17星期一的 14:35 +0200,Christian König写道:
> Am 17.06.24 um 12:58 schrieb Icenowy Zheng:
> > The duplication of EOP packets for GFX7/8, with the former one have
> > seq-1 written and the latter one have seq written, seems to confuse
> > some
> > hardware platform (e.g. Loongson 7A series PCIe controllers).
> > 
> > Make the content of the duplicated EOP packet the same with the
> > real
> > one, only masking any possible interrupts.
> 
> Well completely NAK to that, exactly that disables the workaround.
> 
> The CPU needs to see two different values written here.

Why do the CPU need to see two different values here? Only the second
packet will raise an interrupt before and after applying this patch,
and the first packet's result should just be overriden on ordinary
platforms. The CPU won't see the first one, until it's polling for the
address for a very short interval, so short that the GPU CP couldn't
execute 2 commands.

Or do you mean the GPU needs to see two different values being written,
or they will be merged into only one write request?

Please give out more information about this workaround, otherwise the
GPU hang problem on Loongson platforms will persist.

> 
> Regards,
> Christian.
> 
> > 
> > Fixes: bf26da927a1c ("drm/amdgpu: add cache flush workaround to
> > gfx8 emit_fence")
> > Fixes: a2e73f56fa62 ("drm/amdgpu: Add support for CIK parts")
> > Signed-off-by: Icenowy Zheng <uwu@icenowy.me>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 12 +++++-------
> >   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 12 ++++--------
> >   2 files changed, 9 insertions(+), 15 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> > index 541dbd70d8c75..778f27f1a34fe 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> > @@ -2117,9 +2117,8 @@ static void
> > gfx_v7_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64 addr,
> >   {
> >         bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
> >         bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
> > -       /* Workaround for cache flush problems. First send a dummy
> > EOP
> > -        * event down the pipe with seq one below.
> > -        */
> > +
> > +       /* Workaround for cache flush problems, send EOP twice. */
> >         amdgpu_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE_EOP,
> > 4));
> >         amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
> >                                  EOP_TC_ACTION_EN |
> > @@ -2127,11 +2126,10 @@ static void
> > gfx_v7_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64 addr,
> >                                  EVENT_INDEX(5)));
> >         amdgpu_ring_write(ring, addr & 0xfffffffc);
> >         amdgpu_ring_write(ring, (upper_32_bits(addr) & 0xffff) |
> > -                               DATA_SEL(1) | INT_SEL(0));
> > -       amdgpu_ring_write(ring, lower_32_bits(seq - 1));
> > -       amdgpu_ring_write(ring, upper_32_bits(seq - 1));
> > +                               DATA_SEL(write64bit ? 2 : 1) |
> > INT_SEL(0));
> > +       amdgpu_ring_write(ring, lower_32_bits(seq));
> > +       amdgpu_ring_write(ring, upper_32_bits(seq));
> >   
> > -       /* Then send the real EOP event down the pipe. */
> >         amdgpu_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE_EOP,
> > 4));
> >         amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
> >                                  EOP_TC_ACTION_EN |
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > index 2f0e72caee1af..39a7d60f1fd69 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > @@ -6153,9 +6153,7 @@ static void
> > gfx_v8_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64 addr,
> >         bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
> >         bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
> >   
> > -       /* Workaround for cache flush problems. First send a dummy
> > EOP
> > -        * event down the pipe with seq one below.
> > -        */
> > +       /* Workaround for cache flush problems, send EOP twice. */
> >         amdgpu_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE_EOP,
> > 4));
> >         amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
> >                                  EOP_TC_ACTION_EN |
> > @@ -6164,12 +6162,10 @@ static void
> > gfx_v8_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64 addr,
> >                                  EVENT_INDEX(5)));
> >         amdgpu_ring_write(ring, addr & 0xfffffffc);
> >         amdgpu_ring_write(ring, (upper_32_bits(addr) & 0xffff) |
> > -                               DATA_SEL(1) | INT_SEL(0));
> > -       amdgpu_ring_write(ring, lower_32_bits(seq - 1));
> > -       amdgpu_ring_write(ring, upper_32_bits(seq - 1));
> > +                         DATA_SEL(write64bit ? 2 : 1) |
> > INT_SEL(0));
> > +       amdgpu_ring_write(ring, lower_32_bits(seq));
> > +       amdgpu_ring_write(ring, upper_32_bits(seq));
> >   
> > -       /* Then send the real EOP event down the pipe:
> > -        * EVENT_WRITE_EOP - flush caches, send int */
> >         amdgpu_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE_EOP,
> > 4));
> >         amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
> >                                  EOP_TC_ACTION_EN |
>
Christian König June 17, 2024, 1:09 p.m. UTC | #3
Am 17.06.24 um 15:03 schrieb Icenowy Zheng:
> 在 2024-06-17星期一的 14:35 +0200,Christian König写道:
>> Am 17.06.24 um 12:58 schrieb Icenowy Zheng:
>>> The duplication of EOP packets for GFX7/8, with the former one have
>>> seq-1 written and the latter one have seq written, seems to confuse
>>> some
>>> hardware platform (e.g. Loongson 7A series PCIe controllers).
>>>
>>> Make the content of the duplicated EOP packet the same with the
>>> real
>>> one, only masking any possible interrupts.
>> Well completely NAK to that, exactly that disables the workaround.
>>
>> The CPU needs to see two different values written here.
> Why do the CPU need to see two different values here? Only the second
> packet will raise an interrupt before and after applying this patch,
> and the first packet's result should just be overriden on ordinary
> platforms. The CPU won't see the first one, until it's polling for the
> address for a very short interval, so short that the GPU CP couldn't
> execute 2 commands.

Yes exactly that. We need to make two writes, one with the old value 
(seq - 1) and a second with the real value (seq).

Otherwise it is possible that a polling CPU would see the sequence 
before the second EOP is issued with results in incoherent view of memory.

> Or do you mean the GPU needs to see two different values being written,
> or they will be merged into only one write request?
>
> Please give out more information about this workaround, otherwise the
> GPU hang problem on Loongson platforms will persist.

Well if Loongson can't handle two consecutive write operations to the 
same address with different values then you have a massive platform bug.

That is something which can happen all the time throughout the operation.

Regards,
Christian.

>
>> Regards,
>> Christian.
>>
>>> Fixes: bf26da927a1c ("drm/amdgpu: add cache flush workaround to
>>> gfx8 emit_fence")
>>> Fixes: a2e73f56fa62 ("drm/amdgpu: Add support for CIK parts")
>>> Signed-off-by: Icenowy Zheng <uwu@icenowy.me>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 12 +++++-------
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 12 ++++--------
>>>    2 files changed, 9 insertions(+), 15 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> index 541dbd70d8c75..778f27f1a34fe 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> @@ -2117,9 +2117,8 @@ static void
>>> gfx_v7_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64 addr,
>>>    {
>>>          bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
>>>          bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
>>> -       /* Workaround for cache flush problems. First send a dummy
>>> EOP
>>> -        * event down the pipe with seq one below.
>>> -        */
>>> +
>>> +       /* Workaround for cache flush problems, send EOP twice. */
>>>          amdgpu_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE_EOP,
>>> 4));
>>>          amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
>>>                                   EOP_TC_ACTION_EN |
>>> @@ -2127,11 +2126,10 @@ static void
>>> gfx_v7_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64 addr,
>>>                                   EVENT_INDEX(5)));
>>>          amdgpu_ring_write(ring, addr & 0xfffffffc);
>>>          amdgpu_ring_write(ring, (upper_32_bits(addr) & 0xffff) |
>>> -                               DATA_SEL(1) | INT_SEL(0));
>>> -       amdgpu_ring_write(ring, lower_32_bits(seq - 1));
>>> -       amdgpu_ring_write(ring, upper_32_bits(seq - 1));
>>> +                               DATA_SEL(write64bit ? 2 : 1) |
>>> INT_SEL(0));
>>> +       amdgpu_ring_write(ring, lower_32_bits(seq));
>>> +       amdgpu_ring_write(ring, upper_32_bits(seq));
>>>    
>>> -       /* Then send the real EOP event down the pipe. */
>>>          amdgpu_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE_EOP,
>>> 4));
>>>          amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
>>>                                   EOP_TC_ACTION_EN |
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> index 2f0e72caee1af..39a7d60f1fd69 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> @@ -6153,9 +6153,7 @@ static void
>>> gfx_v8_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64 addr,
>>>          bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
>>>          bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
>>>    
>>> -       /* Workaround for cache flush problems. First send a dummy
>>> EOP
>>> -        * event down the pipe with seq one below.
>>> -        */
>>> +       /* Workaround for cache flush problems, send EOP twice. */
>>>          amdgpu_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE_EOP,
>>> 4));
>>>          amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
>>>                                   EOP_TC_ACTION_EN |
>>> @@ -6164,12 +6162,10 @@ static void
>>> gfx_v8_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64 addr,
>>>                                   EVENT_INDEX(5)));
>>>          amdgpu_ring_write(ring, addr & 0xfffffffc);
>>>          amdgpu_ring_write(ring, (upper_32_bits(addr) & 0xffff) |
>>> -                               DATA_SEL(1) | INT_SEL(0));
>>> -       amdgpu_ring_write(ring, lower_32_bits(seq - 1));
>>> -       amdgpu_ring_write(ring, upper_32_bits(seq - 1));
>>> +                         DATA_SEL(write64bit ? 2 : 1) |
>>> INT_SEL(0));
>>> +       amdgpu_ring_write(ring, lower_32_bits(seq));
>>> +       amdgpu_ring_write(ring, upper_32_bits(seq));
>>>    
>>> -       /* Then send the real EOP event down the pipe:
>>> -        * EVENT_WRITE_EOP - flush caches, send int */
>>>          amdgpu_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE_EOP,
>>> 4));
>>>          amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
>>>                                   EOP_TC_ACTION_EN |
Icenowy Zheng June 17, 2024, 1:43 p.m. UTC | #4
在 2024-06-17星期一的 15:09 +0200,Christian König写道:
> Am 17.06.24 um 15:03 schrieb Icenowy Zheng:
> > 在 2024-06-17星期一的 14:35 +0200,Christian König写道:
> > > Am 17.06.24 um 12:58 schrieb Icenowy Zheng:
> > > > The duplication of EOP packets for GFX7/8, with the former one
> > > > have
> > > > seq-1 written and the latter one have seq written, seems to
> > > > confuse
> > > > some
> > > > hardware platform (e.g. Loongson 7A series PCIe controllers).
> > > > 
> > > > Make the content of the duplicated EOP packet the same with the
> > > > real
> > > > one, only masking any possible interrupts.
> > > Well completely NAK to that, exactly that disables the
> > > workaround.
> > > 
> > > The CPU needs to see two different values written here.
> > Why do the CPU need to see two different values here? Only the
> > second
> > packet will raise an interrupt before and after applying this
> > patch,
> > and the first packet's result should just be overriden on ordinary
> > platforms. The CPU won't see the first one, until it's polling for
> > the
> > address for a very short interval, so short that the GPU CP
> > couldn't
> > execute 2 commands.
> 
> Yes exactly that. We need to make two writes, one with the old value 
> (seq - 1) and a second with the real value (seq).
> 
> Otherwise it is possible that a polling CPU would see the sequence 
> before the second EOP is issued with results in incoherent view of
> memory.

In this case shouldn't we write seq-1 before any work, and then write
seq after work, like what is done in Mesa?

As what I see, Mesa uses another command buffer to emit a
EVENT_WRITE_EOP writing 0, and commit this command buffer before the
real command buffer.

> 
> > Or do you mean the GPU needs to see two different values being
> > written,
> > or they will be merged into only one write request?
> > 
> > Please give out more information about this workaround, otherwise
> > the
> > GPU hang problem on Loongson platforms will persist.
> 
> Well if Loongson can't handle two consecutive write operations to the
> same address with different values then you have a massive platform
> bug.

I think the issue is triggered when two consecutive write operations
and one IRQ is present, which is exactly the case of this function.

> 
> That is something which can happen all the time throughout the
> operation.
> 
> Regards,
> Christian.
> 
> > 
> > > Regards,
> > > Christian.
> > > 
> > > > Fixes: bf26da927a1c ("drm/amdgpu: add cache flush workaround to
> > > > gfx8 emit_fence")
> > > > Fixes: a2e73f56fa62 ("drm/amdgpu: Add support for CIK parts")
> > > > Signed-off-by: Icenowy Zheng <uwu@icenowy.me>
> > > > ---
> > > >    drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 12 +++++-------
> > > >    drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 12 ++++--------
> > > >    2 files changed, 9 insertions(+), 15 deletions(-)
> > > > 
> > > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> > > > b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> > > > index 541dbd70d8c75..778f27f1a34fe 100644
> > > > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> > > > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> > > > @@ -2117,9 +2117,8 @@ static void
> > > > gfx_v7_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64
> > > > addr,
> > > >    {
> > > >          bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
> > > >          bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
> > > > -       /* Workaround for cache flush problems. First send a
> > > > dummy
> > > > EOP
> > > > -        * event down the pipe with seq one below.
> > > > -        */
> > > > +
> > > > +       /* Workaround for cache flush problems, send EOP twice.
> > > > */
> > > >          amdgpu_ring_write(ring,
> > > > PACKET3(PACKET3_EVENT_WRITE_EOP,
> > > > 4));
> > > >          amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
> > > >                                   EOP_TC_ACTION_EN |
> > > > @@ -2127,11 +2126,10 @@ static void
> > > > gfx_v7_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64
> > > > addr,
> > > >                                   EVENT_INDEX(5)));
> > > >          amdgpu_ring_write(ring, addr & 0xfffffffc);
> > > >          amdgpu_ring_write(ring, (upper_32_bits(addr) & 0xffff)
> > > > |
> > > > -                               DATA_SEL(1) | INT_SEL(0));
> > > > -       amdgpu_ring_write(ring, lower_32_bits(seq - 1));
> > > > -       amdgpu_ring_write(ring, upper_32_bits(seq - 1));
> > > > +                               DATA_SEL(write64bit ? 2 : 1) |
> > > > INT_SEL(0));
> > > > +       amdgpu_ring_write(ring, lower_32_bits(seq));
> > > > +       amdgpu_ring_write(ring, upper_32_bits(seq));
> > > >    
> > > > -       /* Then send the real EOP event down the pipe. */
> > > >          amdgpu_ring_write(ring,
> > > > PACKET3(PACKET3_EVENT_WRITE_EOP,
> > > > 4));
> > > >          amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
> > > >                                   EOP_TC_ACTION_EN |
> > > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > > > b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > > > index 2f0e72caee1af..39a7d60f1fd69 100644
> > > > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > > > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > > > @@ -6153,9 +6153,7 @@ static void
> > > > gfx_v8_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64
> > > > addr,
> > > >          bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
> > > >          bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
> > > >    
> > > > -       /* Workaround for cache flush problems. First send a
> > > > dummy
> > > > EOP
> > > > -        * event down the pipe with seq one below.
> > > > -        */
> > > > +       /* Workaround for cache flush problems, send EOP twice.
> > > > */
> > > >          amdgpu_ring_write(ring,
> > > > PACKET3(PACKET3_EVENT_WRITE_EOP,
> > > > 4));
> > > >          amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
> > > >                                   EOP_TC_ACTION_EN |
> > > > @@ -6164,12 +6162,10 @@ static void
> > > > gfx_v8_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64
> > > > addr,
> > > >                                   EVENT_INDEX(5)));
> > > >          amdgpu_ring_write(ring, addr & 0xfffffffc);
> > > >          amdgpu_ring_write(ring, (upper_32_bits(addr) & 0xffff)
> > > > |
> > > > -                               DATA_SEL(1) | INT_SEL(0));
> > > > -       amdgpu_ring_write(ring, lower_32_bits(seq - 1));
> > > > -       amdgpu_ring_write(ring, upper_32_bits(seq - 1));
> > > > +                         DATA_SEL(write64bit ? 2 : 1) |
> > > > INT_SEL(0));
> > > > +       amdgpu_ring_write(ring, lower_32_bits(seq));
> > > > +       amdgpu_ring_write(ring, upper_32_bits(seq));
> > > >    
> > > > -       /* Then send the real EOP event down the pipe:
> > > > -        * EVENT_WRITE_EOP - flush caches, send int */
> > > >          amdgpu_ring_write(ring,
> > > > PACKET3(PACKET3_EVENT_WRITE_EOP,
> > > > 4));
> > > >          amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
> > > >                                   EOP_TC_ACTION_EN |
>
Christian König June 17, 2024, 1:59 p.m. UTC | #5
Am 17.06.24 um 15:43 schrieb Icenowy Zheng:
> 在 2024-06-17星期一的 15:09 +0200,Christian König写道:
>> Am 17.06.24 um 15:03 schrieb Icenowy Zheng:
>>> 在 2024-06-17星期一的 14:35 +0200,Christian König写道:
>>>> Am 17.06.24 um 12:58 schrieb Icenowy Zheng:
>>>>> The duplication of EOP packets for GFX7/8, with the former one
>>>>> have
>>>>> seq-1 written and the latter one have seq written, seems to
>>>>> confuse
>>>>> some
>>>>> hardware platform (e.g. Loongson 7A series PCIe controllers).
>>>>>
>>>>> Make the content of the duplicated EOP packet the same with the
>>>>> real
>>>>> one, only masking any possible interrupts.
>>>> Well completely NAK to that, exactly that disables the
>>>> workaround.
>>>>
>>>> The CPU needs to see two different values written here.
>>> Why do the CPU need to see two different values here? Only the
>>> second
>>> packet will raise an interrupt before and after applying this
>>> patch,
>>> and the first packet's result should just be overriden on ordinary
>>> platforms. The CPU won't see the first one, until it's polling for
>>> the
>>> address for a very short interval, so short that the GPU CP
>>> couldn't
>>> execute 2 commands.
>> Yes exactly that. We need to make two writes, one with the old value
>> (seq - 1) and a second with the real value (seq).
>>
>> Otherwise it is possible that a polling CPU would see the sequence
>> before the second EOP is issued with results in incoherent view of
>> memory.
> In this case shouldn't we write seq-1 before any work, and then write
> seq after work, like what is done in Mesa?

No. This hw workaround requires that two consecutive write operations 
happen directly behind each other on the PCIe bus with two different values.

To make the software logic around that work without any changes we use 
the values seq - 1 and seq because those are guaranteed to be different 
and not trigger any unwanted software behavior.

Only then we can guarantee that we have a coherent view of system memory.

> As what I see, Mesa uses another command buffer to emit a
> EVENT_WRITE_EOP writing 0, and commit this command buffer before the
> real command buffer.
>
>>> Or do you mean the GPU needs to see two different values being
>>> written,
>>> or they will be merged into only one write request?
>>>
>>> Please give out more information about this workaround, otherwise
>>> the
>>> GPU hang problem on Loongson platforms will persist.
>> Well if Loongson can't handle two consecutive write operations to the
>> same address with different values then you have a massive platform
>> bug.
> I think the issue is triggered when two consecutive write operations
> and one IRQ is present, which is exactly the case of this function.

Well then you have a massive platform bug.

Two consecutive writes to the same bus address are perfectly legal from 
the PCIe specification and can happen all the time, even without this 
specific hw workaround.

Regards,
Christian.

>
>> That is something which can happen all the time throughout the
>> operation.
>>
>> Regards,
>> Christian.
>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>> Fixes: bf26da927a1c ("drm/amdgpu: add cache flush workaround to
>>>>> gfx8 emit_fence")
>>>>> Fixes: a2e73f56fa62 ("drm/amdgpu: Add support for CIK parts")
>>>>> Signed-off-by: Icenowy Zheng <uwu@icenowy.me>
>>>>> ---
>>>>>     drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 12 +++++-------
>>>>>     drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 12 ++++--------
>>>>>     2 files changed, 9 insertions(+), 15 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>>>> index 541dbd70d8c75..778f27f1a34fe 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>>>> @@ -2117,9 +2117,8 @@ static void
>>>>> gfx_v7_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64
>>>>> addr,
>>>>>     {
>>>>>           bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
>>>>>           bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
>>>>> -       /* Workaround for cache flush problems. First send a
>>>>> dummy
>>>>> EOP
>>>>> -        * event down the pipe with seq one below.
>>>>> -        */
>>>>> +
>>>>> +       /* Workaround for cache flush problems, send EOP twice.
>>>>> */
>>>>>           amdgpu_ring_write(ring,
>>>>> PACKET3(PACKET3_EVENT_WRITE_EOP,
>>>>> 4));
>>>>>           amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
>>>>>                                    EOP_TC_ACTION_EN |
>>>>> @@ -2127,11 +2126,10 @@ static void
>>>>> gfx_v7_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64
>>>>> addr,
>>>>>                                    EVENT_INDEX(5)));
>>>>>           amdgpu_ring_write(ring, addr & 0xfffffffc);
>>>>>           amdgpu_ring_write(ring, (upper_32_bits(addr) & 0xffff)
>>>>> |
>>>>> -                               DATA_SEL(1) | INT_SEL(0));
>>>>> -       amdgpu_ring_write(ring, lower_32_bits(seq - 1));
>>>>> -       amdgpu_ring_write(ring, upper_32_bits(seq - 1));
>>>>> +                               DATA_SEL(write64bit ? 2 : 1) |
>>>>> INT_SEL(0));
>>>>> +       amdgpu_ring_write(ring, lower_32_bits(seq));
>>>>> +       amdgpu_ring_write(ring, upper_32_bits(seq));
>>>>>     
>>>>> -       /* Then send the real EOP event down the pipe. */
>>>>>           amdgpu_ring_write(ring,
>>>>> PACKET3(PACKET3_EVENT_WRITE_EOP,
>>>>> 4));
>>>>>           amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
>>>>>                                    EOP_TC_ACTION_EN |
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>> index 2f0e72caee1af..39a7d60f1fd69 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>> @@ -6153,9 +6153,7 @@ static void
>>>>> gfx_v8_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64
>>>>> addr,
>>>>>           bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
>>>>>           bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
>>>>>     
>>>>> -       /* Workaround for cache flush problems. First send a
>>>>> dummy
>>>>> EOP
>>>>> -        * event down the pipe with seq one below.
>>>>> -        */
>>>>> +       /* Workaround for cache flush problems, send EOP twice.
>>>>> */
>>>>>           amdgpu_ring_write(ring,
>>>>> PACKET3(PACKET3_EVENT_WRITE_EOP,
>>>>> 4));
>>>>>           amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
>>>>>                                    EOP_TC_ACTION_EN |
>>>>> @@ -6164,12 +6162,10 @@ static void
>>>>> gfx_v8_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64
>>>>> addr,
>>>>>                                    EVENT_INDEX(5)));
>>>>>           amdgpu_ring_write(ring, addr & 0xfffffffc);
>>>>>           amdgpu_ring_write(ring, (upper_32_bits(addr) & 0xffff)
>>>>> |
>>>>> -                               DATA_SEL(1) | INT_SEL(0));
>>>>> -       amdgpu_ring_write(ring, lower_32_bits(seq - 1));
>>>>> -       amdgpu_ring_write(ring, upper_32_bits(seq - 1));
>>>>> +                         DATA_SEL(write64bit ? 2 : 1) |
>>>>> INT_SEL(0));
>>>>> +       amdgpu_ring_write(ring, lower_32_bits(seq));
>>>>> +       amdgpu_ring_write(ring, upper_32_bits(seq));
>>>>>     
>>>>> -       /* Then send the real EOP event down the pipe:
>>>>> -        * EVENT_WRITE_EOP - flush caches, send int */
>>>>>           amdgpu_ring_write(ring,
>>>>> PACKET3(PACKET3_EVENT_WRITE_EOP,
>>>>> 4));
>>>>>           amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
>>>>>                                    EOP_TC_ACTION_EN |
Icenowy Zheng June 17, 2024, 2:30 p.m. UTC | #6
在 2024-06-17星期一的 15:59 +0200,Christian König写道:
> Am 17.06.24 um 15:43 schrieb Icenowy Zheng:
> > 在 2024-06-17星期一的 15:09 +0200,Christian König写道:
> > > Am 17.06.24 um 15:03 schrieb Icenowy Zheng:
> > > > 在 2024-06-17星期一的 14:35 +0200,Christian König写道:
> > > > > Am 17.06.24 um 12:58 schrieb Icenowy Zheng:
> > > > > > The duplication of EOP packets for GFX7/8, with the former
> > > > > > one
> > > > > > have
> > > > > > seq-1 written and the latter one have seq written, seems to
> > > > > > confuse
> > > > > > some
> > > > > > hardware platform (e.g. Loongson 7A series PCIe
> > > > > > controllers).
> > > > > > 
> > > > > > Make the content of the duplicated EOP packet the same with
> > > > > > the
> > > > > > real
> > > > > > one, only masking any possible interrupts.
> > > > > Well completely NAK to that, exactly that disables the
> > > > > workaround.
> > > > > 
> > > > > The CPU needs to see two different values written here.
> > > > Why do the CPU need to see two different values here? Only the
> > > > second
> > > > packet will raise an interrupt before and after applying this
> > > > patch,
> > > > and the first packet's result should just be overriden on
> > > > ordinary
> > > > platforms. The CPU won't see the first one, until it's polling
> > > > for
> > > > the
> > > > address for a very short interval, so short that the GPU CP
> > > > couldn't
> > > > execute 2 commands.
> > > Yes exactly that. We need to make two writes, one with the old
> > > value
> > > (seq - 1) and a second with the real value (seq).
> > > 
> > > Otherwise it is possible that a polling CPU would see the
> > > sequence
> > > before the second EOP is issued with results in incoherent view
> > > of
> > > memory.
> > In this case shouldn't we write seq-1 before any work, and then
> > write
> > seq after work, like what is done in Mesa?
> 
> No. This hw workaround requires that two consecutive write operations
> happen directly behind each other on the PCIe bus with two different
> values.

Well to be honest the workaround code in Mesa seems to not be working
in this way ...

> 
> To make the software logic around that work without any changes we
> use 
> the values seq - 1 and seq because those are guaranteed to be
> different 
> and not trigger any unwanted software behavior.
> 
> Only then we can guarantee that we have a coherent view of system
> memory.

Any more details about it?

BTW in this case, could I try to write it for 3 times instead of 2,
with seq-1, seq and seq?

> 
> > As what I see, Mesa uses another command buffer to emit a
> > EVENT_WRITE_EOP writing 0, and commit this command buffer before
> > the
> > real command buffer.
> > 
> > > > Or do you mean the GPU needs to see two different values being
> > > > written,
> > > > or they will be merged into only one write request?
> > > > 
> > > > Please give out more information about this workaround,
> > > > otherwise
> > > > the
> > > > GPU hang problem on Loongson platforms will persist.
> > > Well if Loongson can't handle two consecutive write operations to
> > > the
> > > same address with different values then you have a massive
> > > platform
> > > bug.
> > I think the issue is triggered when two consecutive write
> > operations
> > and one IRQ is present, which is exactly the case of this function.
> 
> Well then you have a massive platform bug.
> 
> Two consecutive writes to the same bus address are perfectly legal
> from 
> the PCIe specification and can happen all the time, even without this
> specific hw workaround.

Yes I know it, and I am not from Loongson, just some user trying to
mess around it.

> 
> Regards,
> Christian.
> 
> > 
> > > That is something which can happen all the time throughout the
> > > operation.
> > > 
> > > Regards,
> > > Christian.
> > > 
> > > > > Regards,
> > > > > Christian.
> > > > > 
> > > > > > Fixes: bf26da927a1c ("drm/amdgpu: add cache flush
> > > > > > workaround to
> > > > > > gfx8 emit_fence")
> > > > > > Fixes: a2e73f56fa62 ("drm/amdgpu: Add support for CIK
> > > > > > parts")
> > > > > > Signed-off-by: Icenowy Zheng <uwu@icenowy.me>
> > > > > > ---
> > > > > >     drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 12 +++++-------
> > > > > >     drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 12 ++++--------
> > > > > >     2 files changed, 9 insertions(+), 15 deletions(-)
> > > > > > 
> > > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> > > > > > b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> > > > > > index 541dbd70d8c75..778f27f1a34fe 100644
> > > > > > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> > > > > > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> > > > > > @@ -2117,9 +2117,8 @@ static void
> > > > > > gfx_v7_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64
> > > > > > addr,
> > > > > >     {
> > > > > >           bool write64bit = flags &
> > > > > > AMDGPU_FENCE_FLAG_64BIT;
> > > > > >           bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
> > > > > > -       /* Workaround for cache flush problems. First send
> > > > > > a
> > > > > > dummy
> > > > > > EOP
> > > > > > -        * event down the pipe with seq one below.
> > > > > > -        */
> > > > > > +
> > > > > > +       /* Workaround for cache flush problems, send EOP
> > > > > > twice.
> > > > > > */
> > > > > >           amdgpu_ring_write(ring,
> > > > > > PACKET3(PACKET3_EVENT_WRITE_EOP,
> > > > > > 4));
> > > > > >           amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
> > > > > >                                    EOP_TC_ACTION_EN |
> > > > > > @@ -2127,11 +2126,10 @@ static void
> > > > > > gfx_v7_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64
> > > > > > addr,
> > > > > >                                    EVENT_INDEX(5)));
> > > > > >           amdgpu_ring_write(ring, addr & 0xfffffffc);
> > > > > >           amdgpu_ring_write(ring, (upper_32_bits(addr) &
> > > > > > 0xffff)
> > > > > > > 
> > > > > > -                               DATA_SEL(1) | INT_SEL(0));
> > > > > > -       amdgpu_ring_write(ring, lower_32_bits(seq - 1));
> > > > > > -       amdgpu_ring_write(ring, upper_32_bits(seq - 1));
> > > > > > +                               DATA_SEL(write64bit ? 2 :
> > > > > > 1) |
> > > > > > INT_SEL(0));
> > > > > > +       amdgpu_ring_write(ring, lower_32_bits(seq));
> > > > > > +       amdgpu_ring_write(ring, upper_32_bits(seq));
> > > > > >     
> > > > > > -       /* Then send the real EOP event down the pipe. */
> > > > > >           amdgpu_ring_write(ring,
> > > > > > PACKET3(PACKET3_EVENT_WRITE_EOP,
> > > > > > 4));
> > > > > >           amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
> > > > > >                                    EOP_TC_ACTION_EN |
> > > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > > > > > b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > > > > > index 2f0e72caee1af..39a7d60f1fd69 100644
> > > > > > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > > > > > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > > > > > @@ -6153,9 +6153,7 @@ static void
> > > > > > gfx_v8_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64
> > > > > > addr,
> > > > > >           bool write64bit = flags &
> > > > > > AMDGPU_FENCE_FLAG_64BIT;
> > > > > >           bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
> > > > > >     
> > > > > > -       /* Workaround for cache flush problems. First send
> > > > > > a
> > > > > > dummy
> > > > > > EOP
> > > > > > -        * event down the pipe with seq one below.
> > > > > > -        */
> > > > > > +       /* Workaround for cache flush problems, send EOP
> > > > > > twice.
> > > > > > */
> > > > > >           amdgpu_ring_write(ring,
> > > > > > PACKET3(PACKET3_EVENT_WRITE_EOP,
> > > > > > 4));
> > > > > >           amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
> > > > > >                                    EOP_TC_ACTION_EN |
> > > > > > @@ -6164,12 +6162,10 @@ static void
> > > > > > gfx_v8_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64
> > > > > > addr,
> > > > > >                                    EVENT_INDEX(5)));
> > > > > >           amdgpu_ring_write(ring, addr & 0xfffffffc);
> > > > > >           amdgpu_ring_write(ring, (upper_32_bits(addr) &
> > > > > > 0xffff)
> > > > > > > 
> > > > > > -                               DATA_SEL(1) | INT_SEL(0));
> > > > > > -       amdgpu_ring_write(ring, lower_32_bits(seq - 1));
> > > > > > -       amdgpu_ring_write(ring, upper_32_bits(seq - 1));
> > > > > > +                         DATA_SEL(write64bit ? 2 : 1) |
> > > > > > INT_SEL(0));
> > > > > > +       amdgpu_ring_write(ring, lower_32_bits(seq));
> > > > > > +       amdgpu_ring_write(ring, upper_32_bits(seq));
> > > > > >     
> > > > > > -       /* Then send the real EOP event down the pipe:
> > > > > > -        * EVENT_WRITE_EOP - flush caches, send int */
> > > > > >           amdgpu_ring_write(ring,
> > > > > > PACKET3(PACKET3_EVENT_WRITE_EOP,
> > > > > > 4));
> > > > > >           amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
> > > > > >                                    EOP_TC_ACTION_EN |
>
Christian König June 17, 2024, 2:42 p.m. UTC | #7
Am 17.06.24 um 16:30 schrieb Icenowy Zheng:
> 在 2024-06-17星期一的 15:59 +0200,Christian König写道:
>> Am 17.06.24 um 15:43 schrieb Icenowy Zheng:
>>> 在 2024-06-17星期一的 15:09 +0200,Christian König写道:
>>>> Am 17.06.24 um 15:03 schrieb Icenowy Zheng:
>>>>> 在 2024-06-17星期一的 14:35 +0200,Christian König写道:
>>>>>> Am 17.06.24 um 12:58 schrieb Icenowy Zheng:
>>>>>>> The duplication of EOP packets for GFX7/8, with the former
>>>>>>> one
>>>>>>> have
>>>>>>> seq-1 written and the latter one have seq written, seems to
>>>>>>> confuse
>>>>>>> some
>>>>>>> hardware platform (e.g. Loongson 7A series PCIe
>>>>>>> controllers).
>>>>>>>
>>>>>>> Make the content of the duplicated EOP packet the same with
>>>>>>> the
>>>>>>> real
>>>>>>> one, only masking any possible interrupts.
>>>>>> Well completely NAK to that, exactly that disables the
>>>>>> workaround.
>>>>>>
>>>>>> The CPU needs to see two different values written here.
>>>>> Why do the CPU need to see two different values here? Only the
>>>>> second
>>>>> packet will raise an interrupt before and after applying this
>>>>> patch,
>>>>> and the first packet's result should just be overriden on
>>>>> ordinary
>>>>> platforms. The CPU won't see the first one, until it's polling
>>>>> for
>>>>> the
>>>>> address for a very short interval, so short that the GPU CP
>>>>> couldn't
>>>>> execute 2 commands.
>>>> Yes exactly that. We need to make two writes, one with the old
>>>> value
>>>> (seq - 1) and a second with the real value (seq).
>>>>
>>>> Otherwise it is possible that a polling CPU would see the
>>>> sequence
>>>> before the second EOP is issued with results in incoherent view
>>>> of
>>>> memory.
>>> In this case shouldn't we write seq-1 before any work, and then
>>> write
>>> seq after work, like what is done in Mesa?
>> No. This hw workaround requires that two consecutive write operations
>> happen directly behind each other on the PCIe bus with two different
>> values.
> Well to be honest the workaround code in Mesa seems to not be working
> in this way ...

Mesa doesn't have any workaround for that hw issue, the code there uses 
a quite different approach.

>> To make the software logic around that work without any changes we
>> use
>> the values seq - 1 and seq because those are guaranteed to be
>> different
>> and not trigger any unwanted software behavior.
>>
>> Only then we can guarantee that we have a coherent view of system
>> memory.
> Any more details about it?

No, sorry. All I know is that it's a bug in the cache flush logic which 
can be worked around by issuing two write behind each other to the same 
location.

> BTW in this case, could I try to write it for 3 times instead of 2,
> with seq-1, seq and seq?

That could potentially work as well, but at some point we would need to 
increase the EOP ring buffer size or could run into performance issues.

>>> As what I see, Mesa uses another command buffer to emit a
>>> EVENT_WRITE_EOP writing 0, and commit this command buffer before
>>> the
>>> real command buffer.
>>>
>>>>> Or do you mean the GPU needs to see two different values being
>>>>> written,
>>>>> or they will be merged into only one write request?
>>>>>
>>>>> Please give out more information about this workaround,
>>>>> otherwise
>>>>> the
>>>>> GPU hang problem on Loongson platforms will persist.
>>>> Well if Loongson can't handle two consecutive write operations to
>>>> the
>>>> same address with different values then you have a massive
>>>> platform
>>>> bug.
>>> I think the issue is triggered when two consecutive write
>>> operations
>>> and one IRQ is present, which is exactly the case of this function.
>> Well then you have a massive platform bug.
>>
>> Two consecutive writes to the same bus address are perfectly legal
>> from
>> the PCIe specification and can happen all the time, even without this
>> specific hw workaround.
> Yes I know it, and I am not from Loongson, just some user trying to
> mess around it.

Well to be honest on a platform where even two consecutive writes to the 
same location doesn't work I would have strong doubts that it is stable 
in general.

Regards,
Christian.
Icenowy Zheng June 17, 2024, 2:57 p.m. UTC | #8
在 2024-06-17星期一的 16:42 +0200,Christian König写道:
> Am 17.06.24 um 16:30 schrieb Icenowy Zheng:
> > 在 2024-06-17星期一的 15:59 +0200,Christian König写道:
> > > Am 17.06.24 um 15:43 schrieb Icenowy Zheng:
> > > > 在 2024-06-17星期一的 15:09 +0200,Christian König写道:
> > > > > Am 17.06.24 um 15:03 schrieb Icenowy Zheng:
> > > > > > 在 2024-06-17星期一的 14:35 +0200,Christian König写道:
> > > > > > > Am 17.06.24 um 12:58 schrieb Icenowy Zheng:
> > > > > > > > The duplication of EOP packets for GFX7/8, with the
> > > > > > > > former
> > > > > > > > one
> > > > > > > > have
> > > > > > > > seq-1 written and the latter one have seq written,
> > > > > > > > seems to
> > > > > > > > confuse
> > > > > > > > some
> > > > > > > > hardware platform (e.g. Loongson 7A series PCIe
> > > > > > > > controllers).
> > > > > > > > 
> > > > > > > > Make the content of the duplicated EOP packet the same
> > > > > > > > with
> > > > > > > > the
> > > > > > > > real
> > > > > > > > one, only masking any possible interrupts.
> > > > > > > Well completely NAK to that, exactly that disables the
> > > > > > > workaround.
> > > > > > > 
> > > > > > > The CPU needs to see two different values written here.
> > > > > > Why do the CPU need to see two different values here? Only
> > > > > > the
> > > > > > second
> > > > > > packet will raise an interrupt before and after applying
> > > > > > this
> > > > > > patch,
> > > > > > and the first packet's result should just be overriden on
> > > > > > ordinary
> > > > > > platforms. The CPU won't see the first one, until it's
> > > > > > polling
> > > > > > for
> > > > > > the
> > > > > > address for a very short interval, so short that the GPU CP
> > > > > > couldn't
> > > > > > execute 2 commands.
> > > > > Yes exactly that. We need to make two writes, one with the
> > > > > old
> > > > > value
> > > > > (seq - 1) and a second with the real value (seq).
> > > > > 
> > > > > Otherwise it is possible that a polling CPU would see the
> > > > > sequence
> > > > > before the second EOP is issued with results in incoherent
> > > > > view
> > > > > of
> > > > > memory.
> > > > In this case shouldn't we write seq-1 before any work, and then
> > > > write
> > > > seq after work, like what is done in Mesa?
> > > No. This hw workaround requires that two consecutive write
> > > operations
> > > happen directly behind each other on the PCIe bus with two
> > > different
> > > values.
> > Well to be honest the workaround code in Mesa seems to not be
> > working
> > in this way ...
> 
> Mesa doesn't have any workaround for that hw issue, the code there
> uses 
> a quite different approach.

Ah? Commit bf26da927a1c ("drm/amdgpu: add cache flush workaround to
gfx8 emit_fence") says "Both PAL and Mesa use it for gfx8 too, so port
this commit to gfx_v8_0_ring_emit_fence_gfx", so maybe the workaround
should just be not necessary here?


> 
> > > To make the software logic around that work without any changes
> > > we
> > > use
> > > the values seq - 1 and seq because those are guaranteed to be
> > > different
> > > and not trigger any unwanted software behavior.
> > > 
> > > Only then we can guarantee that we have a coherent view of system
> > > memory.
> > Any more details about it?
> 
> No, sorry. All I know is that it's a bug in the cache flush logic
> which 
> can be worked around by issuing two write behind each other to the
> same 
> location.

So the issue is that the first EOP write does not properly flush the
cache? Could EVENT_WRITE be used instead of EVENT_WRITE_EOP in this
workaround to properly flush it without hurting the fence value?


> 
> > BTW in this case, could I try to write it for 3 times instead of 2,
> > with seq-1, seq and seq?
> 
> That could potentially work as well, but at some point we would need
> to 
> increase the EOP ring buffer size or could run into performance
> issues.

Well I will try this. I think the buffer is enlarged in the original
workaround commit.

> 
> > > > As what I see, Mesa uses another command buffer to emit a
> > > > EVENT_WRITE_EOP writing 0, and commit this command buffer
> > > > before
> > > > the
> > > > real command buffer.
> > > > 
> > > > > > Or do you mean the GPU needs to see two different values
> > > > > > being
> > > > > > written,
> > > > > > or they will be merged into only one write request?
> > > > > > 
> > > > > > Please give out more information about this workaround,
> > > > > > otherwise
> > > > > > the
> > > > > > GPU hang problem on Loongson platforms will persist.
> > > > > Well if Loongson can't handle two consecutive write
> > > > > operations to
> > > > > the
> > > > > same address with different values then you have a massive
> > > > > platform
> > > > > bug.
> > > > I think the issue is triggered when two consecutive write
> > > > operations
> > > > and one IRQ is present, which is exactly the case of this
> > > > function.
> > > Well then you have a massive platform bug.
> > > 
> > > Two consecutive writes to the same bus address are perfectly
> > > legal
> > > from
> > > the PCIe specification and can happen all the time, even without
> > > this
> > > specific hw workaround.
> > Yes I know it, and I am not from Loongson, just some user trying to
> > mess around it.
> 
> Well to be honest on a platform where even two consecutive writes to
> the 
> same location doesn't work I would have strong doubts that it is
> stable 
> in general.

Well I think the current situation is that the IRQ triggered by the
second EOP packet arrives before the second write is finished, not the
second write is totally dropped.

> 
> Regards,
> Christian.
Christian König June 17, 2024, 3:07 p.m. UTC | #9
Am 17.06.24 um 16:57 schrieb Icenowy Zheng:
> 在 2024-06-17星期一的 16:42 +0200,Christian König写道:
>> Am 17.06.24 um 16:30 schrieb Icenowy Zheng:
>>> 在 2024-06-17星期一的 15:59 +0200,Christian König写道:
>>>> Am 17.06.24 um 15:43 schrieb Icenowy Zheng:
>>>>> 在 2024-06-17星期一的 15:09 +0200,Christian König写道:
>>>>>> ...
>>>>> In this case shouldn't we write seq-1 before any work, and then
>>>>> write
>>>>> seq after work, like what is done in Mesa?
>>>> No. This hw workaround requires that two consecutive write
>>>> operations
>>>> happen directly behind each other on the PCIe bus with two
>>>> different
>>>> values.
>>> Well to be honest the workaround code in Mesa seems to not be
>>> working
>>> in this way ...
>> Mesa doesn't have any workaround for that hw issue, the code there
>> uses
>> a quite different approach.
> Ah? Commit bf26da927a1c ("drm/amdgpu: add cache flush workaround to
> gfx8 emit_fence") says "Both PAL and Mesa use it for gfx8 too, so port
> this commit to gfx_v8_0_ring_emit_fence_gfx", so maybe the workaround
> should just be not necessary here?

What I meant was that Mesa doesn't have a hack like writing seq - 1 and 
then seq.

I haven't checked the code, but it uses a different approach with 64bit 
values as far as I know.

>>>> To make the software logic around that work without any changes
>>>> we
>>>> use
>>>> the values seq - 1 and seq because those are guaranteed to be
>>>> different
>>>> and not trigger any unwanted software behavior.
>>>>
>>>> Only then we can guarantee that we have a coherent view of system
>>>> memory.
>>> Any more details about it?
>> No, sorry. All I know is that it's a bug in the cache flush logic
>> which
>> can be worked around by issuing two write behind each other to the
>> same
>> location.
> So the issue is that the first EOP write does not properly flush the
> cache? Could EVENT_WRITE be used instead of EVENT_WRITE_EOP in this
> workaround to properly flush it without hurting the fence value?

No, EVENT_WRITE is executed at a different time in the pipeline.

>>> ...
>> Well to be honest on a platform where even two consecutive writes to
>> the
>> same location doesn't work I would have strong doubts that it is
>> stable
>> in general.
> Well I think the current situation is that the IRQ triggered by the
> second EOP packet arrives before the second write is finished, not the
> second write is totally dropped.

Well that sounds like the usual re-ordering problems we have seen 
patches for on Loongson multiple times now.

And I can only repeat what I've wrote before: We don't accept 
workarounds in drivers for problems cause by severely platform issues.

Especially when that is clearly against any PCIe specification.

Regards,
Christian.

>
>> Regards,
>> Christian.
Xi Ruoyao June 17, 2024, 3:35 p.m. UTC | #10
On Mon, 2024-06-17 at 22:30 +0800, Icenowy Zheng wrote:
> > Two consecutive writes to the same bus address are perfectly legal
> > from 
> > the PCIe specification and can happen all the time, even without this
> > specific hw workaround.
> 
> Yes I know it, and I am not from Loongson, just some user trying to
> mess around it.

There are some purposed "workarounds" like reducing the link speed (from
x16 to x8), tweaking the power management setting, etc.  Someone even
claims improving the heat sink of the LS7A chip can help to work around
this issue but I'm really skeptical...
Christian König June 17, 2024, 3:53 p.m. UTC | #11
Am 17.06.24 um 17:35 schrieb Xi Ruoyao:
> On Mon, 2024-06-17 at 22:30 +0800, Icenowy Zheng wrote:
>>> Two consecutive writes to the same bus address are perfectly legal
>>> from
>>> the PCIe specification and can happen all the time, even without this
>>> specific hw workaround.
>> Yes I know it, and I am not from Loongson, just some user trying to
>> mess around it.
> There are some purposed "workarounds" like reducing the link speed (from
> x16 to x8), tweaking the power management setting, etc.  Someone even
> claims improving the heat sink of the LS7A chip can help to work around
> this issue but I'm really skeptical...

Well when it's an ordering problem between writes and interrupts then 
nothing else than getting the order right will fix this. Otherwise it 
can always be that the CPU doesn't see coherent results from PCIe devices.

In other words if the CPU gets an interrupt but doesn't sees the fence 
value written it will assume the work is not done. But since the 
hardware won't trigger a second interrupt the CPU will then keep waiting 
for the operation to finish forever.

This is not limited to GPUs, but will potentially happen with network or 
disk I/O as well.

Regards,
Christian.
Icenowy Zheng June 17, 2024, 4:09 p.m. UTC | #12
在 2024-06-17星期一的 15:59 +0200,Christian König写道:
> Am 17.06.24 um 15:43 schrieb Icenowy Zheng:
> > 在 2024-06-17星期一的 15:09 +0200,Christian König写道:
> > > Am 17.06.24 um 15:03 schrieb Icenowy Zheng:
> > > > 在 2024-06-17星期一的 14:35 +0200,Christian König写道:
> > > > > Am 17.06.24 um 12:58 schrieb Icenowy Zheng:
> > > > > > The duplication of EOP packets for GFX7/8, with the former
> > > > > > one
> > > > > > have
> > > > > > seq-1 written and the latter one have seq written, seems to
> > > > > > confuse
> > > > > > some
> > > > > > hardware platform (e.g. Loongson 7A series PCIe
> > > > > > controllers).
> > > > > > 
> > > > > > Make the content of the duplicated EOP packet the same with
> > > > > > the
> > > > > > real
> > > > > > one, only masking any possible interrupts.
> > > > > Well completely NAK to that, exactly that disables the
> > > > > workaround.
> > > > > 
> > > > > The CPU needs to see two different values written here.
> > > > Why do the CPU need to see two different values here? Only the
> > > > second
> > > > packet will raise an interrupt before and after applying this
> > > > patch,
> > > > and the first packet's result should just be overriden on
> > > > ordinary
> > > > platforms. The CPU won't see the first one, until it's polling
> > > > for
> > > > the
> > > > address for a very short interval, so short that the GPU CP
> > > > couldn't
> > > > execute 2 commands.
> > > Yes exactly that. We need to make two writes, one with the old
> > > value
> > > (seq - 1) and a second with the real value (seq).
> > > 
> > > Otherwise it is possible that a polling CPU would see the
> > > sequence
> > > before the second EOP is issued with results in incoherent view
> > > of
> > > memory.
> > In this case shouldn't we write seq-1 before any work, and then
> > write
> > seq after work, like what is done in Mesa?
> 
> No. This hw workaround requires that two consecutive write operations
> happen directly behind each other on the PCIe bus with two different
> values.
> 
> To make the software logic around that work without any changes we
> use 
> the values seq - 1 and seq because those are guaranteed to be
> different 
> and not trigger any unwanted software behavior.
> 
> Only then we can guarantee that we have a coherent view of system
> memory.

BTW is there any operation that could be taken to examine this specific
workaround?

Is there any case possible to reproduce?

> 
> > As what I see, Mesa uses another command buffer to emit a
> > EVENT_WRITE_EOP writing 0, and commit this command buffer before
> > the
> > real command buffer.
> > 
> > > > Or do you mean the GPU needs to see two different values being
> > > > written,
> > > > or they will be merged into only one write request?
> > > > 
> > > > Please give out more information about this workaround,
> > > > otherwise
> > > > the
> > > > GPU hang problem on Loongson platforms will persist.
> > > Well if Loongson can't handle two consecutive write operations to
> > > the
> > > same address with different values then you have a massive
> > > platform
> > > bug.
> > I think the issue is triggered when two consecutive write
> > operations
> > and one IRQ is present, which is exactly the case of this function.
> 
> Well then you have a massive platform bug.
> 
> Two consecutive writes to the same bus address are perfectly legal
> from 
> the PCIe specification and can happen all the time, even without this
> specific hw workaround.
> 
> Regards,
> Christian.
> 
> > 
> > > That is something which can happen all the time throughout the
> > > operation.
> > > 
> > > Regards,
> > > Christian.
> > > 
> > > > > Regards,
> > > > > Christian.
> > > > > 
> > > > > > Fixes: bf26da927a1c ("drm/amdgpu: add cache flush
> > > > > > workaround to
> > > > > > gfx8 emit_fence")
> > > > > > Fixes: a2e73f56fa62 ("drm/amdgpu: Add support for CIK
> > > > > > parts")
> > > > > > Signed-off-by: Icenowy Zheng <uwu@icenowy.me>
> > > > > > ---
> > > > > >     drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 12 +++++-------
> > > > > >     drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 12 ++++--------
> > > > > >     2 files changed, 9 insertions(+), 15 deletions(-)
> > > > > > 
> > > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> > > > > > b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> > > > > > index 541dbd70d8c75..778f27f1a34fe 100644
> > > > > > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> > > > > > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> > > > > > @@ -2117,9 +2117,8 @@ static void
> > > > > > gfx_v7_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64
> > > > > > addr,
> > > > > >     {
> > > > > >           bool write64bit = flags &
> > > > > > AMDGPU_FENCE_FLAG_64BIT;
> > > > > >           bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
> > > > > > -       /* Workaround for cache flush problems. First send
> > > > > > a
> > > > > > dummy
> > > > > > EOP
> > > > > > -        * event down the pipe with seq one below.
> > > > > > -        */
> > > > > > +
> > > > > > +       /* Workaround for cache flush problems, send EOP
> > > > > > twice.
> > > > > > */
> > > > > >           amdgpu_ring_write(ring,
> > > > > > PACKET3(PACKET3_EVENT_WRITE_EOP,
> > > > > > 4));
> > > > > >           amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
> > > > > >                                    EOP_TC_ACTION_EN |
> > > > > > @@ -2127,11 +2126,10 @@ static void
> > > > > > gfx_v7_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64
> > > > > > addr,
> > > > > >                                    EVENT_INDEX(5)));
> > > > > >           amdgpu_ring_write(ring, addr & 0xfffffffc);
> > > > > >           amdgpu_ring_write(ring, (upper_32_bits(addr) &
> > > > > > 0xffff)
> > > > > > > 
> > > > > > -                               DATA_SEL(1) | INT_SEL(0));
> > > > > > -       amdgpu_ring_write(ring, lower_32_bits(seq - 1));
> > > > > > -       amdgpu_ring_write(ring, upper_32_bits(seq - 1));
> > > > > > +                               DATA_SEL(write64bit ? 2 :
> > > > > > 1) |
> > > > > > INT_SEL(0));
> > > > > > +       amdgpu_ring_write(ring, lower_32_bits(seq));
> > > > > > +       amdgpu_ring_write(ring, upper_32_bits(seq));
> > > > > >     
> > > > > > -       /* Then send the real EOP event down the pipe. */
> > > > > >           amdgpu_ring_write(ring,
> > > > > > PACKET3(PACKET3_EVENT_WRITE_EOP,
> > > > > > 4));
> > > > > >           amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
> > > > > >                                    EOP_TC_ACTION_EN |
> > > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > > > > > b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > > > > > index 2f0e72caee1af..39a7d60f1fd69 100644
> > > > > > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > > > > > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > > > > > @@ -6153,9 +6153,7 @@ static void
> > > > > > gfx_v8_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64
> > > > > > addr,
> > > > > >           bool write64bit = flags &
> > > > > > AMDGPU_FENCE_FLAG_64BIT;
> > > > > >           bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
> > > > > >     
> > > > > > -       /* Workaround for cache flush problems. First send
> > > > > > a
> > > > > > dummy
> > > > > > EOP
> > > > > > -        * event down the pipe with seq one below.
> > > > > > -        */
> > > > > > +       /* Workaround for cache flush problems, send EOP
> > > > > > twice.
> > > > > > */
> > > > > >           amdgpu_ring_write(ring,
> > > > > > PACKET3(PACKET3_EVENT_WRITE_EOP,
> > > > > > 4));
> > > > > >           amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
> > > > > >                                    EOP_TC_ACTION_EN |
> > > > > > @@ -6164,12 +6162,10 @@ static void
> > > > > > gfx_v8_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64
> > > > > > addr,
> > > > > >                                    EVENT_INDEX(5)));
> > > > > >           amdgpu_ring_write(ring, addr & 0xfffffffc);
> > > > > >           amdgpu_ring_write(ring, (upper_32_bits(addr) &
> > > > > > 0xffff)
> > > > > > > 
> > > > > > -                               DATA_SEL(1) | INT_SEL(0));
> > > > > > -       amdgpu_ring_write(ring, lower_32_bits(seq - 1));
> > > > > > -       amdgpu_ring_write(ring, upper_32_bits(seq - 1));
> > > > > > +                         DATA_SEL(write64bit ? 2 : 1) |
> > > > > > INT_SEL(0));
> > > > > > +       amdgpu_ring_write(ring, lower_32_bits(seq));
> > > > > > +       amdgpu_ring_write(ring, upper_32_bits(seq));
> > > > > >     
> > > > > > -       /* Then send the real EOP event down the pipe:
> > > > > > -        * EVENT_WRITE_EOP - flush caches, send int */
> > > > > >           amdgpu_ring_write(ring,
> > > > > > PACKET3(PACKET3_EVENT_WRITE_EOP,
> > > > > > 4));
> > > > > >           amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
> > > > > >                                    EOP_TC_ACTION_EN |
>
Christian König June 18, 2024, 6:20 a.m. UTC | #13
Am 17.06.24 um 18:09 schrieb Icenowy Zheng:
> BTW is there any operation that could be taken to examine this specific
> workaround?
>
> Is there any case possible to reproduce?

No idea, I mean that's for GFX7/8 which was released between 2013 and 2017.

My educated guess is that you could create a test where you write 
something to VRAM or GTT with a shader and than in the interrupt handler 
try to observer if you can see those values with the CPU.

If those values aren't visible with the CPU then you have either a cache 
flushing issue or your CPU and/or PCIe root complex is incorrectly 
reordering writes and interrupts.

Regards,
Christian.
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
index 541dbd70d8c75..778f27f1a34fe 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
@@ -2117,9 +2117,8 @@  static void gfx_v7_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64 addr,
 {
 	bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
 	bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
-	/* Workaround for cache flush problems. First send a dummy EOP
-	 * event down the pipe with seq one below.
-	 */
+
+	/* Workaround for cache flush problems, send EOP twice. */
 	amdgpu_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE_EOP, 4));
 	amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
 				 EOP_TC_ACTION_EN |
@@ -2127,11 +2126,10 @@  static void gfx_v7_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64 addr,
 				 EVENT_INDEX(5)));
 	amdgpu_ring_write(ring, addr & 0xfffffffc);
 	amdgpu_ring_write(ring, (upper_32_bits(addr) & 0xffff) |
-				DATA_SEL(1) | INT_SEL(0));
-	amdgpu_ring_write(ring, lower_32_bits(seq - 1));
-	amdgpu_ring_write(ring, upper_32_bits(seq - 1));
+				DATA_SEL(write64bit ? 2 : 1) | INT_SEL(0));
+	amdgpu_ring_write(ring, lower_32_bits(seq));
+	amdgpu_ring_write(ring, upper_32_bits(seq));
 
-	/* Then send the real EOP event down the pipe. */
 	amdgpu_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE_EOP, 4));
 	amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
 				 EOP_TC_ACTION_EN |
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 2f0e72caee1af..39a7d60f1fd69 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -6153,9 +6153,7 @@  static void gfx_v8_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64 addr,
 	bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
 	bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
 
-	/* Workaround for cache flush problems. First send a dummy EOP
-	 * event down the pipe with seq one below.
-	 */
+	/* Workaround for cache flush problems, send EOP twice. */
 	amdgpu_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE_EOP, 4));
 	amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
 				 EOP_TC_ACTION_EN |
@@ -6164,12 +6162,10 @@  static void gfx_v8_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64 addr,
 				 EVENT_INDEX(5)));
 	amdgpu_ring_write(ring, addr & 0xfffffffc);
 	amdgpu_ring_write(ring, (upper_32_bits(addr) & 0xffff) |
-				DATA_SEL(1) | INT_SEL(0));
-	amdgpu_ring_write(ring, lower_32_bits(seq - 1));
-	amdgpu_ring_write(ring, upper_32_bits(seq - 1));
+			  DATA_SEL(write64bit ? 2 : 1) | INT_SEL(0));
+	amdgpu_ring_write(ring, lower_32_bits(seq));
+	amdgpu_ring_write(ring, upper_32_bits(seq));
 
-	/* Then send the real EOP event down the pipe:
-	 * EVENT_WRITE_EOP - flush caches, send int */
 	amdgpu_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE_EOP, 4));
 	amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
 				 EOP_TC_ACTION_EN |