diff mbox series

[v2,03/13] gpu: host1x: Support 40-bit addressing

Message ID 20190124180254.20080-4-thierry.reding@gmail.com (mailing list archive)
State New, archived
Headers show
Series drm/tegra: Fix IOVA space on Tegra186 and later | expand

Commit Message

Thierry Reding Jan. 24, 2019, 6:02 p.m. UTC
From: Thierry Reding <treding@nvidia.com>

Tegra186 and later support 40 bits of address space. Additional
registers need to be programmed to store the full 40 bits of push
buffer addresses.

Since command stream gathers can also reside in buffers in a 40-bit
address space, a new variant of the GATHER opcode is also introduced.
It takes two parameters: the first parameter contains the lower 32
bits of the address and the second parameter contains bits 32 to 39.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/host1x/hw/cdma_hw.c           | 13 ++++++++---
 drivers/gpu/host1x/hw/channel_hw.c        | 28 +++++++++++++++++++----
 drivers/gpu/host1x/hw/host1x06_hardware.h |  5 ++++
 drivers/gpu/host1x/hw/host1x07_hardware.h |  5 ++++
 4 files changed, 44 insertions(+), 7 deletions(-)

Comments

Mikko Perttunen Jan. 25, 2019, 9:13 a.m. UTC | #1
On 24.1.2019 20.02, Thierry Reding wrote:
> From: Thierry Reding <treding@nvidia.com>
> 
> Tegra186 and later support 40 bits of address space. Additional
> registers need to be programmed to store the full 40 bits of push
> buffer addresses.
> 
> Since command stream gathers can also reside in buffers in a 40-bit
> address space, a new variant of the GATHER opcode is also introduced.
> It takes two parameters: the first parameter contains the lower 32
> bits of the address and the second parameter contains bits 32 to 39.
> 
> Signed-off-by: Thierry Reding <treding@nvidia.com>
> ---
>   drivers/gpu/host1x/hw/cdma_hw.c           | 13 ++++++++---
>   drivers/gpu/host1x/hw/channel_hw.c        | 28 +++++++++++++++++++----
>   drivers/gpu/host1x/hw/host1x06_hardware.h |  5 ++++
>   drivers/gpu/host1x/hw/host1x07_hardware.h |  5 ++++
>   4 files changed, 44 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/host1x/hw/cdma_hw.c b/drivers/gpu/host1x/hw/cdma_hw.c
> index ce320534cbed..6d2b7af2af89 100644
> --- a/drivers/gpu/host1x/hw/cdma_hw.c
> +++ b/drivers/gpu/host1x/hw/cdma_hw.c
> @@ -68,20 +68,27 @@ static void cdma_timeout_cpu_incr(struct host1x_cdma *cdma, u32 getptr,
>   static void cdma_start(struct host1x_cdma *cdma)
>   {
>   	struct host1x_channel *ch = cdma_to_channel(cdma);
> +	u64 start, end;
>   
>   	if (cdma->running)
>   		return;
>   
>   	cdma->last_pos = cdma->push_buffer.pos;
> +	start = cdma->push_buffer.dma;
> +	end = cdma->push_buffer.dma + cdma->push_buffer.size + 4;
>   
>   	host1x_ch_writel(ch, HOST1X_CHANNEL_DMACTRL_DMASTOP,
>   			 HOST1X_CHANNEL_DMACTRL);
>   
>   	/* set base, put and end pointer */
> -	host1x_ch_writel(ch, cdma->push_buffer.dma, HOST1X_CHANNEL_DMASTART);
> +	host1x_ch_writel(ch, lower_32_bits(start), HOST1X_CHANNEL_DMASTART);
>   	host1x_ch_writel(ch, cdma->push_buffer.pos, HOST1X_CHANNEL_DMAPUT);
> -	host1x_ch_writel(ch, cdma->push_buffer.dma + cdma->push_buffer.size + 4,
> -			 HOST1X_CHANNEL_DMAEND);
> +	host1x_ch_writel(ch, lower_32_bits(end), HOST1X_CHANNEL_DMAEND);
> +
> +#if HOST1X_HW >= 6
> +	host1x_ch_writel(ch, upper_32_bits(start), HOST1X_CHANNEL_DMASTART_HI);
> +	host1x_ch_writel(ch, upper_32_bits(end), HOST1X_CHANNEL_DMAEND_HI);
> +#endif
>   
>   	/* reset GET */
>   	host1x_ch_writel(ch, HOST1X_CHANNEL_DMACTRL_DMASTOP |
> diff --git a/drivers/gpu/host1x/hw/channel_hw.c b/drivers/gpu/host1x/hw/channel_hw.c
> index ff137fe72d34..78fb49539e8c 100644
> --- a/drivers/gpu/host1x/hw/channel_hw.c
> +++ b/drivers/gpu/host1x/hw/channel_hw.c
> @@ -64,11 +64,31 @@ static void submit_gathers(struct host1x_job *job)
>   
>   	for (i = 0; i < job->num_gathers; i++) {
>   		struct host1x_job_gather *g = &job->gathers[i];
> -		u32 op1 = host1x_opcode_gather(g->words);
> -		u32 op2 = g->base + g->offset;
> +		dma_addr_t addr = g->base + g->offset;
> +		u32 op2, op3;
>   
> -		trace_write_gather(cdma, g->bo, g->offset, op1 & 0xffff);
> -		host1x_cdma_push(cdma, op1, op2);
> +		op2 = lower_32_bits(addr);
> +		op3 = upper_32_bits(addr);
> +
> +		trace_write_gather(cdma, g->bo, g->offset, g->words);
> +
> +		if (op3 != 0) {
> +#if HOST1X_HW >= 6
> +			u32 op1 = host1x_opcode_gather_wide(g->words);
> +			u32 op4 = HOST1X_OPCODE_NOP;
> +
> +			host1x_cdma_push(cdma, op1, op2);
> +			host1x_cdma_push(cdma, op3, op4);

This will break if the first push goes as the last slot of the 
ringbuffer and the second push goes as the first slot of the ringbuffer.

Otherwise looks good to me.

> +#else
> +			dev_err(dev, "invalid gather for push buffer %pad\n",
> +				&addr);
> +			continue;
> +#endif
> +		} else {
> +			u32 op1 = host1x_opcode_gather(g->words);
> +
> +			host1x_cdma_push(cdma, op1, op2);
> +		}
>   	}
>   }
>   
> diff --git a/drivers/gpu/host1x/hw/host1x06_hardware.h b/drivers/gpu/host1x/hw/host1x06_hardware.h
> index eab753b91f24..dd37b10c8d04 100644
> --- a/drivers/gpu/host1x/hw/host1x06_hardware.h
> +++ b/drivers/gpu/host1x/hw/host1x06_hardware.h
> @@ -138,6 +138,11 @@ static inline u32 host1x_opcode_gather_incr(unsigned offset, unsigned count)
>   	return (6 << 28) | (offset << 16) | BIT(15) | BIT(14) | count;
>   }
>   
> +static inline u32 host1x_opcode_gather_wide(unsigned count)
> +{
> +	return (12 << 28) | count;
> +}
> +
>   #define HOST1X_OPCODE_NOP host1x_opcode_nonincr(0, 0)
>   
>   #endif
> diff --git a/drivers/gpu/host1x/hw/host1x07_hardware.h b/drivers/gpu/host1x/hw/host1x07_hardware.h
> index a79f57dc87bb..9f6da4ee5443 100644
> --- a/drivers/gpu/host1x/hw/host1x07_hardware.h
> +++ b/drivers/gpu/host1x/hw/host1x07_hardware.h
> @@ -138,6 +138,11 @@ static inline u32 host1x_opcode_gather_incr(unsigned offset, unsigned count)
>   	return (6 << 28) | (offset << 16) | BIT(15) | BIT(14) | count;
>   }
>   
> +static inline u32 host1x_opcode_gather_wide(unsigned count)
> +{
> +	return (12 << 28) | count;
> +}
> +
>   #define HOST1X_OPCODE_NOP host1x_opcode_nonincr(0, 0)
>   
>   #endif
>
Thierry Reding Jan. 25, 2019, 9:20 a.m. UTC | #2
On Fri, Jan 25, 2019 at 11:13:41AM +0200, Mikko Perttunen wrote:
> On 24.1.2019 20.02, Thierry Reding wrote:
> > From: Thierry Reding <treding@nvidia.com>
> > 
> > Tegra186 and later support 40 bits of address space. Additional
> > registers need to be programmed to store the full 40 bits of push
> > buffer addresses.
> > 
> > Since command stream gathers can also reside in buffers in a 40-bit
> > address space, a new variant of the GATHER opcode is also introduced.
> > It takes two parameters: the first parameter contains the lower 32
> > bits of the address and the second parameter contains bits 32 to 39.
> > 
> > Signed-off-by: Thierry Reding <treding@nvidia.com>
> > ---
> >   drivers/gpu/host1x/hw/cdma_hw.c           | 13 ++++++++---
> >   drivers/gpu/host1x/hw/channel_hw.c        | 28 +++++++++++++++++++----
> >   drivers/gpu/host1x/hw/host1x06_hardware.h |  5 ++++
> >   drivers/gpu/host1x/hw/host1x07_hardware.h |  5 ++++
> >   4 files changed, 44 insertions(+), 7 deletions(-)
> > 
> > diff --git a/drivers/gpu/host1x/hw/cdma_hw.c b/drivers/gpu/host1x/hw/cdma_hw.c
> > index ce320534cbed..6d2b7af2af89 100644
> > --- a/drivers/gpu/host1x/hw/cdma_hw.c
> > +++ b/drivers/gpu/host1x/hw/cdma_hw.c
> > @@ -68,20 +68,27 @@ static void cdma_timeout_cpu_incr(struct host1x_cdma *cdma, u32 getptr,
> >   static void cdma_start(struct host1x_cdma *cdma)
> >   {
> >   	struct host1x_channel *ch = cdma_to_channel(cdma);
> > +	u64 start, end;
> >   	if (cdma->running)
> >   		return;
> >   	cdma->last_pos = cdma->push_buffer.pos;
> > +	start = cdma->push_buffer.dma;
> > +	end = cdma->push_buffer.dma + cdma->push_buffer.size + 4;
> >   	host1x_ch_writel(ch, HOST1X_CHANNEL_DMACTRL_DMASTOP,
> >   			 HOST1X_CHANNEL_DMACTRL);
> >   	/* set base, put and end pointer */
> > -	host1x_ch_writel(ch, cdma->push_buffer.dma, HOST1X_CHANNEL_DMASTART);
> > +	host1x_ch_writel(ch, lower_32_bits(start), HOST1X_CHANNEL_DMASTART);
> >   	host1x_ch_writel(ch, cdma->push_buffer.pos, HOST1X_CHANNEL_DMAPUT);
> > -	host1x_ch_writel(ch, cdma->push_buffer.dma + cdma->push_buffer.size + 4,
> > -			 HOST1X_CHANNEL_DMAEND);
> > +	host1x_ch_writel(ch, lower_32_bits(end), HOST1X_CHANNEL_DMAEND);
> > +
> > +#if HOST1X_HW >= 6
> > +	host1x_ch_writel(ch, upper_32_bits(start), HOST1X_CHANNEL_DMASTART_HI);
> > +	host1x_ch_writel(ch, upper_32_bits(end), HOST1X_CHANNEL_DMAEND_HI);
> > +#endif
> >   	/* reset GET */
> >   	host1x_ch_writel(ch, HOST1X_CHANNEL_DMACTRL_DMASTOP |
> > diff --git a/drivers/gpu/host1x/hw/channel_hw.c b/drivers/gpu/host1x/hw/channel_hw.c
> > index ff137fe72d34..78fb49539e8c 100644
> > --- a/drivers/gpu/host1x/hw/channel_hw.c
> > +++ b/drivers/gpu/host1x/hw/channel_hw.c
> > @@ -64,11 +64,31 @@ static void submit_gathers(struct host1x_job *job)
> >   	for (i = 0; i < job->num_gathers; i++) {
> >   		struct host1x_job_gather *g = &job->gathers[i];
> > -		u32 op1 = host1x_opcode_gather(g->words);
> > -		u32 op2 = g->base + g->offset;
> > +		dma_addr_t addr = g->base + g->offset;
> > +		u32 op2, op3;
> > -		trace_write_gather(cdma, g->bo, g->offset, op1 & 0xffff);
> > -		host1x_cdma_push(cdma, op1, op2);
> > +		op2 = lower_32_bits(addr);
> > +		op3 = upper_32_bits(addr);
> > +
> > +		trace_write_gather(cdma, g->bo, g->offset, g->words);
> > +
> > +		if (op3 != 0) {
> > +#if HOST1X_HW >= 6
> > +			u32 op1 = host1x_opcode_gather_wide(g->words);
> > +			u32 op4 = HOST1X_OPCODE_NOP;
> > +
> > +			host1x_cdma_push(cdma, op1, op2);
> > +			host1x_cdma_push(cdma, op3, op4);
> 
> This will break if the first push goes as the last slot of the ringbuffer
> and the second push goes as the first slot of the ringbuffer.
> 
> Otherwise looks good to me.

Why would that break? Isn't the purpose of a ringbuffer to behave as if
it was infinitely sequential? If this really is a problem, how do we fix
it? Would we have to stash NOPs into the pushbuffer until we wrap
around?

Thierry
Mikko Perttunen Jan. 25, 2019, 9:32 a.m. UTC | #3
On 25.1.2019 11.20, Thierry Reding wrote:
> On Fri, Jan 25, 2019 at 11:13:41AM +0200, Mikko Perttunen wrote:
>> On 24.1.2019 20.02, Thierry Reding wrote:
>>> From: Thierry Reding <treding@nvidia.com>
>>>
>>> Tegra186 and later support 40 bits of address space. Additional
>>> registers need to be programmed to store the full 40 bits of push
>>> buffer addresses.
>>>
>>> Since command stream gathers can also reside in buffers in a 40-bit
>>> address space, a new variant of the GATHER opcode is also introduced.
>>> It takes two parameters: the first parameter contains the lower 32
>>> bits of the address and the second parameter contains bits 32 to 39.
>>>
>>> Signed-off-by: Thierry Reding <treding@nvidia.com>
>>> ---
>>>    drivers/gpu/host1x/hw/cdma_hw.c           | 13 ++++++++---
>>>    drivers/gpu/host1x/hw/channel_hw.c        | 28 +++++++++++++++++++----
>>>    drivers/gpu/host1x/hw/host1x06_hardware.h |  5 ++++
>>>    drivers/gpu/host1x/hw/host1x07_hardware.h |  5 ++++
>>>    4 files changed, 44 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/drivers/gpu/host1x/hw/cdma_hw.c b/drivers/gpu/host1x/hw/cdma_hw.c
>>> index ce320534cbed..6d2b7af2af89 100644
>>> --- a/drivers/gpu/host1x/hw/cdma_hw.c
>>> +++ b/drivers/gpu/host1x/hw/cdma_hw.c
>>> @@ -68,20 +68,27 @@ static void cdma_timeout_cpu_incr(struct host1x_cdma *cdma, u32 getptr,
>>>    static void cdma_start(struct host1x_cdma *cdma)
>>>    {
>>>    	struct host1x_channel *ch = cdma_to_channel(cdma);
>>> +	u64 start, end;
>>>    	if (cdma->running)
>>>    		return;
>>>    	cdma->last_pos = cdma->push_buffer.pos;
>>> +	start = cdma->push_buffer.dma;
>>> +	end = cdma->push_buffer.dma + cdma->push_buffer.size + 4;
>>>    	host1x_ch_writel(ch, HOST1X_CHANNEL_DMACTRL_DMASTOP,
>>>    			 HOST1X_CHANNEL_DMACTRL);
>>>    	/* set base, put and end pointer */
>>> -	host1x_ch_writel(ch, cdma->push_buffer.dma, HOST1X_CHANNEL_DMASTART);
>>> +	host1x_ch_writel(ch, lower_32_bits(start), HOST1X_CHANNEL_DMASTART);
>>>    	host1x_ch_writel(ch, cdma->push_buffer.pos, HOST1X_CHANNEL_DMAPUT);
>>> -	host1x_ch_writel(ch, cdma->push_buffer.dma + cdma->push_buffer.size + 4,
>>> -			 HOST1X_CHANNEL_DMAEND);
>>> +	host1x_ch_writel(ch, lower_32_bits(end), HOST1X_CHANNEL_DMAEND);
>>> +
>>> +#if HOST1X_HW >= 6
>>> +	host1x_ch_writel(ch, upper_32_bits(start), HOST1X_CHANNEL_DMASTART_HI);
>>> +	host1x_ch_writel(ch, upper_32_bits(end), HOST1X_CHANNEL_DMAEND_HI);
>>> +#endif
>>>    	/* reset GET */
>>>    	host1x_ch_writel(ch, HOST1X_CHANNEL_DMACTRL_DMASTOP |
>>> diff --git a/drivers/gpu/host1x/hw/channel_hw.c b/drivers/gpu/host1x/hw/channel_hw.c
>>> index ff137fe72d34..78fb49539e8c 100644
>>> --- a/drivers/gpu/host1x/hw/channel_hw.c
>>> +++ b/drivers/gpu/host1x/hw/channel_hw.c
>>> @@ -64,11 +64,31 @@ static void submit_gathers(struct host1x_job *job)
>>>    	for (i = 0; i < job->num_gathers; i++) {
>>>    		struct host1x_job_gather *g = &job->gathers[i];
>>> -		u32 op1 = host1x_opcode_gather(g->words);
>>> -		u32 op2 = g->base + g->offset;
>>> +		dma_addr_t addr = g->base + g->offset;
>>> +		u32 op2, op3;
>>> -		trace_write_gather(cdma, g->bo, g->offset, op1 & 0xffff);
>>> -		host1x_cdma_push(cdma, op1, op2);
>>> +		op2 = lower_32_bits(addr);
>>> +		op3 = upper_32_bits(addr);
>>> +
>>> +		trace_write_gather(cdma, g->bo, g->offset, g->words);
>>> +
>>> +		if (op3 != 0) {
>>> +#if HOST1X_HW >= 6
>>> +			u32 op1 = host1x_opcode_gather_wide(g->words);
>>> +			u32 op4 = HOST1X_OPCODE_NOP;
>>> +
>>> +			host1x_cdma_push(cdma, op1, op2);
>>> +			host1x_cdma_push(cdma, op3, op4);
>>
>> This will break if the first push goes as the last slot of the ringbuffer
>> and the second push goes as the first slot of the ringbuffer.
>>
>> Otherwise looks good to me.
> 
> Why would that break? Isn't the purpose of a ringbuffer to behave as if
> it was infinitely sequential? If this really is a problem, how do we fix
> it? Would we have to stash NOPs into the pushbuffer until we wrap
> around?

That's because it's not actually a ringbuffer. It's actually just a 
buffer with a 'RESTART 0' opcode in the last slot. As such, the GATHER_W 
would incorrectly use the 'RESTART 0' opcode as its third word.

Cheers,
Mikko

> 
> Thierry
>
Mikko Perttunen Jan. 25, 2019, 9:34 a.m. UTC | #4
On 25.1.2019 11.32, Mikko Perttunen wrote:
> 
> 
> On 25.1.2019 11.20, Thierry Reding wrote:
>> On Fri, Jan 25, 2019 at 11:13:41AM +0200, Mikko Perttunen wrote:
>>> On 24.1.2019 20.02, Thierry Reding wrote:
>>>> From: Thierry Reding <treding@nvidia.com>
>>>>
>>>> Tegra186 and later support 40 bits of address space. Additional
>>>> registers need to be programmed to store the full 40 bits of push
>>>> buffer addresses.
>>>>
>>>> Since command stream gathers can also reside in buffers in a 40-bit
>>>> address space, a new variant of the GATHER opcode is also introduced.
>>>> It takes two parameters: the first parameter contains the lower 32
>>>> bits of the address and the second parameter contains bits 32 to 39.
>>>>
>>>> Signed-off-by: Thierry Reding <treding@nvidia.com>
>>>> ---
>>>>    drivers/gpu/host1x/hw/cdma_hw.c           | 13 ++++++++---
>>>>    drivers/gpu/host1x/hw/channel_hw.c        | 28 
>>>> +++++++++++++++++++----
>>>>    drivers/gpu/host1x/hw/host1x06_hardware.h |  5 ++++
>>>>    drivers/gpu/host1x/hw/host1x07_hardware.h |  5 ++++
>>>>    4 files changed, 44 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/host1x/hw/cdma_hw.c 
>>>> b/drivers/gpu/host1x/hw/cdma_hw.c
>>>> index ce320534cbed..6d2b7af2af89 100644
>>>> --- a/drivers/gpu/host1x/hw/cdma_hw.c
>>>> +++ b/drivers/gpu/host1x/hw/cdma_hw.c
>>>> @@ -68,20 +68,27 @@ static void cdma_timeout_cpu_incr(struct 
>>>> host1x_cdma *cdma, u32 getptr,
>>>>    static void cdma_start(struct host1x_cdma *cdma)
>>>>    {
>>>>        struct host1x_channel *ch = cdma_to_channel(cdma);
>>>> +    u64 start, end;
>>>>        if (cdma->running)
>>>>            return;
>>>>        cdma->last_pos = cdma->push_buffer.pos;
>>>> +    start = cdma->push_buffer.dma;
>>>> +    end = cdma->push_buffer.dma + cdma->push_buffer.size + 4;
>>>>        host1x_ch_writel(ch, HOST1X_CHANNEL_DMACTRL_DMASTOP,
>>>>                 HOST1X_CHANNEL_DMACTRL);
>>>>        /* set base, put and end pointer */
>>>> -    host1x_ch_writel(ch, cdma->push_buffer.dma, 
>>>> HOST1X_CHANNEL_DMASTART);
>>>> +    host1x_ch_writel(ch, lower_32_bits(start), 
>>>> HOST1X_CHANNEL_DMASTART);
>>>>        host1x_ch_writel(ch, cdma->push_buffer.pos, 
>>>> HOST1X_CHANNEL_DMAPUT);
>>>> -    host1x_ch_writel(ch, cdma->push_buffer.dma + 
>>>> cdma->push_buffer.size + 4,
>>>> -             HOST1X_CHANNEL_DMAEND);
>>>> +    host1x_ch_writel(ch, lower_32_bits(end), HOST1X_CHANNEL_DMAEND);
>>>> +
>>>> +#if HOST1X_HW >= 6
>>>> +    host1x_ch_writel(ch, upper_32_bits(start), 
>>>> HOST1X_CHANNEL_DMASTART_HI);
>>>> +    host1x_ch_writel(ch, upper_32_bits(end), 
>>>> HOST1X_CHANNEL_DMAEND_HI);
>>>> +#endif
>>>>        /* reset GET */
>>>>        host1x_ch_writel(ch, HOST1X_CHANNEL_DMACTRL_DMASTOP |
>>>> diff --git a/drivers/gpu/host1x/hw/channel_hw.c 
>>>> b/drivers/gpu/host1x/hw/channel_hw.c
>>>> index ff137fe72d34..78fb49539e8c 100644
>>>> --- a/drivers/gpu/host1x/hw/channel_hw.c
>>>> +++ b/drivers/gpu/host1x/hw/channel_hw.c
>>>> @@ -64,11 +64,31 @@ static void submit_gathers(struct host1x_job *job)
>>>>        for (i = 0; i < job->num_gathers; i++) {
>>>>            struct host1x_job_gather *g = &job->gathers[i];
>>>> -        u32 op1 = host1x_opcode_gather(g->words);
>>>> -        u32 op2 = g->base + g->offset;
>>>> +        dma_addr_t addr = g->base + g->offset;
>>>> +        u32 op2, op3;
>>>> -        trace_write_gather(cdma, g->bo, g->offset, op1 & 0xffff);
>>>> -        host1x_cdma_push(cdma, op1, op2);
>>>> +        op2 = lower_32_bits(addr);
>>>> +        op3 = upper_32_bits(addr);
>>>> +
>>>> +        trace_write_gather(cdma, g->bo, g->offset, g->words);
>>>> +
>>>> +        if (op3 != 0) {
>>>> +#if HOST1X_HW >= 6
>>>> +            u32 op1 = host1x_opcode_gather_wide(g->words);
>>>> +            u32 op4 = HOST1X_OPCODE_NOP;
>>>> +
>>>> +            host1x_cdma_push(cdma, op1, op2);
>>>> +            host1x_cdma_push(cdma, op3, op4);
>>>
>>> This will break if the first push goes as the last slot of the 
>>> ringbuffer
>>> and the second push goes as the first slot of the ringbuffer.
>>>
>>> Otherwise looks good to me.
>>
>> Why would that break? Isn't the purpose of a ringbuffer to behave as if
>> it was infinitely sequential? If this really is a problem, how do we fix
>> it? Would we have to stash NOPs into the pushbuffer until we wrap
>> around?
> 
> That's because it's not actually a ringbuffer. It's actually just a 
> buffer with a 'RESTART 0' opcode in the last slot. As such, the GATHER_W 
> would incorrectly use the 'RESTART 0' opcode as its third word.

And yes, detecting the situation and filling with NOP is one solution. 
The reason cdma_push currently takes two words is to guarantee that 
two-word opcodes can always fit in the buffer without splitting.

> 
> Cheers,
> Mikko
> 
>>
>> Thierry
>>
diff mbox series

Patch

diff --git a/drivers/gpu/host1x/hw/cdma_hw.c b/drivers/gpu/host1x/hw/cdma_hw.c
index ce320534cbed..6d2b7af2af89 100644
--- a/drivers/gpu/host1x/hw/cdma_hw.c
+++ b/drivers/gpu/host1x/hw/cdma_hw.c
@@ -68,20 +68,27 @@  static void cdma_timeout_cpu_incr(struct host1x_cdma *cdma, u32 getptr,
 static void cdma_start(struct host1x_cdma *cdma)
 {
 	struct host1x_channel *ch = cdma_to_channel(cdma);
+	u64 start, end;
 
 	if (cdma->running)
 		return;
 
 	cdma->last_pos = cdma->push_buffer.pos;
+	start = cdma->push_buffer.dma;
+	end = cdma->push_buffer.dma + cdma->push_buffer.size + 4;
 
 	host1x_ch_writel(ch, HOST1X_CHANNEL_DMACTRL_DMASTOP,
 			 HOST1X_CHANNEL_DMACTRL);
 
 	/* set base, put and end pointer */
-	host1x_ch_writel(ch, cdma->push_buffer.dma, HOST1X_CHANNEL_DMASTART);
+	host1x_ch_writel(ch, lower_32_bits(start), HOST1X_CHANNEL_DMASTART);
 	host1x_ch_writel(ch, cdma->push_buffer.pos, HOST1X_CHANNEL_DMAPUT);
-	host1x_ch_writel(ch, cdma->push_buffer.dma + cdma->push_buffer.size + 4,
-			 HOST1X_CHANNEL_DMAEND);
+	host1x_ch_writel(ch, lower_32_bits(end), HOST1X_CHANNEL_DMAEND);
+
+#if HOST1X_HW >= 6
+	host1x_ch_writel(ch, upper_32_bits(start), HOST1X_CHANNEL_DMASTART_HI);
+	host1x_ch_writel(ch, upper_32_bits(end), HOST1X_CHANNEL_DMAEND_HI);
+#endif
 
 	/* reset GET */
 	host1x_ch_writel(ch, HOST1X_CHANNEL_DMACTRL_DMASTOP |
diff --git a/drivers/gpu/host1x/hw/channel_hw.c b/drivers/gpu/host1x/hw/channel_hw.c
index ff137fe72d34..78fb49539e8c 100644
--- a/drivers/gpu/host1x/hw/channel_hw.c
+++ b/drivers/gpu/host1x/hw/channel_hw.c
@@ -64,11 +64,31 @@  static void submit_gathers(struct host1x_job *job)
 
 	for (i = 0; i < job->num_gathers; i++) {
 		struct host1x_job_gather *g = &job->gathers[i];
-		u32 op1 = host1x_opcode_gather(g->words);
-		u32 op2 = g->base + g->offset;
+		dma_addr_t addr = g->base + g->offset;
+		u32 op2, op3;
 
-		trace_write_gather(cdma, g->bo, g->offset, op1 & 0xffff);
-		host1x_cdma_push(cdma, op1, op2);
+		op2 = lower_32_bits(addr);
+		op3 = upper_32_bits(addr);
+
+		trace_write_gather(cdma, g->bo, g->offset, g->words);
+
+		if (op3 != 0) {
+#if HOST1X_HW >= 6
+			u32 op1 = host1x_opcode_gather_wide(g->words);
+			u32 op4 = HOST1X_OPCODE_NOP;
+
+			host1x_cdma_push(cdma, op1, op2);
+			host1x_cdma_push(cdma, op3, op4);
+#else
+			dev_err(dev, "invalid gather for push buffer %pad\n",
+				&addr);
+			continue;
+#endif
+		} else {
+			u32 op1 = host1x_opcode_gather(g->words);
+
+			host1x_cdma_push(cdma, op1, op2);
+		}
 	}
 }
 
diff --git a/drivers/gpu/host1x/hw/host1x06_hardware.h b/drivers/gpu/host1x/hw/host1x06_hardware.h
index eab753b91f24..dd37b10c8d04 100644
--- a/drivers/gpu/host1x/hw/host1x06_hardware.h
+++ b/drivers/gpu/host1x/hw/host1x06_hardware.h
@@ -138,6 +138,11 @@  static inline u32 host1x_opcode_gather_incr(unsigned offset, unsigned count)
 	return (6 << 28) | (offset << 16) | BIT(15) | BIT(14) | count;
 }
 
+static inline u32 host1x_opcode_gather_wide(unsigned count)
+{
+	return (12 << 28) | count;
+}
+
 #define HOST1X_OPCODE_NOP host1x_opcode_nonincr(0, 0)
 
 #endif
diff --git a/drivers/gpu/host1x/hw/host1x07_hardware.h b/drivers/gpu/host1x/hw/host1x07_hardware.h
index a79f57dc87bb..9f6da4ee5443 100644
--- a/drivers/gpu/host1x/hw/host1x07_hardware.h
+++ b/drivers/gpu/host1x/hw/host1x07_hardware.h
@@ -138,6 +138,11 @@  static inline u32 host1x_opcode_gather_incr(unsigned offset, unsigned count)
 	return (6 << 28) | (offset << 16) | BIT(15) | BIT(14) | count;
 }
 
+static inline u32 host1x_opcode_gather_wide(unsigned count)
+{
+	return (12 << 28) | count;
+}
+
 #define HOST1X_OPCODE_NOP host1x_opcode_nonincr(0, 0)
 
 #endif