diff mbox

[RFC,v2,8/8] drm: tegra: Add gr2d device

Message ID 1353935954-13763-9-git-send-email-tbergstrom@nvidia.com (mailing list archive)
State New, archived
Headers show

Commit Message

Terje Bergstrom Nov. 26, 2012, 1:19 p.m. UTC
Add client driver for 2D device.

Signed-off-by: Arto Merilainen <amerilainen@nvidia.com>
Signed-off-by: Terje Bergstrom <tbergstrom@nvidia.com>
---
 drivers/gpu/drm/tegra/Makefile |    2 +-
 drivers/gpu/drm/tegra/drm.c    |  231 +++++++++++++++++++++++++++++++++++++++-
 drivers/gpu/drm/tegra/drm.h    |   42 ++++++--
 drivers/gpu/drm/tegra/gr2d.c   |  224 ++++++++++++++++++++++++++++++++++++++
 include/drm/tegra_drm.h        |  129 ++++++++++++++++++++++
 5 files changed, 615 insertions(+), 13 deletions(-)
 create mode 100644 drivers/gpu/drm/tegra/gr2d.c
 create mode 100644 include/drm/tegra_drm.h

Comments

Rob Clark Nov. 26, 2012, 9:59 p.m. UTC | #1
On Mon, Nov 26, 2012 at 7:19 AM, Terje Bergstrom <tbergstrom@nvidia.com> wrote:
>
> +struct tegra_drm_submit_args {
> +       void *context;

Just a quick comment..

You shouldn't really use ptr here, but instead use a 64bit type so
that you don't run into issues later for armv8/64bit.  Same comment
applies in a few other places too.

I'll try and spend a bit more time going through this in more detail
in the coming days

BR,
-R

> +       __u32 num_syncpt_incrs;
> +       __u32 num_cmdbufs;
> +       __u32 num_relocs;
> +       __u32 submit_version;
> +       __u32 num_waitchks;
> +       __u32 waitchk_mask;
> +       __u32 timeout;
> +       struct tegra_drm_syncpt_incrs *syncpt_incrs;
> +       struct tegra_drm_cmdbuf *cmdbufs;
> +       struct tegra_drm_reloc *relocs;
> +       struct tegra_drm_waitchk *waitchks;
> +
> +       __u32 pad[5];           /* future expansion */
> +       __u32 fence;            /* Return value */
> +};
> +
Dave Airlie Nov. 26, 2012, 10:15 p.m. UTC | #2
>  static int tegra_drm_open(struct drm_device *drm, struct drm_file *filp)
>  {
> -       return 0;
> +       struct tegra_drm_fpriv *fpriv;
> +       int err = 0;
> +
> +       fpriv = kzalloc(sizeof(*fpriv), GFP_KERNEL);
> +       if (!fpriv)
> +               return -ENOMEM;
> +
> +       INIT_LIST_HEAD(&fpriv->contexts);
> +       filp->driver_priv = fpriv;
> +

who frees this?
> +struct tegra_drm_syncpt_incr_args {
> +       __u32 id;
> +};

add 32-bits of padding here

> +
> +struct tegra_drm_syncpt_wait_args {
> +       __u32 id;
> +       __u32 thresh;
> +       __s32 timeout;
> +       __u32 value;
> +};
> +
> +#define DRM_TEGRA_NO_TIMEOUT   (-1)
> +
> +struct tegra_drm_open_channel_args {
> +       __u32 class;
> +       void *context;

no pointers use u64, align them to 64-bits, so 32-bits of padding,

> +};
> +
> +struct tegra_drm_get_channel_param_args {
> +       void *context;
> +       __u32 value;

Same padding + uint64_t for void *

> +};
> +
> +struct tegra_drm_syncpt_incr {
> +       __u32 syncpt_id;
> +       __u32 syncpt_incrs;
> +};
> +
> +struct tegra_drm_cmdbuf {
> +       __u32 mem;
> +       __u32 offset;
> +       __u32 words;
> +};

add padding
> +
> +struct tegra_drm_reloc {
> +       __u32 cmdbuf_mem;
> +       __u32 cmdbuf_offset;
> +       __u32 target;
> +       __u32 target_offset;
> +       __u32 shift;
> +};

add padding

> +
> +struct tegra_drm_waitchk {
> +       __u32 mem;
> +       __u32 offset;
> +       __u32 syncpt_id;
> +       __u32 thresh;
> +};
> +
> +struct tegra_drm_submit_args {
> +       void *context;
> +       __u32 num_syncpt_incrs;
> +       __u32 num_cmdbufs;
> +       __u32 num_relocs;
> +       __u32 submit_version;
> +       __u32 num_waitchks;
> +       __u32 waitchk_mask;
> +       __u32 timeout;
> +       struct tegra_drm_syncpt_incrs *syncpt_incrs;
> +       struct tegra_drm_cmdbuf *cmdbufs;
> +       struct tegra_drm_reloc *relocs;
> +       struct tegra_drm_waitchk *waitchks;
> +
> +       __u32 pad[5];           /* future expansion */
> +       __u32 fence;            /* Return value */
> +};

lose all the pointers for 64-bit aligned uint64_t.

Probably should align all of these on __u64 and __u32 usage if possible.

i'll look at the rest of the patches, but I need to know what commands
can be submitted via this interface and what are the security
implications of it.

Dave.
Terje Bergstrom Nov. 27, 2012, 6:52 a.m. UTC | #3
On 27.11.2012 00:15, Dave Airlie wrote:
>>  static int tegra_drm_open(struct drm_device *drm, struct drm_file *filp)
>>  {
>> -       return 0;
>> +       struct tegra_drm_fpriv *fpriv;
>> +       int err = 0;
>> +
>> +       fpriv = kzalloc(sizeof(*fpriv), GFP_KERNEL);
>> +       if (!fpriv)
>> +               return -ENOMEM;
>> +
>> +       INIT_LIST_HEAD(&fpriv->contexts);
>> +       filp->driver_priv = fpriv;
>> +
> 
> who frees this?

Probably nobody. Will fix.

>> +struct tegra_drm_syncpt_incr_args {
>> +       __u32 id;
>> +};
> 
> add 32-bits of padding here
> 
>> +
>> +struct tegra_drm_syncpt_wait_args {
>> +       __u32 id;
>> +       __u32 thresh;
>> +       __s32 timeout;
>> +       __u32 value;
>> +};
>> +
>> +#define DRM_TEGRA_NO_TIMEOUT   (-1)
>> +
>> +struct tegra_drm_open_channel_args {
>> +       __u32 class;
>> +       void *context;
> 
> no pointers use u64, align them to 64-bits, so 32-bits of padding,

I'll add the paddings and change pointers to u64's to all of the structs
in this file.

> i'll look at the rest of the patches, but I need to know what commands
> can be submitted via this interface and what are the security
> implications of it.

All of the commands are memory operations (copy, clear, rotate, etc)
involving either one or two memory regions that are defined via dmabuf
fd's and offsets. The commands can also contain plain device virtual
addresses and 2D would be happy to oblige as long as the memory is
mapped to it.

There are a few ways to help the situation. None of them are perfect.

On Tegra30 we could allocate an address space per process. This would
mean that max 3 processes would be able to use the 2D unit at one time,
assuming that other devices are fine using the one remaining address
space. On Tegra20 this is not an option.

Second would be using device permissions - only allow selected processes
to access 2D.

Third would be having a firewall in 2D driver checking the stream and
ensuring all registers that accept addresses are written by values
derived from dmabufs. I haven't tried implementing this, but it'd
involve a lookup table in kernel and CPU reading through the command
stream. Offsets and sizes would also need to be validated. There would
be a performance hit.

Fourth would be to move the creation of streams to kernel space. That'd
mean moving the whole 2D driver and host1x command stream handling to
kernel space and quite a lot of time spent in kernel. I'm not too keen
on this for obvious reasons.

Other ideas are obviously welcome.

Thanks!

Terje
Dave Airlie Nov. 27, 2012, 7:33 a.m. UTC | #4
>
> Third would be having a firewall in 2D driver checking the stream and
> ensuring all registers that accept addresses are written by values
> derived from dmabufs. I haven't tried implementing this, but it'd
> involve a lookup table in kernel and CPU reading through the command
> stream. Offsets and sizes would also need to be validated. There would
> be a performance hit.

This is the standard mechanism, and what exynos does as well.

The per process VM method is also used as an extension to this on some hw.

Dave.
Terje Bergstrom Nov. 27, 2012, 8:16 a.m. UTC | #5
On 27.11.2012 09:33, Dave Airlie wrote:
>> Third would be having a firewall in 2D driver checking the stream and
>> ensuring all registers that accept addresses are written by values
>> derived from dmabufs. I haven't tried implementing this, but it'd
>> involve a lookup table in kernel and CPU reading through the command
>> stream. Offsets and sizes would also need to be validated. There would
>> be a performance hit.
> 
> This is the standard mechanism, and what exynos does as well.
> 
> The per process VM method is also used as an extension to this on some hw.

Hi,

Thanks for the pointer, I looked at exynos code. It indeed checks the
registers written to, but it doesn't prevent overrun by checking sizes
of buffers and compare against requests.

Based on my experience with Tegra graphics stack, going through command
streams in kernel is bad for performance. For 2D operations this is
probably ok as the command streams are pretty simple. Anything more
complex is going to cause severe degradation of performance, but it's
also outside the scope of this patch set.

If this is the way to go, I'll put the firewall behind a Kconfig flag so
that system integrator can decide if his system needs it.

Terje
Dave Airlie Nov. 27, 2012, 8:32 a.m. UTC | #6
On Tue, Nov 27, 2012 at 8:16 AM, Terje Bergström <tbergstrom@nvidia.com> wrote:
> On 27.11.2012 09:33, Dave Airlie wrote:
>>> Third would be having a firewall in 2D driver checking the stream and
>>> ensuring all registers that accept addresses are written by values
>>> derived from dmabufs. I haven't tried implementing this, but it'd
>>> involve a lookup table in kernel and CPU reading through the command
>>> stream. Offsets and sizes would also need to be validated. There would
>>> be a performance hit.
>>
>> This is the standard mechanism, and what exynos does as well.
>>
>> The per process VM method is also used as an extension to this on some hw.
>
> Hi,
>
> Thanks for the pointer, I looked at exynos code. It indeed checks the
> registers written to, but it doesn't prevent overrun by checking sizes
> of buffers and compare against requests.

They probably need to add that, its not as important as the base
addresses, unless it takes negative strides, generally base addresses
means you can target current->uid quite easily!

> Based on my experience with Tegra graphics stack, going through command
> streams in kernel is bad for performance. For 2D operations this is
> probably ok as the command streams are pretty simple. Anything more
> complex is going to cause severe degradation of performance, but it's
> also outside the scope of this patch set.
>
> If this is the way to go, I'll put the firewall behind a Kconfig flag so
> that system integrator can decide if his system needs it.

We don't generally make security like this optional :-)

If you do that you should restrict the drm device to root users only,
and never let a user with a browser anywhere near it.

Like I know what you guys get away with in closed source world, but
here we don't write root holes into our driver deliberately.

Dave.
Terje Bergstrom Nov. 27, 2012, 8:45 a.m. UTC | #7
On 27.11.2012 10:32, Dave Airlie wrote:
> On Tue, Nov 27, 2012 at 8:16 AM, Terje Bergström <tbergstrom@nvidia.com> wrote:
>> Thanks for the pointer, I looked at exynos code. It indeed checks the
>> registers written to, but it doesn't prevent overrun by checking sizes
>> of buffers and compare against requests.
> They probably need to add that, its not as important as the base
> addresses, unless it takes negative strides, generally base addresses
> means you can target current->uid quite easily!

Ok. We'll implement the firewall, unless we come up with even a better
choice.

>> If this is the way to go, I'll put the firewall behind a Kconfig flag so
>> that system integrator can decide if his system needs it.
> We don't generally make security like this optional :-)
> 
> If you do that you should restrict the drm device to root users only,
> and never let a user with a browser anywhere near it.

My thinking was that the system integrator can decide how much he trusts
the binaries (incl. browser plugins) in the system. If he trusts the
binaries, the firewall can be turned off.

> Like I know what you guys get away with in closed source world, but
> here we don't write root holes into our driver deliberately.

Duly noted. :-)

Terje
Lucas Stach Nov. 27, 2012, 10:22 a.m. UTC | #8
Am Dienstag, den 27.11.2012, 10:45 +0200 schrieb Terje Bergström:
> On 27.11.2012 10:32, Dave Airlie wrote:
> > On Tue, Nov 27, 2012 at 8:16 AM, Terje Bergström <tbergstrom@nvidia.com> wrote:
> >> Thanks for the pointer, I looked at exynos code. It indeed checks the
> >> registers written to, but it doesn't prevent overrun by checking sizes
> >> of buffers and compare against requests.
> > They probably need to add that, its not as important as the base
> > addresses, unless it takes negative strides, generally base addresses
> > means you can target current->uid quite easily!
> 
> Ok. We'll implement the firewall, unless we come up with even a better
> choice.
> 
> >> If this is the way to go, I'll put the firewall behind a Kconfig flag so
> >> that system integrator can decide if his system needs it.
> > We don't generally make security like this optional :-)
> > 
> > If you do that you should restrict the drm device to root users only,
> > and never let a user with a browser anywhere near it.
> 
Personally I would never trust any binary, but that's just my personal
opinion.

But I'm in favour of having the command stream checking optional, simply
backed by the fact that we are likely to use the same 2D driver
infrastructure for Tegra 2 and 3. On Tegra 3 we can most likely go
without in-depth command stream checking as the graphics core there sits
behind the IOMMU, which can provide an appropriate level of security.

Regards,
Lucas
Thierry Reding Nov. 27, 2012, 10:37 a.m. UTC | #9
On Tue, Nov 27, 2012 at 11:22:56AM +0100, Lucas Stach wrote:
> Am Dienstag, den 27.11.2012, 10:45 +0200 schrieb Terje Bergström:
> > On 27.11.2012 10:32, Dave Airlie wrote:
> > > On Tue, Nov 27, 2012 at 8:16 AM, Terje Bergström <tbergstrom@nvidia.com> wrote:
> > >> Thanks for the pointer, I looked at exynos code. It indeed checks the
> > >> registers written to, but it doesn't prevent overrun by checking sizes
> > >> of buffers and compare against requests.
> > > They probably need to add that, its not as important as the base
> > > addresses, unless it takes negative strides, generally base addresses
> > > means you can target current->uid quite easily!
> > 
> > Ok. We'll implement the firewall, unless we come up with even a better
> > choice.
> > 
> > >> If this is the way to go, I'll put the firewall behind a Kconfig flag so
> > >> that system integrator can decide if his system needs it.
> > > We don't generally make security like this optional :-)
> > > 
> > > If you do that you should restrict the drm device to root users only,
> > > and never let a user with a browser anywhere near it.
> > 
> Personally I would never trust any binary, but that's just my personal
> opinion.
> 
> But I'm in favour of having the command stream checking optional, simply
> backed by the fact that we are likely to use the same 2D driver
> infrastructure for Tegra 2 and 3. On Tegra 3 we can most likely go
> without in-depth command stream checking as the graphics core there sits
> behind the IOMMU, which can provide an appropriate level of security.

But in that case it should be made mandatory at first until proper IOMMU
support is enabled on Tegra30. Then it can be checked at driver probe
time whether or not to enable the extra checks. That way we don't need a
special Kconfig option and we still get all the security that we need,
right?

Thierry
Terje Bergstrom Nov. 27, 2012, 11:31 a.m. UTC | #10
On 27.11.2012 12:37, Thierry Reding wrote:
> But in that case it should be made mandatory at first until proper IOMMU
> support is enabled on Tegra30. Then it can be checked at driver probe
> time whether or not to enable the extra checks. That way we don't need a
> special Kconfig option and we still get all the security that we need,
> right?

I guess it depends on the level of security.

If we want to only protect kernel and user space memory, this would be
sufficient and no firewall is needed if IOMMU is turned on.

If we want to protect 2D buffers from each other, this is not sufficient.

Terje
Lucas Stach Nov. 27, 2012, 11:47 a.m. UTC | #11
Am Dienstag, den 27.11.2012, 13:31 +0200 schrieb Terje Bergström:
> On 27.11.2012 12:37, Thierry Reding wrote:
> > But in that case it should be made mandatory at first until proper IOMMU
> > support is enabled on Tegra30. Then it can be checked at driver probe
> > time whether or not to enable the extra checks. That way we don't need a
> > special Kconfig option and we still get all the security that we need,
> > right?
> 
> I guess it depends on the level of security.
> 
> If we want to only protect kernel and user space memory, this would be
> sufficient and no firewall is needed if IOMMU is turned on.
> 
> If we want to protect 2D buffers from each other, this is not sufficient.
> 
I guess we could change IOMMU address spaces for the graphics units
depending on the active channel. This would still be a bit of a
performance hit, because of the necessary TLB flushing and so on, but
should be much better than checking the whole command stream. This way
you at least get security on a process level, as no process is able to
corrupt another process's graphics resources.

This is the same level of security as provided by the nouveau driver.
But to do so all memory management has to be done in kernel and from the
current submissions of the 2D infrastructure I fear that the current
architecture does too much of that in userspace, but I'll hold back with
any judgement until we actually get to see the userspace parts.

Also to implement this strategy you have to take ownership of the
graphics address space on a much lower level than the DMA API. This
might take some work together with the IOMMU guys.

Regards,
Lucas
Terje Bergstrom Nov. 27, 2012, 12:59 p.m. UTC | #12
On 27.11.2012 13:47, Lucas Stach wrote:
> I guess we could change IOMMU address spaces for the graphics units
> depending on the active channel. This would still be a bit of a
> performance hit, because of the necessary TLB flushing and so on, but
> should be much better than checking the whole command stream. This way
> you at least get security on a process level, as no process is able to
> corrupt another process's graphics resources.

One physical channel is shared with all users of the 2D unit. Each job
is just added to the queue, and host1x will happily cross from one job
to the next without intervention from CPU. This is done to keep CPU
overhead down to improve power and performance.

This also means that we cannot change the IOMMU settings between jobs
from different processes, unless we pause the channel after every job.

This is still an interesting thought - can we postpone binding of a
buffer to address space until submit time, and give each process its own
address space? We would have a limit of "submits from three processes
going on at once" instead of "three processes can open 2D channel at
once". That's a limitation we could live with.

Naturally, Tegra2 is still left in the cold.

> This is the same level of security as provided by the nouveau driver.
> But to do so all memory management has to be done in kernel and from the
> current submissions of the 2D infrastructure I fear that the current
> architecture does too much of that in userspace, but I'll hold back with
> any judgement until we actually get to see the userspace parts.

User space allocates buffer, exports as dmabuf fd, and passes the fd in
submits to kernel, and frees the buffer. No other memory management is
done in user space.

> Also to implement this strategy you have to take ownership of the
> graphics address space on a much lower level than the DMA API. This
> might take some work together with the IOMMU guys.

I'll go through this with Hiroshi, who knows that area.

Terje
Dave Airlie Nov. 27, 2012, 11 p.m. UTC | #13
On Tue, Nov 27, 2012 at 9:31 PM, Terje Bergström <tbergstrom@nvidia.com> wrote:
> On 27.11.2012 12:37, Thierry Reding wrote:
>> But in that case it should be made mandatory at first until proper IOMMU
>> support is enabled on Tegra30. Then it can be checked at driver probe
>> time whether or not to enable the extra checks. That way we don't need a
>> special Kconfig option and we still get all the security that we need,
>> right?
>
> I guess it depends on the level of security.
>
> If we want to only protect kernel and user space memory, this would be
> sufficient and no firewall is needed if IOMMU is turned on.
>
> If we want to protect 2D buffers from each other, this is not sufficient.

We  generally aim for the first, to stop the gpu from reading/writing
any memory it hasn't been granted access to,
the second is nice to have though, but really requires a GPU with VM
to implement properly.

Dave.
Terje Bergstrom Nov. 28, 2012, 1:17 p.m. UTC | #14
On 28.11.2012 01:00, Dave Airlie wrote:
> We  generally aim for the first, to stop the gpu from reading/writing
> any memory it hasn't been granted access to,
> the second is nice to have though, but really requires a GPU with VM
> to implement properly.

I wonder if we should aim at root only access on Tegra20, and force
IOMMU on Tegra30 and fix the remaining issues we have with IOMMU. The
firewall turns out to be more complicated than I wished.

Biggest problem is that we aim at zero-copy for everything possible,
including command streams. Kernel gets a handle to a command stream, but
the command stream is allocated by the user space process. So the user
space can tamper with the stream once it's been written to the host1x 2D
channel.

Copying with firewall is one option, but that would again kill the
performance. One option would be user space unmapping the command buffer
when it's sent to kernel, and kernel checking that it's unmapped before
it agrees to send the stream to hardware.

On Tegra30 with IOMMU turned on things are ok without any checks,
because all access would go via MMU, which makes kernel memory inaccessible.

Of course, better ideas are welcome.

Terje
Lucas Stach Nov. 28, 2012, 1:33 p.m. UTC | #15
Am Mittwoch, den 28.11.2012, 15:17 +0200 schrieb Terje Bergström:
> On 28.11.2012 01:00, Dave Airlie wrote:
> > We  generally aim for the first, to stop the gpu from reading/writing
> > any memory it hasn't been granted access to,
> > the second is nice to have though, but really requires a GPU with VM
> > to implement properly.
> 
> I wonder if we should aim at root only access on Tegra20, and force
> IOMMU on Tegra30 and fix the remaining issues we have with IOMMU. The
> firewall turns out to be more complicated than I wished.
> 
> Biggest problem is that we aim at zero-copy for everything possible,
> including command streams. Kernel gets a handle to a command stream, but
> the command stream is allocated by the user space process. So the user
> space can tamper with the stream once it's been written to the host1x 2D
> channel.
> 
So this is obviously wrong. Userspace has to allocate a pushbuffer from
the kernel just as every other buffer, then map it into its own address
space to push in commands. At submit time of the pushbuf kernel has to
make sure that userspace is not able to access the memory any more, i.e.
kernel shoots down the vma or pagetable of the vma. To keep overhead low
and not do any blocking you can just keep some pushbufs around for one
channel and switch over the pagetable entries to the next free buffer,
just make sure that userspace is never able to tamper with a buffer as
long as the gpu isn't done with it.

Regards,
Lucas
Terje Bergstrom Nov. 28, 2012, 1:57 p.m. UTC | #16
On 28.11.2012 15:33, Lucas Stach wrote:
> So this is obviously wrong. Userspace has to allocate a pushbuffer from
> the kernel just as every other buffer, then map it into it's own address
> space to push in commands. At submit time of the pushbuf kernel has to
> make sure that userspace is not able to access the memory any more, i.e.
> kernel shoots down the vma or pagetable of the vma. To keep overhead low
> and not do any blocking you can just keep some pushbufs around for one
> channel and switch over the pagetable entries to the next free buffer,
> just make sure that userspace is never able to tamper with a buffer as
> long as the gpu isn't done with it.

That's really not something dma-buf APIs are equipped to handle. We need
something to ensure user space doesn't have the buffer mapped (either
return an error if it has, or zap the mapping), something to ensure user space
cannot mmap the buffer, and something to revert this all once we're done.

We could add these as special ops to tegradrm dmabuf code for now, and
assume that command streams are always allocated by tegradrm. Now we
allow any dmabuf to be used as buffers for command streams.

And, with IOMMU I don't think we would need any of this. I guess we need
to press the gas pedal on figuring out how to enable that for tegradrm
on Tegra30.

We already allocate multiple buffers to be able to fill in the next
buffer once we've sent one to the kernel, so that part is ok. We reuse only
once we know that the operations contained are done.

Terje
Lucas Stach Nov. 28, 2012, 2:06 p.m. UTC | #17
Am Mittwoch, den 28.11.2012, 15:57 +0200 schrieb Terje Bergström:
> On 28.11.2012 15:33, Lucas Stach wrote:
> > So this is obviously wrong. Userspace has to allocate a pushbuffer from
> > the kernel just as every other buffer, then map it into its own address
> > space to push in commands. At submit time of the pushbuf kernel has to
> > make sure that userspace is not able to access the memory any more, i.e.
> > kernel shoots down the vma or pagetable of the vma. To keep overhead low
> > and not do any blocking you can just keep some pushbufs around for one
> > channel and switch over the pagetable entries to the next free buffer,
> > just make sure that userspace is never able to tamper with a buffer as
> > long as the gpu isn't done with it.
> 
> That's really not something dma-buf APIs are equipped to handle. We need
> something to ensure user space doesn't have the buffer mapped (either
> return error if has, or zap the mapping), something to ensure user space
> cannot mmap the buffer, and something to revert this all once we're done.
> 
> We could add these as special ops to tegradrm dmabuf code for now, and
> assume that command streams are always allocated by tegradrm. Now we
> allow any dmabuf to be used as buffers for command streams.
> 
Why do even need/use dma-buf for this use case? This is all one DRM
device, even if we separate host1x and gr2d as implementation modules.

So standard way of doing this is:
1. create gem object for pushbuffer
2. create fake mmap offset for gem obj
3. map pushbuf using the fake offset on the drm device
4. at submit time zap the mapping

You need this logic anyway, as normally we don't rely on userspace to
sync gpu and cpu, but use the kernel to handle the concurrency issues.

Regards,
Lucas
Terje Bergstrom Nov. 28, 2012, 2:45 p.m. UTC | #18
On 28.11.2012 16:06, Lucas Stach wrote:
> Why do even need/use dma-buf for this use case? This is all one DRM
> device, even if we separate host1x and gr2d as implementation modules.

I didn't want to implement dependency to drm gem objects in nvhost, but
we have thought about doing that. dma-buf brings quite a lot of
overhead, so implementing support for gem buffers would make the
sequence a bit leaner.

nvhost already has infra to support multiple memory managers.

> So standard way of doing this is:
> 1. create gem object for pushbuffer
> 2. create fake mmap offset for gem obj
> 3. map pushbuf using the fake offset on the drm device
> 4. at submit time zap the mapping
> 
> You need this logic anyway, as normally we don't rely on userspace to
> sync gpu and cpu, but use the kernel to handle the concurrency issues.

Taking a step back - 2D streams are actually very short, in the order of
<100 bytes. Just copying them to kernel space would actually be faster
than doing MMU operations.

I think for Tegra20 and non-IOMMU case, we just need to copy the command
stream to kernel buffer. In Tegra30 IOMMU case reference to user space
buffers are fine, as tampering the streams doesn't have any ill effects.

Terje
Lucas Stach Nov. 28, 2012, 3:13 p.m. UTC | #19
Am Mittwoch, den 28.11.2012, 16:45 +0200 schrieb Terje Bergström:
> On 28.11.2012 16:06, Lucas Stach wrote:
> > Why do even need/use dma-buf for this use case? This is all one DRM
> > device, even if we separate host1x and gr2d as implementation modules.
> 
> I didn't want to implement dependency to drm gem objects in nvhost, but
> we have thought about doing that. dma-buf brings quite a lot of
> overhead, so implementing support for gem buffers would make the
> sequence a bit leaner.
> 
> nvhost already has infra to support multiple memory managers.
> 
To be honest I still don't grok all of this, but nonetheless I try my
best.

Anyway, shouldn't nvhost be something like an allocator used by host1x
clients? With the added ability to do relocs/binding of buffers into
client address spaces, refcounting buffers and import/export dma-bufs?
In this case nvhost objects would just be used to back DRM GEM objects.
If using GEM objects in the DRM driver introduces any cross dependencies
with nvhost, you should take a step back and ask yourself if the current
design is the right way to go.

> > So standard way of doing this is:
> > 1. create gem object for pushbuffer
> > 2. create fake mmap offset for gem obj
> > 3. map pushbuf using the fake offset on the drm device
> > 4. at submit time zap the mapping
> > 
> > You need this logic anyway, as normally we don't rely on userspace to
> > sync gpu and cpu, but use the kernel to handle the concurrency issues.
> 
> Taking a step back - 2D streams are actually very short, in the order of
> <100 bytes. Just copying them to kernel space would actually be faster
> than doing MMU operations.
> 
Is this always the case because of the limited abilities of the gr2d
engine, or is it just your current driver flushing the stream very
often?

> I think for Tegra20 and non-IOMMU case, we just need to copy the command
> stream to kernel buffer. In Tegra30 IOMMU case reference to user space
> buffers are fine, as tampering the streams doesn't have any ill effects.
> 
In which way is it a good design choice to let the CPU happily alter
_any_ buffer the GPU is busy processing without getting the concurrency
right?

Please keep in mind that the interfaces you are now trying to introduce
have to be supported for virtually unlimited time. You might not be able
to scrub your mistakes later on without going through a lot of hassles.

To avoid a lot of those mistakes it might be a good idea to look at how
other drivers use the DRM infrastructure and only part from those proven
schemes where really necessary/worthwhile.

Regards,
Lucas
Terje Bergstrom Nov. 28, 2012, 4:23 p.m. UTC | #20
On 28.11.2012 17:13, Lucas Stach wrote:
> To be honest I still don't grok all of this, but nonetheless I try my
> best.

Sorry. I promised in another thread a write-up explaining the design. I
still owe you guys that.

> Anyway, shouldn't nvhost be something like an allocator used by host1x
> clients? With the added ability to do relocs/binding of buffers into
> client address spaces, refcounting buffers and import/export dma-bufs?
> In this case nvhost objects would just be used to back DRM GEM objects.
> If using GEM objects in the DRM driver introduces any cross dependencies
> with nvhost, you should take a step back and ask yourself if the current
> design is the right way to go.

tegradrm has the GEM allocator, and tegradrm contains the 2D kernel
interface. tegradrm contains a dma-buf exporter for the tegradrm GEM
objects.

nvhost accepts jobs from tegradrm's 2D driver. nvhost increments
refcounts and maps the command stream and target memories to devices,
maps the command streams to kernel memory, replaces the placeholders in
command streams with addresses with device virtual addresses, and unmaps
the buffer from kernel memory. nvhost uses dma buf APIs for all of the
memory operations, and relies on dmabuf for refcounting. After all this
the command streams are pushed to host1x push buffer as GATHER (kind of
a "gosub") opcodes, which reference to the command streams.

Once the job is done, nvhost decrements refcounts and updates pushbuffer
pointers.

The design is done so that nvhost won't be DRM specific. I want to
enable creating V4L2 etc interfaces that talk to other host1x clients.
V4L2 (yeah, I know nothing of V4L2) could pass frames via nvhost to EPP
for pixel format conversion or 2D for rotation and write result to frame
buffer.

Do you think there's some fundamental problem with this design?

>> Taking a step back - 2D streams are actually very short, in the order of
>> <100 bytes. Just copying them to kernel space would actually be faster
>> than doing MMU operations.
>>
> Is this always the case because of the limited abilities of the gr2d
> engine, or is it just your current driver flushing the stream very
> often?

It's because of limited abilities of the hardware. It just doesn't take
that many operations to invoke 2D.

The libdrm user space we've created flushes probably a bit too often
now, but even downstream the streams are not much longer. It will still
take at least a week to get the user space code out for you to look at.

> In which way is it a good design choice to let the CPU happily alter
> _any_ buffer the GPU is busy processing without getting the concurrency
> right?

Concurrency is handled with sync points. User space will know when a
command stream is processed and can be reused by comparing the current
sync point value, and the fence that 2D driver returned to user space.
User space can have a pool of buffers and can recycle when it knows it
can do so. But, this is not enforced by kernel.

The difference with your proposal and what I posted is the level of
control user space has over its command stream management. But as said,
2D streams are so short that my guess is that there's not too much
penalty copying it to kernel managed host1x push buffer directly instead
of inserting a GATHER reference.

> Please keep in mind that the interfaces you are now trying to introduce
> have to be supported for virtually unlimited time. You might not be able
> to scrub your mistakes later on without going through a lot of hassles.
> 
> To avoid a lot of those mistakes it might be a good idea to look at how
> other drivers use the DRM infrastructure and only part from those proven
> schemes where really necessary/worthwhile.

Yep, as the owner of this driver downstream, I'm also leveraging my
experience with the graphics stack in our downstream software stack that
is accessible via f.ex. L4T.

This is exactly the discussion we should be having, and I'm learning all
the time, so let's continue tossing around ideas until we're both happy
with the result.

Terje
Stephen Warren Nov. 28, 2012, 4:24 p.m. UTC | #21
On 11/28/2012 07:45 AM, Terje Bergström wrote:
> On 28.11.2012 16:06, Lucas Stach wrote:
>> Why do even need/use dma-buf for this use case? This is all one DRM
>> device, even if we separate host1x and gr2d as implementation modules.
> 
> I didn't want to implement dependency to drm gem objects in nvhost, but
> we have thought about doing that. dma-buf brings quite a lot of
> overhead, so implementing support for gem buffers would make the
> sequence a bit leaner.
> 
> nvhost already has infra to support multiple memory managers.
> 
>> So standard way of doing this is:
>> 1. create gem object for pushbuffer
>> 2. create fake mmap offset for gem obj
>> 3. map pushbuf using the fake offset on the drm device
>> 4. at submit time zap the mapping
>>
>> You need this logic anyway, as normally we don't rely on userspace to
>> sync gpu and cpu, but use the kernel to handle the concurrency issues.
> 
> Taking a step back - 2D streams are actually very short, in the order of
> <100 bytes. Just copying them to kernel space would actually be faster
> than doing MMU operations.

I'm not sure it's a good idea to have one buffer submission mechanism
for the 2D class and another for the 3D/... class, nor to bet that 2D
streams will always be short.
Lucas Stach Nov. 28, 2012, 6:46 p.m. UTC | #22
Am Mittwoch, den 28.11.2012, 18:23 +0200 schrieb Terje Bergström:
> On 28.11.2012 17:13, Lucas Stach wrote:
> > To be honest I still don't grok all of this, but nonetheless I try my
> > best.
> 
> Sorry. I promised in another thread a write-up explaining the design. I
> still owe you guys that.
> 
That would be really nice to have. I'm also particularly interested in
how you plan to do synchronization of command streams to different
engines working together, if that's not too much to ask for now. Like
userspace uploading a texture in a buffer, 2D engine doing mipmap
generation, 3D engine using mipmapped texture.

> > Anyway, shouldn't nvhost be something like an allocator used by host1x
> > clients? With the added ability to do relocs/binding of buffers into
> > client address spaces, refcounting buffers and import/export dma-bufs?
> > In this case nvhost objects would just be used to back DRM GEM objects.
> > If using GEM objects in the DRM driver introduces any cross dependencies
> > with nvhost, you should take a step back and ask yourself if the current
> > design is the right way to go.
> 
> tegradrm has the GEM allocator, and tegradrm contains the 2D kernel
> interface. tegradrm contains a dma-buf exporter for the tegradrm GEM
> objects.
> 
> nvhost accepts jobs from tegradrm's 2D driver. nvhost increments
> refcounts and maps the command stream and target memories to devices,
> maps the command streams to kernel memory, replaces the placeholders in
> command streams with addresses with device virtual addresses, and unmaps
> the buffer from kernel memory. nvhost uses dma buf APIs for all of the
> memory operations, and relies on dmabuf for refcounting. After all this
> the command streams are pushed to host1x push buffer as GATHER (kind of
> a "gosub") opcodes, which reference to the command streams.
> 
> Once the job is done, nvhost decrements refcounts and updates pushbuffer
> pointers.
> 
> The design is done so that nvhost won't be DRM specific. I want to
> enable creating V4L2 etc interfaces that talk to other host1x clients.
> V4L2 (yeah, I know nothing of V4L2) could pass frames via nvhost to EPP
> for pixel format conversion or 2D for rotation and write result to frame
> buffer.
> 
> Do you think there's some fundamental problem with this design?
> 
Ah yes I see. So if we consider nvhost to be the central entity in
charge of controlling all host1x clients and tegradrm as the interface
that happens to bundle display, 2d and 3d engine functionality into it's
interface we should probably aim for two things:
1. Move everything needed by all engines down into nvhost (I especially
see the allocator falling under this point, I'll explain why this would
be beneficial a bit later)

2. Move the exposed DRM interface more in line with other DRM drivers.
Please take a look at how for example the GEM_EXECBUF ioctl works on
other drivers to get a feeling of what I'm talking about. Everything
using the display, 2D and maybe later on the 3D engine should only deal
with GEM handles. I really don't like the idea of having a single
userspace application, which uses engines with similar and known
requirements (DDX) dealing with dma-buf handles or other similar high
overhead stuff to do the most basic tasks. 

If we move down the allocator into nvhost we can use buffers allocated
from this to back GEM or V4L2 buffers transparently. The ioctl to
allocate a GEM buffer shouldn't do much more than wrapping the nvhost
buffer.
This may also solve your problem with having multiple mappings of the
same buffer into the very same address space, as nvhost is the single
instance that manages all host1x client address spaces. If the buffer is
originating from there you can easily check if it's already mapped. For
Tegra 3 to do things in an efficient way we likely have to move away
from dealing with the DMA API to dealing with the IOMMU API, this gets a
_lot_ easier if you have a single point where you manage memory
allocation and address space.

dma-buf should only be used where userspace is dealing with devices that
get controlled by different interfaces, like pointing a display plane to
some camera buffer. And even then with a single allocator for the host1x
clients an dma-buf import is nothing more than realizing that the fd is
one of the fds you exported yourself, so you can go and look it up and
then depending on the device you are on just pointing the engine at the
memory location or fixing up the iommu mapping.

> >> Taking a step back - 2D streams are actually very short, in the order of
> >> <100 bytes. Just copying them to kernel space would actually be faster
> >> than doing MMU operations.
> >>
> > Is this always the case because of the limited abilities of the gr2d
> > engine, or is it just your current driver flushing the stream very
> > often?
> 
> It's because of limited abilities of the hardware. It just doesn't take
> that many operations to invoke 2D.
> 
> The libdrm user space we're created flushes probably a bit too often
> now, but even in downstream the streams are not much longer.  It takes
> still at least a week to get the user space code out for you to look at.
> 
That's no problem, as I may not be able to do in-depth reviews of any
code until next week.

> > In which way is it a good design choice to let the CPU happily alter
> > _any_ buffer the GPU is busy processing without getting the concurrency
> > right?
> 
> Concurrency is handled with sync points. User space will know when a
> command stream is processed and can be reused by comparing the current
> sync point value, and the fence that 2D driver returned to user space.
> User space can have a pool of buffers and can recycle when it knows it
> can do so. But, this is not enforced by kernel.
> 
This is the point where we have differ: You have to deal with syncpts in
kernel anyway, otherwise you don't know when it's safe to destroy a
buffer. And no, userspace should not have the ability to destroy a
buffer itself, userspace should always just be able to free it's
reference to the buffer. Remember: never trust the userspace. And if you
are dealing with syncpts in kernel anyway, you can just go ahead and
enforce some sane concurrency rules. There may be some corner cases
related to userspace suballocating a kernel buffer, which might need
some more thought still, but that's not a valid excuse to not do any
concurrency validation in kernel.

> The difference with your proposal and what I posted is the level of
> control user space has over its command stream management. But as said,
> 2D streams are so short that my guess is that there's not too much
> penalty copying it to kernel managed host1x push buffer directly instead
> of inserting a GATHER reference.
> 
This an implementation detail. Whether you shoot down the old pushbuf
mapping and insert a new one pointing to free backing memory (which may
be the way to go for 3D) or do an immediate copy of the channel pushbuf
contents to the host1x pushbuf (which may be beneficial for very small
pushs) is all the same. Both methods implicitly guarantee that the
memory mapped by userspace always points to a location the CPU can write
to without interfering with the GPU.

> > Please keep in mind that the interfaces you are now trying to introduce
> > have to be supported for virtually unlimited time. You might not be able
> > to scrub your mistakes later on without going through a lot of hassles.
> > 
> > To avoid a lot of those mistakes it might be a good idea to look at how
> > other drivers use the DRM infrastructure and only part from those proven
> > schemes where really necessary/worthwhile.
> 
> Yep, as the owner of this driver downstream, I'm also leveraging my
> experience with the graphics stack in our downstream software stack that
> is accessible via f.ex. L4T.
> 
> This is exactly the discussion we should be having, and I'm learning all
> the time, so let's continue tossing around ideas until we're both happy
> with the result.
> 
I really enjoyed the discussion so far and hope we can get to the point
where we have a nice design/interface, working together.

Regards,
Lucas
Thomas Hellström (VMware) Nov. 28, 2012, 8:53 p.m. UTC | #23
On 11/28/2012 02:33 PM, Lucas Stach wrote:
> Am Mittwoch, den 28.11.2012, 15:17 +0200 schrieb Terje Bergström:
>> On 28.11.2012 01:00, Dave Airlie wrote:
>>> We  generally aim for the first, to stop the gpu from reading/writing
>>> any memory it hasn't been granted access to,
>>> the second is nice to have though, but really requires a GPU with VM
>>> to implement properly.
>> I wonder if we should aim at root only access on Tegra20, and force
>> IOMMU on Tegra30 and fix the remaining issues we have with IOMMU. The
>> firewall turns out to be more complicated than I wished.
>>
>> Biggest problem is that we aim at zero-copy for everything possible,
>> including command streams. Kernel gets a handle to a command stream, but
>> the command stream is allocated by the user space process. So the user
>> space can tamper with the stream once it's been written to the host1x 2D
>> channel.
>>
> So this is obviously wrong. Userspace has to allocate a pushbuffer from
> the kernel just as every other buffer, then map it into it's own address
> space to push in commands. At submit time of the pushbuf kernel has to
> make sure that userspace is not able to access the memory any more, i.e.
> kernel shoots down the vma or pagetable of the vma.

To me this sounds very expensive. Zapping the page table requires a CPU 
TLB flush
on all cores that have touched the buffer, not to mention the kernel calls
required to set up the page table once the buffer is reused.

If this usage scheme then is combined with a command verifier or 
"firewall" that
reads from a *write-combined* pushbuffer performance will be bad. Really 
bad.

In such situations I think one should consider copy-from-user while 
validating, and
let user-space set up the command buffer in malloced memory.

/Thomas
Mark Zhang Nov. 29, 2012, 7:37 a.m. UTC | #24
On 11/26/2012 09:19 PM, Terje Bergström <tbergstrom@nvidia.com> wrote:
> Add client driver for 2D device.
> 
> Signed-off-by: Arto Merilainen <amerilainen@nvidia.com>
> Signed-off-by: Terje Bergstrom <tbergstrom@nvidia.com>
> 
> ---
> 
[...]
> +
> +static int
> +tegra_drm_ioctl_open_channel(struct drm_device *drm, void *data,
> +			 struct drm_file *file_priv)
> +{
> +	struct tegra_drm_open_channel_args *args = data;
> +	struct tegra_drm_client *client;
> +	struct tegra_drm_context *context;
> +	struct tegra_drm_fpriv *fpriv = tegra_drm_fpriv(file_priv);
> +	int err = 0;
> +
> +	dev_dbg(drm->dev, "> %s(fpriv=%p, class=%x)\n", __func__,
> +			fpriv, args->class);
> +
> +	context = kzalloc(sizeof(*context), GFP_KERNEL);
> +	if (!context) {
> +		err = -ENOMEM;
> +		goto out;

Change to "return -ENOMEM". Otherwise the NULL "context" will be kfreed.

> +	}
> +
> +	list_for_each_entry(client, &tegra_drm_subdrv_list, list) {
> +		if (client->class == args->class) {
> +			dev_dbg(drm->dev, "opening client %x\n", args->class);
> +			context->client = client;
> +			err = client->ops->open_channel(client, context);
> +			if (err)
> +				goto out;
> +
> +			dev_dbg(drm->dev, "context %p\n", context);
> +			list_add(&context->list, &fpriv->contexts);
> +			args->context = context;
> +			goto out;
> +		}
> +	}
> +	err = -ENODEV;
> +
> +out:
> +	if (err)
> +		kfree(context);
> +
> +	dev_dbg(drm->dev, "< %s() = %d\n", __func__, err);
> +	return err;
> +}
> +
> +static int
> +tegra_drm_ioctl_close_channel(struct drm_device *drm, void *data,
> +			 struct drm_file *file_priv)
> +{
> +	struct tegra_drm_open_channel_args *args = data;
> +	struct tegra_drm_context *context;
> +	struct tegra_drm_fpriv *fpriv = tegra_drm_fpriv(file_priv);
> +	int err = 0;
> +
> +	dev_dbg(drm->dev, "> %s(fpriv=%p)\n", __func__, fpriv);
> +	list_for_each_entry(context, &fpriv->contexts, list) {

Consider "list_for_each_entry_safe" because you remove list members during
the loop.

> +		if (context == args->context) {
> +			context->client->ops->close_channel(context);
> +			list_del(&context->list);
> +			kfree(context);
> +			goto out;
> +		}
> +	}
> +	err = -EINVAL;
> +
> +out:
> +	dev_dbg(drm->dev, "< %s() = %d\n", __func__, err);
> +	return err;
> +}
> +
[...]
> +
> +static int gr2d_submit(struct tegra_drm_context *context,
> +		struct tegra_drm_submit_args *args)

I'm still in the middle of reading the job submit code, so I'll get back
to you if I find something.

[...]
> +
> +static struct of_device_id gr2d_match[] __devinitdata = {
> +	{ .compatible = "nvidia,tegra20-gr2d", },
> +	{ .compatible = "nvidia,tegra30-gr2d", },

Just as swarren mentioned, you'd better place "nvidia,tegra30-gr2d" in
the front of "nvidia,tegra20-gr2d"...

[...]
> +
> +#define DRM_TEGRA_GEM_CREATE		0x00
> +
> +#define DRM_IOCTL_TEGRA_GEM_CREATE	DRM_IOWR(DRM_COMMAND_BASE + \
> +		DRM_TEGRA_GEM_CREATE, struct tegra_gem_create)
> +

Just a very minor suggestion: could you put them at the end of this
file? I mean, stay with other IOCTLs like SYNCPT_READ, SYNCPT_WAIT...

[...]
> +
> +#define DRM_TEGRA_DRM_SYNCPT_READ 0x01
> +#define DRM_TEGRA_DRM_SYNCPT_INCR 0x02
> +#define DRM_TEGRA_DRM_SYNCPT_WAIT 0x03
> +#define DRM_TEGRA_DRM_OPEN_CHANNEL 0x04
> +#define DRM_TEGRA_DRM_CLOSE_CHANNEL 0x05
> +#define DRM_TEGRA_DRM_GET_SYNCPOINTS 0x06
> +#define DRM_TEGRA_DRM_GET_MODMUTEXES 0x07
> +#define DRM_TEGRA_DRM_SUBMIT 0x08
> +
> +#define DRM_IOCTL_TEGRA_DRM_SYNCPT_READ DRM_IOWR(DRM_COMMAND_BASE + DRM_TEGRA_DRM_SYNCPT_READ, struct tegra_drm_syncpt_read_args)
> +#define DRM_IOCTL_TEGRA_DRM_SYNCPT_INCR DRM_IOWR(DRM_COMMAND_BASE + DRM_TEGRA_DRM_SYNCPT_INCR, struct tegra_drm_syncpt_incr_args)
> +#define DRM_IOCTL_TEGRA_DRM_SYNCPT_WAIT DRM_IOWR(DRM_COMMAND_BASE + DRM_TEGRA_DRM_SYNCPT_WAIT, struct tegra_drm_syncpt_wait_args)
> +#define DRM_IOCTL_TEGRA_DRM_OPEN_CHANNEL DRM_IOWR(DRM_COMMAND_BASE + DRM_TEGRA_DRM_OPEN_CHANNEL, struct tegra_drm_open_channel_args)
> +#define DRM_IOCTL_TEGRA_DRM_CLOSE_CHANNEL DRM_IOWR(DRM_COMMAND_BASE + DRM_TEGRA_DRM_CLOSE_CHANNEL, struct tegra_drm_open_channel_args)
> +#define DRM_IOCTL_TEGRA_DRM_GET_SYNCPOINTS DRM_IOWR(DRM_COMMAND_BASE + DRM_TEGRA_DRM_GET_SYNCPOINTS, struct tegra_drm_get_channel_param_args)
> +#define DRM_IOCTL_TEGRA_DRM_GET_MODMUTEXES DRM_IOWR(DRM_COMMAND_BASE + DRM_TEGRA_DRM_GET_MODMUTEXES, struct tegra_drm_get_channel_param_args)
> +#define DRM_IOCTL_TEGRA_DRM_SUBMIT DRM_IOWR(DRM_COMMAND_BASE + DRM_TEGRA_DRM_SUBMIT, struct tegra_drm_submit_args)
> +
> +#endif
>
Terje Bergstrom Nov. 29, 2012, 8:17 a.m. UTC | #25
On 28.11.2012 20:46, Lucas Stach wrote:
> Am Mittwoch, den 28.11.2012, 18:23 +0200 schrieb Terje Bergström:
>> Sorry. I promised in another thread a write-up explaining the design. I
>> still owe you guys that.
> That would be really nice to have. I'm also particularly interested in
> how you plan to do synchronization of command streams to different
> engines working together, if that's not too much to ask for now. Like
> userspace uploading a texture in a buffer, 2D engine doing mipmap
> generation, 3D engine using mipmapped texture.

I can briefly explain (and then copy-paste to a coherent text once I get
to it) how inter-engine synchronization is done. It's not specifically
for 2D or 3D, but generic to any host1x client.

Sync point register is a counter that can only increase. It starts from
0 and is incremented by a host1x client or CPU. host1x can freeze a
channel until a sync point value is reached, and it can trigger an
interrupt upon reaching a threshold. On Tegra2 and Tegra3 we have 32
sync points.

host1x clients all implement a method for incrementing a sync point
based on a condition, and on all of them (well, not entirely true) the
register is number 0. The most used condition is op_done, telling the
client to increment sync point once the previous operations are done.

In kernel, we keep track of the active range of sync point values, i.e.
ones we expect to be reached with the active jobs. Active range's
minimum is the current value read from hw and shadowed in memory. At job
submit time, kernel increments the maximum by the number of sync points
the stream announces it will perform. After performing the increment, we
have a number, which the sync point is supposed to reach at the end of
submit. That number is the the fence and it is recorded in kernel and
returned to user space.

So, when user space flushes, it receives a fence. It can insert the
fence into another command stream as parameter to a host1x channel wait.
This makes that channel freeze until an operation in another channel is
finished. That's how different host1x clients can synchronize without
using CPU.

Kernel's sync point wait essentially puts the process into sleep until
host1x sends an interrupt and we determine the value that a process is
waiting for, has been reached.

On top of this, we guard against wrapping issues by nulling out any sync
point waits (CPU or inside stream) that are waiting for values outside
the active range, and we have a timeout for jobs so that we can kick out
misbehaving command streams.

> Ah yes I see. So if we consider nvhost to be the central entity in
> charge of controlling all host1x clients and tegradrm as the interface
> that happens to bundle display, 2d and 3d engine functionality into it's
> interface we should probably aim for two things:
> 1. Move everything needed by all engines down into nvhost (I especially
> see the allocator falling under this point, I'll explain why this would
> be beneficial a bit later)

Ok. This is almost the current design, except for the allocator.

> 2. Move the exposed DRM interface more in line with other DRM drivers.
> Please take a look at how for example the GEM_EXECBUF ioctl works on
> other drivers to get a feeling of what I'm talking about. Everything
> using the display, 2D and maybe later on the 3D engine should only deal
> with GEM handles. I really don't like the idea of having a single
> userspace application, which uses engines with similar and known
> requirements (DDX) dealing with dma-buf handles or other similar high
> overhead stuff to do the most basic tasks.
> If we move down the allocator into nvhost we can use buffers allocated
> from this to back GEM or V4L2 buffers transparently. The ioctl to
> allocate a GEM buffer shouldn't do much more than wrapping the nvhost
> buffer.

Ok, this is actually what we do downstream. We use dma-buf handles only
for purposes where they're really needed (in fact, none yet), and use
our downstream allocator handles for the rest. I did this, because
benchmarks were showing that memory management overhead shoot through
the roof if I tried doing everything via dma-buf.

We can move support for allocating GEM handles to nvhost, and GEM
handles can be treated just as another memory handle type in nvhost.
tegradrm would then call nvhost for allocation.


> This may also solve your problem with having multiple mappings of the
> same buffer into the very same address space, as nvhost is the single
> instance that manages all host1x client address spaces. If the buffer is
> originating from there you can easily check if it's already mapped. For
> Tegra 3 to do things in an efficient way we likely have to move away
> from dealing with the DMA API to dealing with the IOMMU API, this gets a
> _lot_ easier_ if you have a single point where you manage memory
> allocation and address space.

Yep, this would definitely simplify our IOMMU problem. But, I thought
the canonical way of dealing with device memory is DMA API, and you're
saying that we should just bypass it and call IOMMU directly?

>> Concurrency is handled with sync points. User space will know when a
>> command stream is processed and can be reused by comparing the current
>> sync point value, and the fence that 2D driver returned to user space.
>> User space can have a pool of buffers and can recycle when it knows it
>> can do so. But, this is not enforced by kernel.
>>
> This is the point where we have differ: You have to deal with syncpts in
> kernel anyway, otherwise you don't know when it's safe to destroy a
> buffer. And no, userspace should not have the ability to destroy a
> buffer itself, userspace should always just be able to free it's
> reference to the buffer. Remember: never trust the userspace. And if you
> are dealing with syncpts in kernel anyway, you can just go ahead and
> enforce some sane concurrency rules. There may be some corner cases
> related to userspace suballocating a kernel buffer, which might need
> some more thought still, but that's not a valid excuse to not do any
> concurrency validation in kernel.

nvhost is already dealing with sync points, and protecting memory from
being freed if it's used. We use refcounting to do that. When a job is
sent to hw, we get reference to all memory (command stream & surfaces).
When job is done (fence reached), nvhost unreferences them. User space
can free the memory it has allocated, but kernel would hold on to it
until it's safe to actually release the memory.

>> The difference with your proposal and what I posted is the level of
>> control user space has over its command stream management. But as said,
>> 2D streams are so short that my guess is that there's not too much
>> penalty copying it to kernel managed host1x push buffer directly instead
>> of inserting a GATHER reference.
>>
> This an implementation detail. Whether you shoot down the old pushbuf
> mapping and insert a new one pointing to free backing memory (which may
> be the way to go for 3D) or do an immediate copy of the channel pushbuf
> contents to the host1x pushbuf (which may be beneficial for very small
> pushs) is all the same. Both methods implicitly guarantee that the
> memory mapped by userspace always points to a location the CPU can write
> to without interfering with the GPU.

Ok. Based on this, I propose the way to go for cases without IOMMU
support and all Tegra20 cases (as Tegra20's GART can't provide memory
protection) is to copy the stream to host1x push buffer. In Tegra30 with
IOMMU support we can just reference the buffer. This way we don't have
to do expensive MMU operations.

> I really enjoyed the discussion so far and hope we can get to the point
> where we have a nice design/interface, working together.

Thanks. I don't have a strict deadline for upstreaming, so I'm fine
continuing with discussion until we're settled, and then doing the
needed changes.

My goal is to end up with something that we can take advantage of in
upstream with tegradrm and 2D, but also downstream with the rest of the
downstream stack. That way there's no technical barrier to us moving in
downstream to use the upstream code.

Terje
Lucas Stach Nov. 29, 2012, 9:09 a.m. UTC | #26
Am Donnerstag, den 29.11.2012, 10:17 +0200 schrieb Terje Bergström:
> On 28.11.2012 20:46, Lucas Stach wrote:
> > Am Mittwoch, den 28.11.2012, 18:23 +0200 schrieb Terje Bergström:
> >> Sorry. I promised in another thread a write-up explaining the design. I
> >> still owe you guys that.
> > That would be really nice to have. I'm also particularly interested in
> > how you plan to do synchronization of command streams to different
> > engines working together, if that's not too much to ask for now. Like
> > userspace uploading a texture in a buffer, 2D engine doing mipmap
> > generation, 3D engine using mipmapped texture.
> 
> I can briefly explain (and then copy-paste to a coherent text once I get
> to it) how inter-engine synchronization is done. It's not specifically
> for 2D or 3D, but generic to any host1x client.
[...]
Thanks for that.
[...]

> > 2. Move the exposed DRM interface more in line with other DRM drivers.
> > Please take a look at how for example the GEM_EXECBUF ioctl works on
> > other drivers to get a feeling of what I'm talking about. Everything
> > using the display, 2D and maybe later on the 3D engine should only deal
> > with GEM handles. I really don't like the idea of having a single
> > userspace application, which uses engines with similar and known
> > requirements (DDX) dealing with dma-buf handles or other similar high
> > overhead stuff to do the most basic tasks.
> > If we move down the allocator into nvhost we can use buffers allocated
> > from this to back GEM or V4L2 buffers transparently. The ioctl to
> > allocate a GEM buffer shouldn't do much more than wrapping the nvhost
> > buffer.
> 
> Ok, this is actually what we do downstream. We use dma-buf handles only
> for purposes where they're really needed (in fact, none yet), and use
> our downstream allocator handles for the rest. I did this, because
> benchmarks were showing that memory management overhead shoot through
> the roof if I tried doing everything via dma-buf.
> 
> We can move support for allocating GEM handles to nvhost, and GEM
> handles can be treated just as another memory handle type in nvhost.
> tegradrm would then call nvhost for allocation.
> 
We should aim for a clean split here. GEM handles are something which is
really specific to how DRM works and as such should be constructed by
tegradrm. nvhost should really just manage allocations/virtual address
space and provide something that is able to back all the GEM handle
operations.

nvhost has really no reason at all to even know about GEM handles. If
you back a GEM object by a nvhost object you can just peel out the
nvhost handles from the GEM wrapper in the tegradrm submit ioctl handler
and queue the job to nvhost using it's native handles.

This way you would also be able to construct different handles (like GEM
obj or V4L2 buffers) from the same backing nvhost object. Note that I'm
not sure how useful this would be, but it seems like a reasonable design
to me being able to do so.

> > This may also solve your problem with having multiple mappings of the
> > same buffer into the very same address space, as nvhost is the single
> > instance that manages all host1x client address spaces. If the buffer is
> > originating from there you can easily check if it's already mapped. For
> > Tegra 3 to do things in an efficient way we likely have to move away
> > from dealing with the DMA API to dealing with the IOMMU API, this gets a
> > _lot_ easier_ if you have a single point where you manage memory
> > allocation and address space.
> 
> Yep, this would definitely simplify our IOMMU problem. But, I thought
> the canonical way of dealing with device memory is DMA API, and you're
> saying that we should just bypass it and call IOMMU directly?
> 
This is true for all standard devices. But we should not consider this
as something set in stone and then building some crufty design around
it. If we can manage to make our design a lot cleaner by managing DMA
memory and the corresponding IOMMU address spaces for the host1x devices
ourselves, I think this is the way to go. All other graphics drivers in
the Linux kernel have to deal with their GTT in some way, we just happen
to do so by using a shared system IOMMU and not something that is
exclusive to the graphics devices.

This is more work on the side of nvhost, but IMHO the benefits make it
look worthwhile.

What we should avoid is something that completely escapes the standard
ways of dealing with memory used in the Linux kernel, like using
carveout areas, but I think this is already consensus among us all.

[...]
> > This an implementation detail. Whether you shoot down the old pushbuf
> > mapping and insert a new one pointing to free backing memory (which may
> > be the way to go for 3D) or do an immediate copy of the channel pushbuf
> > contents to the host1x pushbuf (which may be beneficial for very small
> > pushs) is all the same. Both methods implicitly guarantee that the
> > memory mapped by userspace always points to a location the CPU can write
> > to without interfering with the GPU.
> 
> Ok. Based on this, I propose the way to go for cases without IOMMU
> support and all Tegra20 cases (as Tegra20's GART can't provide memory
> protection) is to copy the stream to host1x push buffer. In Tegra30 with
> IOMMU support we can just reference the buffer. This way we don't have
> to do expensive MMU operations.
> 
Sounds like a plan.

Regards,
Lucas
Thierry Reding Nov. 29, 2012, 12:14 p.m. UTC | #27
On Thu, Nov 29, 2012 at 10:09:13AM +0100, Lucas Stach wrote:
> Am Donnerstag, den 29.11.2012, 10:17 +0200 schrieb Terje Bergström:
> > On 28.11.2012 20:46, Lucas Stach wrote:
> > > Am Mittwoch, den 28.11.2012, 18:23 +0200 schrieb Terje Bergström:
> > >> Sorry. I promised in another thread a write-up explaining the design. I
> > >> still owe you guys that.
> > > That would be really nice to have. I'm also particularly interested in
> > > how you plan to do synchronization of command streams to different
> > > engines working together, if that's not too much to ask for now. Like
> > > userspace uploading a texture in a buffer, 2D engine doing mipmap
> > > generation, 3D engine using mipmapped texture.
> > 
> > I can briefly explain (and then copy-paste to a coherent text once I get
> > to it) how inter-engine synchronization is done. It's not specifically
> > for 2D or 3D, but generic to any host1x client.
> [...]
> Thanks for that.
> [...]
> 
> > > 2. Move the exposed DRM interface more in line with other DRM drivers.
> > > Please take a look at how for example the GEM_EXECBUF ioctl works on
> > > other drivers to get a feeling of what I'm talking about. Everything
> > > using the display, 2D and maybe later on the 3D engine should only deal
> > > with GEM handles. I really don't like the idea of having a single
> > > userspace application, which uses engines with similar and known
> > > requirements (DDX) dealing with dma-buf handles or other similar high
> > > overhead stuff to do the most basic tasks.
> > > If we move down the allocator into nvhost we can use buffers allocated
> > > from this to back GEM or V4L2 buffers transparently. The ioctl to
> > > allocate a GEM buffer shouldn't do much more than wrapping the nvhost
> > > buffer.
> > 
> > Ok, this is actually what we do downstream. We use dma-buf handles only
> > for purposes where they're really needed (in fact, none yet), and use
> > our downstream allocator handles for the rest. I did this, because
> > benchmarks were showing that memory management overhead shoot through
> > the roof if I tried doing everything via dma-buf.
> > 
> > We can move support for allocating GEM handles to nvhost, and GEM
> > handles can be treated just as another memory handle type in nvhost.
> > tegradrm would then call nvhost for allocation.
> > 
> We should aim for a clean split here. GEM handles are something which is
> really specific to how DRM works and as such should be constructed by
> tegradrm. nvhost should really just manage allocations/virtual address
> space and provide something that is able to back all the GEM handle
> operations.
> 
> nvhost has really no reason at all to even know about GEM handles. If
> you back a GEM object by a nvhost object you can just peel out the
> nvhost handles from the GEM wrapper in the tegradrm submit ioctl handler
> and queue the job to nvhost using it's native handles.

That certainly sounds sensible to me. We would obviously no longer be
able to reuse the CMA GEM helpers, but if it makes things easier to
handle in general that's definitely something we can live with.

If I understand this correctly it would also allow us to do the buffer
management within host1x and therefore allow the differences between
Tegra20 (CMA) and Tegra30 (IOMMU) allocations to be handled in one
central place. That would indeed make things a lot easier in the host1x
client drivers.

> This way you would also be able to construct different handles (like GEM
> obj or V4L2 buffers) from the same backing nvhost object. Note that I'm
> not sure how useful this would be, but it seems like a reasonable design
> to me being able to do so.

Wouldn't that be useful for sharing buffers between DRM and V4L2 using
dma-buf? I'm not very familiar with how exactly importing and exporting
work with dma-buf, so maybe I need to read up some more.

Thierry
Terje Bergstrom Nov. 29, 2012, 1:36 p.m. UTC | #28
On 29.11.2012 11:09, Lucas Stach wrote:
> We should aim for a clean split here. GEM handles are something which is
> really specific to how DRM works and as such should be constructed by
> tegradrm. nvhost should really just manage allocations/virtual address
> space and provide something that is able to back all the GEM handle
> operations.
> 
> nvhost has really no reason at all to even know about GEM handles. If
> you back a GEM object by a nvhost object you can just peel out the
> nvhost handles from the GEM wrapper in the tegradrm submit ioctl handler
> and queue the job to nvhost using it's native handles.
> 
> This way you would also be able to construct different handles (like GEM
> obj or V4L2 buffers) from the same backing nvhost object. Note that I'm
> not sure how useful this would be, but it seems like a reasonable design
> to me being able to do so.

Ok, I must say that I got totally surprised by this and almost fell off
the bench of the bus while commuting to home and reading this mail. On
the technical side, what you wrote makes perfect sense and we'll go
through this idea very carefully, so don't take me wrong.

What surprised me was that we had always assumed that nvmap, the
allocator we use in downstream kernel, would never be something that
would be accepted upstream, so we haven't done work at all on cleaning
it up and refactoring it for upstreaming, and cutting ties between
nvhost and nvmap. We assumed that we need to provide something that fit
into tegradrm and interacts with dma_buf and GEM, so we've written
something small that fulfills this need.

Now what you're suggesting is akin to getting a subset of nvmap into
picture. In downstream kernel it already takes care of all memory
management problems we've discussed wrt IOMMU (duplicate management,
different memory architectures, etc). But, it has a lot more than what
we need for now, so we'd need to decide if we go for importing parts of
nvmap as nvhost allocator, or use the allocator in the patchset I sent
earlier as basis.

>> Yep, this would definitely simplify our IOMMU problem. But, I thought
>> the canonical way of dealing with device memory is DMA API, and you're
>> saying that we should just bypass it and call IOMMU directly?
>>
> This is true for all standard devices. But we should not consider this
> as something set in stone and then building some crufty design around
> it. If we can manage to make our design a lot cleaner by managing DMA
> memory and the corresponding IOMMU address spaces for the host1x devices
> ourselves, I think this is the way to go. All other graphics drivers in
> the Linux kernel have to deal with their GTT in some way, we just happen
> to do so by using a shared system IOMMU and not something that is
> exclusive to the graphics devices.
> 
> This is more work on the side of nvhost, but IMHO the benefits make it
> look worthwhile.
> What we should avoid is something that completely escapes the standard
> ways of dealing with memory used in the Linux kernel, like using
> carveout areas, but I think this is already consensus among us all.

Makes perfect sense. I'll need to hash out a proposal on how to go about
this.

Terje
Terje Bergstrom Nov. 30, 2012, 7:44 a.m. UTC | #29
On 29.11.2012 14:14, Thierry Reding wrote:
> On Thu, Nov 29, 2012 at 10:09:13AM +0100, Lucas Stach wrote:
>> This way you would also be able to construct different handles (like GEM
>> obj or V4L2 buffers) from the same backing nvhost object. Note that I'm
>> not sure how useful this would be, but it seems like a reasonable design
>> to me being able to do so.
> 
> Wouldn't that be useful for sharing buffers between DRM and V4L2 using
> dma-buf? I'm not very familiar with how exactly importing and exporting
> work with dma-buf, so maybe I need to read up some more.

I would still preserve the dma-buf support, for exactly this purpose.

Terje
Lucas Stach Nov. 30, 2012, 7:53 a.m. UTC | #30
Am Freitag, den 30.11.2012, 09:44 +0200 schrieb Terje Bergström:
> On 29.11.2012 14:14, Thierry Reding wrote:
> > On Thu, Nov 29, 2012 at 10:09:13AM +0100, Lucas Stach wrote:
> >> This way you would also be able to construct different handles (like GEM
> >> obj or V4L2 buffers) from the same backing nvhost object. Note that I'm
> >> not sure how useful this would be, but it seems like a reasonable design
> >> to me being able to do so.
> > 
> > Wouldn't that be useful for sharing buffers between DRM and V4L2 using
> > dma-buf? I'm not very familiar with how exactly importing and exporting
> > work with dma-buf, so maybe I need to read up some more.
> 
> I would still preserve the dma-buf support, for exactly this purpose.
> 
dma-buf is useful and should be preserved, as some userspace like
gstreamer might rely on us being able to import/export dma-buf handles
at some time. At the very latest we'll need it if someone wants to run a
UDL device to scanout a buffer rendered to by the internal GPU.

What I'm saying is just that with a common allocator we could cut down a
lot on the usage of dma-buf, where not really necessary. Also you might
be able to do some optimisations based on the fact that a dma-buf handle
exported for some V4L2 buffer, which gets imported into DRM to construct
a GEM object, is the very same nvhost object in the end.

Regards,
Lucas
Mark Zhang Dec. 3, 2012, 9:30 a.m. UTC | #31
Hi Dave:

I'm new in kernel development. Could you tell me or give me some
materials to read that why we need to align the size of IOCTL structures
to 64bit? I can understand if we're working in a 64bit kernel but why we
need to do this if we're in a 32bit arm kernel? Besides, why the
pointers in IOCTL structure should be declared as u64?

Mark
On 11/27/2012 06:15 AM, Dave Airlie wrote:
>>  static int tegra_drm_open(struct drm_device *drm, struct drm_file *filp)
>>  {
>> -       return 0;
>> +       struct tegra_drm_fpriv *fpriv;
>> +       int err = 0;
>> +
>> +       fpriv = kzalloc(sizeof(*fpriv), GFP_KERNEL);
>> +       if (!fpriv)
>> +               return -ENOMEM;
>> +
>> +       INIT_LIST_HEAD(&fpriv->contexts);
>> +       filp->driver_priv = fpriv;
>> +
> 
> who frees this?
>> +struct tegra_drm_syncpt_incr_args {
>> +       __u32 id;
>> +};
> 
> add 32-bits of padding here
> 
>> +
>> +struct tegra_drm_syncpt_wait_args {
>> +       __u32 id;
>> +       __u32 thresh;
>> +       __s32 timeout;
>> +       __u32 value;
>> +};
>> +
>> +#define DRM_TEGRA_NO_TIMEOUT   (-1)
>> +
>> +struct tegra_drm_open_channel_args {
>> +       __u32 class;
>> +       void *context;
> 
> no pointers use u64, align them to 64-bits, so 32-bits of padding,
> 
>> +};
>> +
>> +struct tegra_drm_get_channel_param_args {
>> +       void *context;
>> +       __u32 value;
> 
> Same padding + uint64_t for void *
> 
>> +};
>> +
>> +struct tegra_drm_syncpt_incr {
>> +       __u32 syncpt_id;
>> +       __u32 syncpt_incrs;
>> +};
>> +
>> +struct tegra_drm_cmdbuf {
>> +       __u32 mem;
>> +       __u32 offset;
>> +       __u32 words;
>> +};
> 
> add padding
>> +
>> +struct tegra_drm_reloc {
>> +       __u32 cmdbuf_mem;
>> +       __u32 cmdbuf_offset;
>> +       __u32 target;
>> +       __u32 target_offset;
>> +       __u32 shift;
>> +};
> 
> add padding
> 
>> +
>> +struct tegra_drm_waitchk {
>> +       __u32 mem;
>> +       __u32 offset;
>> +       __u32 syncpt_id;
>> +       __u32 thresh;
>> +};
>> +
>> +struct tegra_drm_submit_args {
>> +       void *context;
>> +       __u32 num_syncpt_incrs;
>> +       __u32 num_cmdbufs;
>> +       __u32 num_relocs;
>> +       __u32 submit_version;
>> +       __u32 num_waitchks;
>> +       __u32 waitchk_mask;
>> +       __u32 timeout;
>> +       struct tegra_drm_syncpt_incrs *syncpt_incrs;
>> +       struct tegra_drm_cmdbuf *cmdbufs;
>> +       struct tegra_drm_reloc *relocs;
>> +       struct tegra_drm_waitchk *waitchks;
>> +
>> +       __u32 pad[5];           /* future expansion */
>> +       __u32 fence;            /* Return value */
>> +};
> 
> lose all the pointers for 64-bit aligned uint64_t.
> 
> Probably should align all of these on __u64 and __u32 usage if possible.
> 
> i'll look at the rest of the patches, but I need to know what commands
> can be submitted via this interface and what are the security
> implications of it.
> 
> Dave.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-tegra" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
Daniel Vetter Dec. 3, 2012, 9:40 a.m. UTC | #32
On Mon, Dec 3, 2012 at 10:30 AM, Mark Zhang <nvmarkzhang@gmail.com> wrote:
> I'm new in kernel development. Could you tell me or give me some
> materials to read that why we need to align the size of IOCTL structures
> to 64bit? I can understand if we're working in a 64bit kernel but why we
> need to do this if we're in a 32bit arm kernel? Besides, why the
> pointers in IOCTL structure should be declared as u64?

Because in a few years/months you'll have arm64, but still the same
driver with the same ioctls ... and if the ioctls are not _exactly_
the same you get to write compat ioctl code which copies the old 32bit
struct into the 64bit struct the kernel understands. Hence your ioctl
must be laid out exactly the same for both 32bit and 64bit, which
happens if you naturally align/pad everything to 64bits and only use
fixed-sized integers and no pointers.
-Daniel
Mark Zhang Dec. 4, 2012, 1:49 a.m. UTC | #33
On 12/03/2012 05:40 PM, Daniel Vetter wrote:
> On Mon, Dec 3, 2012 at 10:30 AM, Mark Zhang <nvmarkzhang@gmail.com> wrote:
>> I'm new in kernel development. Could you tell me or give me some
>> materials to read that why we need to align the size of IOCTL structures
>> to 64bit? I can understand if we're working in a 64bit kernel but why we
>> need to do this if we're in a 32bit arm kernel? Besides, why the
>> pointers in IOCTL structure should be declared as u64?
> 
> Because in a few years/months you'll have arm64, but still the same
> driver with the same ioctls ... and if the ioctls are not _exactly_
> the same you get to write compat ioctl code which copies the old 32bit
> struct into the 64bit struct the kernel understands. Hence your ioctl
> must be laid out exactly the same for both 32bit and 64bit, which
> happens if you naturally align/pad everything to 64bits and only use
> fixed-sized integers and no pointers.

Ah, I see. Thanks. Yes, u64 still works as 32 bit pointer.

Mark
> -Daniel
>
diff mbox

Patch

diff --git a/drivers/gpu/drm/tegra/Makefile b/drivers/gpu/drm/tegra/Makefile
index 53ea383..5e85042 100644
--- a/drivers/gpu/drm/tegra/Makefile
+++ b/drivers/gpu/drm/tegra/Makefile
@@ -1,7 +1,7 @@ 
 ccflags-y := -Iinclude/drm
 ccflags-$(CONFIG_DRM_TEGRA_DEBUG) += -DDEBUG
 
-tegra-drm-y := drm.o fb.o dc.o
+tegra-drm-y := drm.o fb.o dc.o gr2d.o
 tegra-drm-y += output.o rgb.o hdmi.o tvo.o dsi.o
 tegra-drm-y += plane.o dmabuf.o
 
diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c
index f78a31b..c35e547 100644
--- a/drivers/gpu/drm/tegra/drm.c
+++ b/drivers/gpu/drm/tegra/drm.c
@@ -10,6 +10,7 @@ 
 #include <linux/module.h>
 #include <linux/of_address.h>
 #include <linux/of_platform.h>
+#include <linux/nvhost.h>
 
 #include <mach/clk.h>
 #include <linux/dma-mapping.h>
@@ -55,10 +56,12 @@  static int tegra_drm_parse_dt(void)
 		"nvidia,tegra20-hdmi",
 		"nvidia,tegra20-tvo",
 		"nvidia,tegra20-dsi",
+		"nvidia,tegra20-gr2d",
 		"nvidia,tegra30-dc",
 		"nvidia,tegra30-hdmi",
 		"nvidia,tegra30-tvo",
-		"nvidia,tegra30-dsi"
+		"nvidia,tegra30-dsi",
+		"nvidia,tegra30-gr2d"
 	};
 	unsigned int i;
 	int err;
@@ -177,7 +180,17 @@  static int tegra_drm_unload(struct drm_device *drm)
 
 static int tegra_drm_open(struct drm_device *drm, struct drm_file *filp)
 {
-	return 0;
+	struct tegra_drm_fpriv *fpriv;
+	int err = 0;
+
+	fpriv = kzalloc(sizeof(*fpriv), GFP_KERNEL);
+	if (!fpriv)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&fpriv->contexts);
+	filp->driver_priv = fpriv;
+
+	return err;
 }
 
 static void tegra_drm_lastclose(struct drm_device *drm)
@@ -207,8 +220,13 @@  static int __init tegra_drm_init(void)
 	if (err < 0)
 		goto unregister_tvo;
 
+	err = platform_driver_register(&tegra_gr2d_driver);
+	if (err < 0)
+		goto unregister_dsi;
 	return 0;
 
+unregister_dsi:
+	platform_driver_unregister(&tegra_dsi_driver);
 unregister_tvo:
 	platform_driver_unregister(&tegra_tvo_driver);
 unregister_hdmi:
@@ -221,6 +239,7 @@  module_init(tegra_drm_init);
 
 static void __exit tegra_drm_exit(void)
 {
+	platform_driver_unregister(&tegra_gr2d_driver);
 	platform_driver_unregister(&tegra_dsi_driver);
 	platform_driver_unregister(&tegra_tvo_driver);
 	platform_driver_unregister(&tegra_hdmi_driver);
@@ -232,7 +251,215 @@  MODULE_AUTHOR("Thierry Reding <thierry.reding@avionic-design.de>");
 MODULE_DESCRIPTION("NVIDIA Tegra DRM driver");
 MODULE_LICENSE("GPL");
 
+static int
+tegra_drm_ioctl_syncpt_read(struct drm_device *drm, void *data,
+			 struct drm_file *file_priv)
+{
+	struct tegra_drm_syncpt_read_args *args = data;
+
+	dev_dbg(drm->dev, "> %s(drm=%p, id=%d)\n", __func__, drm, args->id);
+	args->value = host1x_syncpt_read(args->id);
+	dev_dbg(drm->dev, "< %s(value=%d)\n", __func__, args->value);
+	return 0;
+}
+
+static int
+tegra_drm_ioctl_syncpt_incr(struct drm_device *drm, void *data,
+			 struct drm_file *file_priv)
+{
+	struct tegra_drm_syncpt_incr_args *args = data;
+	dev_dbg(drm->dev, "> %s(drm=%p, id=%d)\n", __func__, drm, args->id);
+	host1x_syncpt_incr(args->id);
+	dev_dbg(drm->dev, "< %s()\n", __func__);
+	return 0;
+}
+
+static int
+tegra_drm_ioctl_syncpt_wait(struct drm_device *drm, void *data,
+			 struct drm_file *file_priv)
+{
+	struct tegra_drm_syncpt_wait_args *args = data;
+	int err;
+
+	dev_dbg(drm->dev, "> %s(drm=%p, id=%d, thresh=%d)\n", __func__, drm,
+			args->id, args->thresh);
+	err = host1x_syncpt_wait(args->id, args->thresh,
+			args->timeout, &args->value);
+	dev_dbg(drm->dev, "< %s() = %d\n", __func__, err);
+
+	return err;
+}
+
+static int
+tegra_drm_ioctl_open_channel(struct drm_device *drm, void *data,
+			 struct drm_file *file_priv)
+{
+	struct tegra_drm_open_channel_args *args = data;
+	struct tegra_drm_client *client;
+	struct tegra_drm_context *context;
+	struct tegra_drm_fpriv *fpriv = tegra_drm_fpriv(file_priv);
+	int err = 0;
+
+	dev_dbg(drm->dev, "> %s(fpriv=%p, class=%x)\n", __func__,
+			fpriv, args->class);
+
+	context = kzalloc(sizeof(*context), GFP_KERNEL);
+	if (!context) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	list_for_each_entry(client, &tegra_drm_subdrv_list, list) {
+		if (client->class == args->class) {
+			dev_dbg(drm->dev, "opening client %x\n", args->class);
+			context->client = client;
+			err = client->ops->open_channel(client, context);
+			if (err)
+				goto out;
+
+			dev_dbg(drm->dev, "context %p\n", context);
+			list_add(&context->list, &fpriv->contexts);
+			args->context = context;
+			goto out;
+		}
+	}
+	err = -ENODEV;
+
+out:
+	if (err)
+		kfree(context);
+
+	dev_dbg(drm->dev, "< %s() = %d\n", __func__, err);
+	return err;
+}
+
+static int
+tegra_drm_ioctl_close_channel(struct drm_device *drm, void *data,
+			 struct drm_file *file_priv)
+{
+	struct tegra_drm_open_channel_args *args = data;
+	struct tegra_drm_context *context;
+	struct tegra_drm_fpriv *fpriv = tegra_drm_fpriv(file_priv);
+	int err = 0;
+
+	dev_dbg(drm->dev, "> %s(fpriv=%p)\n", __func__, fpriv);
+	list_for_each_entry(context, &fpriv->contexts, list) {
+		if (context == args->context) {
+			context->client->ops->close_channel(context);
+			list_del(&context->list);
+			kfree(context);
+			goto out;
+		}
+	}
+	err = -EINVAL;
+
+out:
+	dev_dbg(drm->dev, "< %s() = %d\n", __func__, err);
+	return err;
+}
+
+static int
+tegra_drm_ioctl_get_syncpoints(struct drm_device *drm, void *data,
+			 struct drm_file *file_priv)
+{
+	struct tegra_drm_get_channel_param_args *args = data;
+	struct tegra_drm_context *context;
+	struct tegra_drm_fpriv *fpriv = tegra_drm_fpriv(file_priv);
+	int err = 0;
+
+	list_for_each_entry(context, &fpriv->contexts, list) {
+		if (context == args->context) {
+			args->value =
+				context->client->ops->get_syncpoints(context);
+			goto out;
+		}
+	}
+	err = -ENODEV;
+
+out:
+	return err;
+}
+
+static int
+tegra_drm_ioctl_get_modmutexes(struct drm_device *drm, void *data,
+			 struct drm_file *file_priv)
+{
+	struct tegra_drm_get_channel_param_args *args = data;
+	struct tegra_drm_context *context;
+	struct tegra_drm_fpriv *fpriv = tegra_drm_fpriv(file_priv);
+	int err = 0;
+
+	list_for_each_entry(context, &fpriv->contexts, list) {
+		if (context == args->context) {
+			args->value =
+				context->client->ops->get_modmutexes(context);
+			goto out;
+		}
+	}
+	err = -ENODEV;
+
+out:
+	return err;
+}
+
+static int
+tegra_drm_ioctl_submit(struct drm_device *drm, void *data,
+			 struct drm_file *file_priv)
+{
+	struct tegra_drm_submit_args *args = data;
+	struct tegra_drm_context *context;
+	struct tegra_drm_fpriv *fpriv = tegra_drm_fpriv(file_priv);
+	int err = 0;
+
+	list_for_each_entry(context, &fpriv->contexts, list) {
+		if (context == args->context) {
+			err = context->client->ops->submit(context, args);
+			goto out;
+		}
+	}
+	err = -ENODEV;
+
+out:
+	return err;
+
+}
+
+static int
+tegra_drm_create_ioctl(struct drm_device *drm, void *data,
+			 struct drm_file *file_priv)
+{
+	struct tegra_gem_create *args = data;
+	struct drm_gem_cma_object *cma_obj;
+	int ret;
+
+	cma_obj = drm_gem_cma_create(drm, args->size);
+	if (IS_ERR(cma_obj))
+		return PTR_ERR(cma_obj);
+
+	ret = drm_gem_handle_create(file_priv, &cma_obj->base, &args->handle);
+	if (ret)
+		goto err_handle_create;
+
+	drm_gem_object_unreference(&cma_obj->base);
+
+	return 0;
+
+err_handle_create:
+	drm_gem_cma_free_object(&cma_obj->base);
+	/* propagate the real error, not an unconditional -ENOMEM */
+	return ret;
+}
+
 static struct drm_ioctl_desc tegra_drm_ioctls[] = {
+	DRM_IOCTL_DEF_DRV(TEGRA_GEM_CREATE, tegra_drm_create_ioctl, DRM_UNLOCKED | DRM_AUTH),
+	DRM_IOCTL_DEF_DRV(TEGRA_DRM_SYNCPT_READ, tegra_drm_ioctl_syncpt_read, DRM_UNLOCKED),
+	DRM_IOCTL_DEF_DRV(TEGRA_DRM_SYNCPT_INCR, tegra_drm_ioctl_syncpt_incr, DRM_UNLOCKED),
+	DRM_IOCTL_DEF_DRV(TEGRA_DRM_SYNCPT_WAIT, tegra_drm_ioctl_syncpt_wait, DRM_UNLOCKED),
+	DRM_IOCTL_DEF_DRV(TEGRA_DRM_OPEN_CHANNEL, tegra_drm_ioctl_open_channel, DRM_UNLOCKED),
+	DRM_IOCTL_DEF_DRV(TEGRA_DRM_CLOSE_CHANNEL, tegra_drm_ioctl_close_channel, DRM_UNLOCKED),
+	DRM_IOCTL_DEF_DRV(TEGRA_DRM_GET_SYNCPOINTS, tegra_drm_ioctl_get_syncpoints, DRM_UNLOCKED),
+	DRM_IOCTL_DEF_DRV(TEGRA_DRM_GET_MODMUTEXES, tegra_drm_ioctl_get_modmutexes, DRM_UNLOCKED),
+	DRM_IOCTL_DEF_DRV(TEGRA_DRM_SUBMIT, tegra_drm_ioctl_submit, DRM_UNLOCKED),
 };
 
 static const struct file_operations tegra_drm_fops = {
diff --git a/drivers/gpu/drm/tegra/drm.h b/drivers/gpu/drm/tegra/drm.h
index 1267a38..db197f6 100644
--- a/drivers/gpu/drm/tegra/drm.h
+++ b/drivers/gpu/drm/tegra/drm.h
@@ -20,6 +20,7 @@ 
 #include <drm/drm_gem_cma_helper.h>
 #include <drm/drm_fb_cma_helper.h>
 #include <drm/drm_fixed.h>
+#include <drm/tegra_drm.h>
 
 struct tegra_framebuffer {
 	struct drm_framebuffer base;
@@ -33,17 +34,44 @@  static inline struct tegra_framebuffer *to_tegra_fb(struct drm_framebuffer *fb)
 
 struct tegra_drm_client;
 
+struct tegra_drm_context {
+	struct tegra_drm_client *client;
+	struct nvhost_channel *channel;
+	struct list_head list;
+};
+
 struct tegra_drm_client_ops {
-	int (*drm_init)(struct tegra_drm_client *client,
-			struct drm_device *drm);
-	int (*drm_exit)(struct tegra_drm_client *client);
+	int (*drm_init)(struct tegra_drm_client *, struct drm_device *);
+	int (*drm_exit)(struct tegra_drm_client *);
+	int (*open_channel)(struct tegra_drm_client *,
+			struct tegra_drm_context *);
+	void (*close_channel)(struct tegra_drm_context *);
+	u32 (*get_syncpoints)(struct tegra_drm_context *);
+	u32 (*get_waitbases)(struct tegra_drm_context *);
+	u32 (*get_modmutexes)(struct tegra_drm_context *);
+	int (*submit)(struct tegra_drm_context *,
+			struct tegra_drm_submit_args *);
+};
+
+
+struct tegra_drm_fpriv {
+	struct list_head contexts;
 };
 
+static inline struct tegra_drm_fpriv *
+tegra_drm_fpriv(struct drm_file *file_priv)
+{
+	return file_priv ? file_priv->driver_priv : NULL;
+}
+
 struct tegra_drm_client {
 	struct device *dev;
 
 	const struct tegra_drm_client_ops *ops;
 
+	u32 class;
+	struct nvhost_channel *channel;
+
 	struct list_head list;
 
 };
@@ -116,13 +144,6 @@  struct tegra_output_ops {
 			  enum drm_mode_status *status);
 };
 
-enum tegra_output_type {
-	TEGRA_OUTPUT_RGB,
-	TEGRA_OUTPUT_HDMI,
-	TEGRA_OUTPUT_TVO,
-	TEGRA_OUTPUT_DSI,
-};
-
 struct tegra_output {
 	struct device_node *of_node;
 	struct device *dev;
@@ -225,6 +246,7 @@  extern struct platform_driver tegra_hdmi_driver;
 extern struct platform_driver tegra_tvo_driver;
 extern struct platform_driver tegra_dsi_driver;
 extern struct platform_driver tegra_dc_driver;
+extern struct platform_driver tegra_gr2d_driver;
 extern struct drm_driver tegra_drm_driver;
 
 /* from dmabuf.c */
diff --git a/drivers/gpu/drm/tegra/gr2d.c b/drivers/gpu/drm/tegra/gr2d.c
new file mode 100644
index 0000000..51605af
--- /dev/null
+++ b/drivers/gpu/drm/tegra/gr2d.c
@@ -0,0 +1,224 @@ 
+/*
+ * drivers/video/tegra/host/gr2d/gr2d.c
+ *
+ * Tegra Graphics 2D
+ *
+ * Copyright (c) 2012, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/export.h>
+#include <linux/of.h>
+#include <drm/tegra_drm.h>
+#include <linux/nvhost.h>
+#include "drm.h"
+
+static struct tegra_drm_client gr2d_client;
+
+static int gr2d_client_init(struct tegra_drm_client *client,
+		struct drm_device *drm)
+{
+	return 0;
+}
+
+static int gr2d_client_exit(struct tegra_drm_client *client)
+{
+	return 0;
+}
+
+static int gr2d_open_channel(struct tegra_drm_client *client,
+		struct tegra_drm_context *context)
+{
+	struct nvhost_device_data *pdata = dev_get_drvdata(client->dev);
+	context->channel = nvhost_getchannel(pdata->channel);
+
+	if (!context->channel)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void gr2d_close_channel(struct tegra_drm_context *context)
+{
+	nvhost_putchannel(context->channel);
+}
+
+static u32 gr2d_get_syncpoints(struct tegra_drm_context *context)
+{
+	struct nvhost_device_data *pdata =
+		dev_get_drvdata(context->client->dev);
+	return pdata->syncpts;
+}
+
+static u32 gr2d_get_modmutexes(struct tegra_drm_context *context)
+{
+	struct nvhost_device_data *pdata =
+		dev_get_drvdata(context->client->dev);
+	return pdata->modulemutexes;
+}
+
+static int gr2d_submit(struct tegra_drm_context *context,
+		struct tegra_drm_submit_args *args)
+{
+	struct nvhost_job *job;
+	int num_cmdbufs = args->num_cmdbufs;
+	int num_relocs = args->num_relocs;
+	int num_waitchks = args->num_waitchks;
+	struct tegra_drm_cmdbuf __user *cmdbufs = args->cmdbufs;
+	struct tegra_drm_reloc __user *relocs = args->relocs;
+	struct tegra_drm_waitchk __user *waitchks = args->waitchks;
+	struct tegra_drm_syncpt_incr syncpt_incr;
+	int err;
+
+	dev_dbg(context->client->dev, "> %s(context=%p, cmdbufs=%d, relocs=%d, waitchks=%d)\n",
+			__func__, context,
+			num_cmdbufs, num_relocs, num_waitchks);
+
+	/* We don't yet support other than one syncpt_incr struct per submit */
+	if (args->num_syncpt_incrs != 1)
+		return -EINVAL;
+
+	job = nvhost_job_alloc(context->channel,
+			args->num_cmdbufs,
+			args->num_relocs,
+			args->num_waitchks);
+	if (!job)
+		return -ENOMEM;
+
+	job->num_relocs = args->num_relocs;
+	job->num_waitchk = args->num_waitchks;
+	job->clientid = (u32)args->context;
+
+	while (num_cmdbufs) {
+		struct tegra_drm_cmdbuf cmdbuf;
+		err = copy_from_user(&cmdbuf, cmdbufs, sizeof(cmdbuf)) ? -EFAULT : 0;
+		if (err)
+			goto fail;
+		dev_dbg(context->client->dev, "cmdbuf: mem=%08x, words=%d, offset=%d\n",
+			cmdbuf.mem, cmdbuf.words, cmdbuf.offset);
+		nvhost_job_add_gather(job,
+				cmdbuf.mem, cmdbuf.words, cmdbuf.offset);
+		num_cmdbufs--;
+		cmdbufs++;
+	}
+
+	err = copy_from_user(job->relocarray,
+			relocs, sizeof(*relocs) * num_relocs) ? -EFAULT : 0;
+	if (err)
+		goto fail;
+
+	err = copy_from_user(job->waitchk,
+			waitchks, sizeof(*waitchks) * num_waitchks) ? -EFAULT : 0;
+	if (err)
+		goto fail;
+
+	err = nvhost_job_pin(job, to_platform_device(context->client->dev));
+	if (err)
+		goto fail;
+
+	err = copy_from_user(&syncpt_incr,
+			args->syncpt_incrs, sizeof(syncpt_incr)) ? -EFAULT : 0;
+	if (err)
+		goto fail;
+
+	job->syncpt_id = syncpt_incr.syncpt_id;
+	job->syncpt_incrs = syncpt_incr.syncpt_incrs;
+	job->timeout = 10000;
+	if (args->timeout && args->timeout < 10000)
+		job->timeout = args->timeout;
+
+	err = nvhost_channel_submit(job);
+	if (err)
+		goto fail_submit;
+
+	args->fence = job->syncpt_end;
+
+	nvhost_job_put(job);
+	dev_dbg(context->client->dev, "< %s(context=%p)\n", __func__, context);
+	return 0;
+
+fail_submit:
+	nvhost_job_unpin(job);
+fail:
+	nvhost_job_put(job);
+	dev_dbg(context->client->dev,
+			"< %s(context=%p) = %d\n", __func__, context, err);
+	return err;
+}
+
+static struct tegra_drm_client_ops gr2d_client_ops = {
+	.drm_init = gr2d_client_init,
+	.drm_exit = gr2d_client_exit,
+	.open_channel = gr2d_open_channel,
+	.close_channel = gr2d_close_channel,
+	.get_syncpoints = gr2d_get_syncpoints,
+	.get_modmutexes = gr2d_get_modmutexes,
+	.submit = gr2d_submit,
+};
+
+static int __devinit gr2d_probe(struct platform_device *dev)
+{
+	int err;
+	struct nvhost_device_data *pdata =
+		(struct nvhost_device_data *)dev->dev.platform_data;
+	pdata->pdev = dev;
+	platform_set_drvdata(dev, pdata);
+	err = nvhost_client_device_init(dev);
+	if (err)
+		return err;
+
+	gr2d_client.ops = &gr2d_client_ops;
+	gr2d_client.dev = &dev->dev;
+	gr2d_client.class = NV_GRAPHICS_2D_CLASS_ID;
+	return tegra_drm_register_client(&gr2d_client);
+}
+
+static int __exit gr2d_remove(struct platform_device *dev)
+{
+	/* Add clean-up */
+	return 0;
+}
+
+#ifdef CONFIG_PM
+static int gr2d_suspend(struct platform_device *dev, pm_message_t state)
+{
+	return nvhost_client_device_suspend(dev);
+}
+
+static int gr2d_resume(struct platform_device *dev)
+{
+	dev_info(&dev->dev, "resuming\n");
+	return 0;
+}
+#endif
+
+static struct of_device_id gr2d_match[] __devinitdata = {
+	{ .compatible = "nvidia,tegra20-gr2d", },
+	{ .compatible = "nvidia,tegra30-gr2d", },
+	{ },
+};
+
+struct platform_driver tegra_gr2d_driver = {
+	.probe = gr2d_probe,
+	.remove = __exit_p(gr2d_remove),
+#ifdef CONFIG_PM
+	.suspend = gr2d_suspend,
+	.resume = gr2d_resume,
+#endif
+	.driver = {
+		.owner = THIS_MODULE,
+		.name = "tegra-gr2d",
+		.of_match_table = of_match_ptr(gr2d_match),
+	}
+};
diff --git a/include/drm/tegra_drm.h b/include/drm/tegra_drm.h
new file mode 100644
index 0000000..bfc54d8
--- /dev/null
+++ b/include/drm/tegra_drm.h
@@ -0,0 +1,129 @@ 
+/*
+ * Copyright (c) 2012, Avionic Design GmbH
+ * Copyright (c) 2012, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _TEGRA_DRM_H_
+#define _TEGRA_DRM_H_
+
+/* Kind of display output an encoder/connector drives. */
+enum tegra_output_type {
+	TEGRA_OUTPUT_RGB,
+	TEGRA_OUTPUT_HDMI,
+	TEGRA_OUTPUT_TVO,
+	TEGRA_OUTPUT_DSI,
+};
+
+/*
+ * Args for DRM_IOCTL_TEGRA_GEM_CREATE: allocate a GEM buffer object.
+ *
+ * Userspace ABI structures must use the fixed-size __uNN types from
+ * <linux/types.h> (as the rest of this header does), not uint64_t /
+ * unsigned int, so the layout is identical for all userspaces.
+ */
+struct tegra_gem_create {
+	__u64 size;		/* in: requested size in bytes */
+	__u32 flags;		/* in: allocation flags */
+	__u32 handle;		/* out: GEM handle for the new buffer */
+};
+
+/* ioctl number for GEM buffer creation, relative to DRM_COMMAND_BASE. */
+#define DRM_TEGRA_GEM_CREATE		0x00
+
+#define DRM_IOCTL_TEGRA_GEM_CREATE	DRM_IOWR(DRM_COMMAND_BASE + \
+		DRM_TEGRA_GEM_CREATE, struct tegra_gem_create)
+
+/* SYNCPT_READ: read the current value of a syncpoint. */
+struct tegra_drm_syncpt_read_args {
+	__u32 id;		/* in: syncpoint id */
+	__u32 value;		/* out: current syncpoint value */
+};
+
+/* SYNCPT_INCR: increment a syncpoint once. */
+struct tegra_drm_syncpt_incr_args {
+	__u32 id;		/* in: syncpoint id */
+};
+
+/* SYNCPT_WAIT: block until a syncpoint reaches a threshold. */
+struct tegra_drm_syncpt_wait_args {
+	__u32 id;		/* in: syncpoint id */
+	__u32 thresh;		/* in: value to wait for */
+	__s32 timeout;		/* in: timeout (presumably ms -- verify), or DRM_TEGRA_NO_TIMEOUT */
+	__u32 value;		/* out: syncpoint value when the wait ended */
+};
+
+/* Wait indefinitely. */
+#define DRM_TEGRA_NO_TIMEOUT	(-1)
+
+/*
+ * Channel handles crossing the ioctl boundary are carried as __u64
+ * rather than as kernel pointers, so the struct layout is identical for
+ * 32-bit and 64-bit userspace and no kernel address leaks to userspace
+ * (as requested in review).  Padding is explicit and must be zeroed.
+ *
+ * NOTE(review): the member name "class" breaks C++ userspace; consider
+ * renaming before the ABI is frozen.
+ */
+struct tegra_drm_open_channel_args {
+	__u32 class;		/* in: engine class ID to open */
+	__u32 pad;		/* explicit padding, must be zero */
+	__u64 context;		/* out: opaque channel handle */
+};
+
+struct tegra_drm_get_channel_param_args {
+	__u64 context;		/* in: channel handle from open_channel */
+	__u32 value;		/* out: requested parameter value */
+	__u32 pad;		/* explicit padding, must be zero */
+};
+
+/* One syncpoint increment declaration within a job submission. */
+struct tegra_drm_syncpt_incr {
+	__u32 syncpt_id;	/* syncpoint to increment */
+	__u32 syncpt_incrs;	/* number of increments the job performs */
+};
+
+/* A command buffer fragment submitted as part of a job. */
+struct tegra_drm_cmdbuf {
+	__u32 mem;		/* presumably a GEM/memory handle -- verify */
+	__u32 offset;		/* offset into mem (units unclear -- verify) */
+	__u32 words;		/* length in 32-bit words */
+};
+
+/* Relocation: patch a buffer address into a command buffer word. */
+struct tegra_drm_reloc {
+	__u32 cmdbuf_mem;	/* handle of the cmdbuf to patch */
+	__u32 cmdbuf_offset;	/* location of the word to patch */
+	__u32 target;		/* handle of the target buffer */
+	__u32 target_offset;	/* offset within the target buffer */
+	__u32 shift;		/* shift applied to the patched address -- confirm direction */
+};
+
+/* Wait check: a syncpoint wait in a cmdbuf the kernel may validate/patch. */
+struct tegra_drm_waitchk {
+	__u32 mem;		/* handle of the cmdbuf containing the wait */
+	__u32 offset;		/* location of the wait within mem */
+	__u32 syncpt_id;	/* syncpoint being waited on */
+	__u32 thresh;		/* threshold being waited for */
+};
+
+/*
+ * Job submission descriptor (DRM_IOCTL_TEGRA_DRM_SUBMIT).
+ *
+ * Userspace pointers are carried as __u64, not as void * / struct
+ * pointers, so the layout is identical for 32-bit and 64-bit userspace
+ * (as requested in review); the kernel casts them back with
+ * (void __user *)(uintptr_t).  This also removes the reference to
+ * "struct tegra_drm_syncpt_incrs", which was never declared (the
+ * declared type is tegra_drm_syncpt_incr).
+ */
+struct tegra_drm_submit_args {
+	__u64 context;		/* in: channel handle from open_channel */
+	__u32 num_syncpt_incrs;	/* in: entries at syncpt_incrs */
+	__u32 num_cmdbufs;	/* in: entries at cmdbufs */
+	__u32 num_relocs;	/* in: entries at relocs */
+	__u32 submit_version;
+	__u32 num_waitchks;	/* in: entries at waitchks */
+	__u32 waitchk_mask;
+	__u32 timeout;		/* in: job timeout */
+	__u32 reserved0;	/* keep following __u64s 8-byte aligned */
+	__u64 syncpt_incrs;	/* in: ptr to struct tegra_drm_syncpt_incr[] */
+	__u64 cmdbufs;		/* in: ptr to struct tegra_drm_cmdbuf[] */
+	__u64 relocs;		/* in: ptr to struct tegra_drm_reloc[] */
+	__u64 waitchks;		/* in: ptr to struct tegra_drm_waitchk[] */
+
+	__u32 pad[5];		/* future expansion */
+	__u32 fence;		/* Return value */
+};
+
+/* ioctl numbers relative to DRM_COMMAND_BASE; argument structs above. */
+#define DRM_TEGRA_DRM_SYNCPT_READ 0x01
+#define DRM_TEGRA_DRM_SYNCPT_INCR 0x02
+#define DRM_TEGRA_DRM_SYNCPT_WAIT 0x03
+#define DRM_TEGRA_DRM_OPEN_CHANNEL 0x04
+#define DRM_TEGRA_DRM_CLOSE_CHANNEL 0x05
+#define DRM_TEGRA_DRM_GET_SYNCPOINTS 0x06
+#define DRM_TEGRA_DRM_GET_MODMUTEXES 0x07
+#define DRM_TEGRA_DRM_SUBMIT 0x08
+
+/* Note: CLOSE_CHANNEL reuses tegra_drm_open_channel_args (context only). */
+#define DRM_IOCTL_TEGRA_DRM_SYNCPT_READ DRM_IOWR(DRM_COMMAND_BASE + DRM_TEGRA_DRM_SYNCPT_READ, struct tegra_drm_syncpt_read_args)
+#define DRM_IOCTL_TEGRA_DRM_SYNCPT_INCR DRM_IOWR(DRM_COMMAND_BASE + DRM_TEGRA_DRM_SYNCPT_INCR, struct tegra_drm_syncpt_incr_args)
+#define DRM_IOCTL_TEGRA_DRM_SYNCPT_WAIT DRM_IOWR(DRM_COMMAND_BASE + DRM_TEGRA_DRM_SYNCPT_WAIT, struct tegra_drm_syncpt_wait_args)
+#define DRM_IOCTL_TEGRA_DRM_OPEN_CHANNEL DRM_IOWR(DRM_COMMAND_BASE + DRM_TEGRA_DRM_OPEN_CHANNEL, struct tegra_drm_open_channel_args)
+#define DRM_IOCTL_TEGRA_DRM_CLOSE_CHANNEL DRM_IOWR(DRM_COMMAND_BASE + DRM_TEGRA_DRM_CLOSE_CHANNEL, struct tegra_drm_open_channel_args)
+#define DRM_IOCTL_TEGRA_DRM_GET_SYNCPOINTS DRM_IOWR(DRM_COMMAND_BASE + DRM_TEGRA_DRM_GET_SYNCPOINTS, struct tegra_drm_get_channel_param_args)
+#define DRM_IOCTL_TEGRA_DRM_GET_MODMUTEXES DRM_IOWR(DRM_COMMAND_BASE + DRM_TEGRA_DRM_GET_MODMUTEXES, struct tegra_drm_get_channel_param_args)
+#define DRM_IOCTL_TEGRA_DRM_SUBMIT DRM_IOWR(DRM_COMMAND_BASE + DRM_TEGRA_DRM_SUBMIT, struct tegra_drm_submit_args)
+#endif