
[v2,6/7] drm/panfrost: Add support for GPU heap allocations

Message ID 20190725011003.30837-7-robh@kernel.org
State New, archived
Series drm/panfrost: Add heap and no execute buffer allocation

Commit Message

Rob Herring July 25, 2019, 1:10 a.m. UTC
The midgard/bifrost GPUs need to allocate GPU heap memory which is
allocated on GPU page faults and not pinned in memory. The vendor driver
calls this functionality GROW_ON_GPF.

This implementation assumes that BOs allocated with the
PANFROST_BO_HEAP flag are never mmapped or exported. Both of those may
actually work, but I'm unsure if there's some interaction there. Either
one would cause the whole object to be pinned in memory, which would
defeat the point of on-demand allocation.

On faults, we map in 2MB at a time in order to utilize huge pages (if
enabled). Currently, once we've mapped pages in, they are only unmapped
if the BO is freed. Once we add shrinker support, we can unmap pages
with the shrinker.
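
For illustration, userspace would then create a growable heap roughly like
this (a sketch only: the size, the open DRM fd and the error handling are
placeholders, using libdrm's drmIoctl()):

	struct drm_panfrost_create_bo create = {
		.size  = 16 * 1024 * 1024,
		.flags = PANFROST_BO_HEAP | PANFROST_BO_NOEXEC,
	};

	if (drmIoctl(fd, DRM_IOCTL_PANFROST_CREATE_BO, &create))
		err(1, "create_bo failed");
	/* create.handle is the GEM handle, create.offset the heap's GPU VA.
	 * Pages are only backed in as the GPU faults on them. */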

Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Boris Brezillon <boris.brezillon@collabora.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Steven Price <steven.price@arm.com>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Signed-off-by: Rob Herring <robh@kernel.org>
---
v2:
- Stop leaking pages!
- Properly call dma_unmap_sg on cleanup
- Enforce PANFROST_BO_NOEXEC when PANFROST_BO_HEAP is set

 drivers/gpu/drm/panfrost/TODO           |   2 -
 drivers/gpu/drm/panfrost/panfrost_drv.c |   7 +-
 drivers/gpu/drm/panfrost/panfrost_gem.c |  23 +++-
 drivers/gpu/drm/panfrost/panfrost_gem.h |   8 ++
 drivers/gpu/drm/panfrost/panfrost_mmu.c | 134 ++++++++++++++++++++++--
 include/uapi/drm/panfrost_drm.h         |   1 +
 6 files changed, 159 insertions(+), 16 deletions(-)

--
2.20.1

Comments

Robin Murphy July 25, 2019, 1:08 p.m. UTC | #1
Hi Rob,

On 25/07/2019 02:10, Rob Herring wrote:
[...]
> @@ -328,6 +427,18 @@ static irqreturn_t panfrost_mmu_irq_handler(int irq, void *data)
>   		access_type = (fault_status >> 8) & 0x3;
>   		source_id = (fault_status >> 16);
> 
> +		/* Page fault only */
> +		if ((status & mask) == BIT(i)) {
> +			WARN_ON(exception_type < 0xC1 || exception_type > 0xC4);
> +
> +			ret = panfrost_mmu_map_fault_addr(pfdev, i, addr);
> +			if (!ret) {
> +				mmu_write(pfdev, MMU_INT_CLEAR, BIT(i));
> +				status &= ~mask;
> +				continue;
> +			}
> +		}
> +
>   		/* terminal fault, print info about the fault */
>   		dev_err(pfdev->dev,
>   			"Unhandled Page fault in AS%d at VA 0x%016llX\n"
> @@ -368,8 +479,9 @@ int panfrost_mmu_init(struct panfrost_device *pfdev)
>   	if (irq <= 0)
>   		return -ENODEV;
> 
> -	err = devm_request_irq(pfdev->dev, irq, panfrost_mmu_irq_handler,
> -			       IRQF_SHARED, "mmu", pfdev);
> +	err = devm_request_threaded_irq(pfdev->dev, irq, NULL,
> +					panfrost_mmu_irq_handler,
> +					IRQF_ONESHOT, "mmu", pfdev);

The change of flags here breaks platforms using a single shared 
interrupt line.

Otherwise, though, I've hacked around that and taken the branch for a 
spin with mesa/master (and using a 64K page kernel config for giggles) 
and nothing seems amiss to the extent of my "glmark2 runs all the way 
through" testing, but I guess there still need to be additional changes 
on the userspace end to actually exercise these new flags.

Robin.

> 
>   	if (err) {
>   		dev_err(pfdev->dev, "failed to request mmu irq");
> diff --git a/include/uapi/drm/panfrost_drm.h b/include/uapi/drm/panfrost_drm.h
> index 17fb5d200f7a..9150dd75aad8 100644
> --- a/include/uapi/drm/panfrost_drm.h
> +++ b/include/uapi/drm/panfrost_drm.h
> @@ -83,6 +83,7 @@ struct drm_panfrost_wait_bo {
>   };
> 
>   #define PANFROST_BO_NOEXEC	1
> +#define PANFROST_BO_HEAP	2
> 
>   /**
>    * struct drm_panfrost_create_bo - ioctl argument for creating Panfrost BOs.
> --
> 2.20.1
>
Steven Price July 25, 2019, 2:59 p.m. UTC | #2
On 25/07/2019 02:10, Rob Herring wrote:
> The midgard/bifrost GPUs need to allocate GPU heap memory which is
> allocated on GPU page faults and not pinned in memory. The vendor driver
> calls this functionality GROW_ON_GPF.
> 
> This implementation assumes that BOs allocated with the
> PANFROST_BO_HEAP flag are never mmapped or exported. Both of those may
> actually work, but I'm unsure if there's some interaction there. Either
> one would cause the whole object to be pinned in memory, which would
> defeat the point of on-demand allocation.
> 
> On faults, we map in 2MB at a time in order to utilize huge pages (if
> enabled). Currently, once we've mapped pages in, they are only unmapped
> if the BO is freed. Once we add shrinker support, we can unmap pages
> with the shrinker.

I've taken this for a spin, unfortunately it falls over:

[  599.733909] Unable to handle kernel NULL pointer dereference at
virtual address 0000000000000000
[  599.742732] Mem abort info:
[  599.745526]   ESR = 0x96000046
[  599.748612]   Exception class = DABT (current EL), IL = 32 bits
[  599.754559]   SET = 0, FnV = 0
[  599.757612]   EA = 0, S1PTW = 0
[  599.760780] Data abort info:
[  599.763687]   ISV = 0, ISS = 0x00000046
[  599.767549]   CM = 0, WnR = 1
[  599.770546] user pgtable: 4k pages, 48-bit VAs, pgdp=00000000af36d000
[  599.777017] [0000000000000000] pgd=00000000af260003,
pud=00000000af269003, pmd=0000000000000000
[  599.785782] Internal error: Oops: 96000046 [#1] SMP
[  599.790667] Modules linked in: panfrost gpu_sched
[  599.795390] CPU: 0 PID: 1007 Comm: irq/67-mmu Tainted: G S
    5.3.0-rc1-00355-g8c4e5073a168-dirty #35
[  599.805653] Hardware name: HiKey960 (DT)
[  599.809577] pstate: 60000005 (nZCv daif -PAN -UAO)
[  599.814384] pc : __sg_alloc_table+0x14/0x168
[  599.818654] lr : sg_alloc_table+0x2c/0x60
[  599.822660] sp : ffff000011bcbba0
[  599.825973] x29: ffff000011bcbba0 x28: 0000000000000000
[  599.831289] x27: ffff8000a87081e0 x26: 0000000000000000
[  599.836603] x25: 0000000000000080 x24: 0000000000200000
[  599.841917] x23: 0000000000000000 x22: 0000000000000000
[  599.847231] x21: 0000000000000200 x20: 0000000000000000
[  599.852546] x19: 00000000fffff000 x18: 0000000000000010
[  599.857860] x17: 0000000000000000 x16: 0000000000000000
[  599.863175] x15: ffffffffffffffff x14: 3030303030303030
[  599.868489] x13: 3030303030302873 x12: 656761705f6d6f72
[  599.873805] x11: 665f656c6261745f x10: 0000000000040000
[  599.879118] x9 : 0000000000000000 x8 : ffff7e0000000000
[  599.884434] x7 : ffff8000a870cff8 x6 : ffff0000104277e8
[  599.889748] x5 : 0000000000000cc0 x4 : 0000000000000000
[  599.895061] x3 : 0000000000000000 x2 : 0000000000000080
[  599.900375] x1 : 0000000000000008 x0 : 0000000000000000
[  599.905692] Call trace:
[  599.908143]  __sg_alloc_table+0x14/0x168
[  599.912067]  sg_alloc_table+0x2c/0x60
[  599.915732]  __sg_alloc_table_from_pages+0xd8/0x208
[  599.920612]  sg_alloc_table_from_pages+0x14/0x20
[  599.925252]  panfrost_mmu_map_fault_addr+0x288/0x368 [panfrost]
[  599.931185]  panfrost_mmu_irq_handler+0x134/0x298 [panfrost]
[  599.936855]  irq_thread_fn+0x28/0x78
[  599.940435]  irq_thread+0x124/0x1f8
[  599.943931]  kthread+0x120/0x128
[  599.947166]  ret_from_fork+0x10/0x18
[  599.950753] Code: 7100009f 910003fd a9046bf9 1a821099 (a9007c1f)
[  599.956854] ---[ end trace d99c6028af6bbbd8 ]---

It would appear that in the following call sgt==NULL:
> 	ret = sg_alloc_table_from_pages(sgt, pages + page_offset,
> 					NUM_FAULT_PAGES, 0, SZ_2M, GFP_KERNEL);

Which means we've ended up with a BO with bo->sgt==NULL, bo->pages set
and bo->is_heap=true. My understanding is this should be impossible.

I haven't yet figured out how this happens - it seems to be just before
termination, so it might be a race with cleanup?

> Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
> Cc: Boris Brezillon <boris.brezillon@collabora.com>
> Cc: Robin Murphy <robin.murphy@arm.com>
> Cc: Steven Price <steven.price@arm.com>
> Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
> Signed-off-by: Rob Herring <robh@kernel.org>
> ---
> v2:
> - Stop leaking pages!
> - Properly call dma_unmap_sg on cleanup
> - Enforce PANFROST_BO_NOEXEC when PANFROST_BO_HEAP is set
> 
>  drivers/gpu/drm/panfrost/TODO           |   2 -
>  drivers/gpu/drm/panfrost/panfrost_drv.c |   7 +-
>  drivers/gpu/drm/panfrost/panfrost_gem.c |  23 +++-
>  drivers/gpu/drm/panfrost/panfrost_gem.h |   8 ++
>  drivers/gpu/drm/panfrost/panfrost_mmu.c | 134 ++++++++++++++++++++++--
>  include/uapi/drm/panfrost_drm.h         |   1 +
>  6 files changed, 159 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panfrost/TODO b/drivers/gpu/drm/panfrost/TODO
> index c2e44add37d8..64129bf73933 100644
> --- a/drivers/gpu/drm/panfrost/TODO
> +++ b/drivers/gpu/drm/panfrost/TODO
> @@ -14,8 +14,6 @@
>    The hard part is handling when more address spaces are needed than what
>    the h/w provides.
> 
> -- Support pinning pages on demand (GPU page faults).
> -
>  - Support userspace controlled GPU virtual addresses. Needed for Vulkan. (Tomeu)
> 
>  - Support for madvise and a shrinker.
> diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c
> index 7ebd82d8d570..46a6bec7a0f2 100644
> --- a/drivers/gpu/drm/panfrost/panfrost_drv.c
> +++ b/drivers/gpu/drm/panfrost/panfrost_drv.c
> @@ -50,7 +50,12 @@ static int panfrost_ioctl_create_bo(struct drm_device *dev, void *data,
>  	struct drm_panfrost_create_bo *args = data;
> 
>  	if (!args->size || args->pad ||
> -	    (args->flags & ~PANFROST_BO_NOEXEC))
> +	    (args->flags & ~(PANFROST_BO_NOEXEC | PANFROST_BO_HEAP)))
> +		return -EINVAL;
> +
> +	/* Heaps should never be executable */
> +	if ((args->flags & PANFROST_BO_HEAP) &&
> +	    !(args->flags & PANFROST_BO_NOEXEC))
>  		return -EINVAL;
> 
>  	bo = panfrost_gem_create_with_handle(file, dev, args->size, args->flags,
> diff --git a/drivers/gpu/drm/panfrost/panfrost_gem.c b/drivers/gpu/drm/panfrost/panfrost_gem.c
> index 63731f6c5223..1237fb531321 100644
> --- a/drivers/gpu/drm/panfrost/panfrost_gem.c
> +++ b/drivers/gpu/drm/panfrost/panfrost_gem.c
> @@ -27,6 +27,17 @@ static void panfrost_gem_free_object(struct drm_gem_object *obj)
>  		drm_mm_remove_node(&bo->node);
>  	spin_unlock(&pfdev->mm_lock);
> 
> +	if (bo->sgts) {
> +		int i;
> +		int n_sgt = bo->base.base.size / SZ_2M;
> +
> +		for (i = 0; i < n_sgt; i++) {
> +			if (bo->sgts[i].sgl)
> +				dma_unmap_sg(pfdev->dev, bo->sgts[i].sgl,
> +					     bo->sgts[i].nents, DMA_BIDIRECTIONAL);
> +		}
> +	}
> +

Here you're not freeing the memory for bo->sgts itself. I think there's
a missing "kfree(bo->sgts)".

Steve
Steven Price July 25, 2019, 3:35 p.m. UTC | #3
On 25/07/2019 15:59, Steven Price wrote:
[...]
> It would appear that in the following call sgt==NULL:
>> 	ret = sg_alloc_table_from_pages(sgt, pages + page_offset,
>> 					NUM_FAULT_PAGES, 0, SZ_2M, GFP_KERNEL);
> 
> Which means we've ended up with a BO with bo->sgt==NULL, bo->pages set
> and bo->is_heap=true. My understanding is this should be impossible.
> 
> I haven't yet figured out how this happens - it seems to be just before
> termination, so it might be a race with cleanup?

That was a red herring - it's partly my test case doing something a bit
weird. This crash is caused by doing an mmap of a HEAP object before any
fault has occurred.

drm_gem_shmem_mmap() calls drm_gem_shmem_get_pages() which will populate
bo->base.pages (even if bo->is_heap).

Either we should prevent mapping of HEAP objects, or alternatively
bo->base.pages could be allocated upfront instead of during the first
fault. My preference would be allocating it upfront because optimising
for the case of a HEAP BO which isn't used seems a bit weird. Although
there's still the question of exactly what the behaviour should be of
accessing through the CPU pages which haven't been allocated yet.

Also shmem->pages_use_count needs incrementing to stop
drm_gem_shmem_get_pages() replacing bo->base.pages. I haven't tested
what happens if you mmap *after* the first fault.

Steve
Alyssa Rosenzweig July 25, 2019, 4:13 p.m. UTC | #4
> Either we should prevent mapping of HEAP objects

I'm good with that. AFAIK, HEAP objects shouldn't be (CPU) mmapped
anyway in normal use; if you need them mapped (for debugging etc), it's
no big deal to just.. not set the HEAP flag in debug builds.

Or do you mean GPU mapping?
Steven Price July 25, 2019, 4:28 p.m. UTC | #5
On 25/07/2019 17:13, Alyssa Rosenzweig wrote:
>> Either we should prevent mapping of HEAP objects
> 
> I'm good with that. AFAIK, HEAP objects shouldn't be (CPU) mmapped
> anyway in normal use; if you need them mapped (for debugging etc), it's
> no big deal to just.. not set the HEAP flag in debug builds.
> 
> Or do you mean GPU mapping?

Sorry, I was being sloppy again![1] I meant CPU mmapped. I agree there
isn't a strong use case for it.

I've been investigating/testing Panfrost kernel with the Arm Mali blob.
Apparently the blob in some cases creates a SAME_VA GROW_ON_GPF buffer -
since SAME_VA means permanently mapped on the CPU this translated to
mmapping a HEAP object. Why it does this I've no idea.

So I've got an interest in trying to maintain compatibility. kbase
places no restriction on mmapping buffers. The main use in the blob for
this is being able to dump buffers when debugging (i.e. dump buffers
before/after every GPU job). Ideally you also need a way of querying
which pages have been backed by faults (much easier with kbase where
that's always just the number of pages).

Steve

[1] kbase+the blob have different terminology here to Panfrost, I do
sometimes struggle with the idea of "not mapped on the GPU" - it's not
really a concept in kbase. All buffers are "mapped" - they just might be
"growable" and not yet full size... :) Hence "mapped" refers to the CPU.
Alyssa Rosenzweig July 25, 2019, 5:40 p.m. UTC | #6
> Sorry, I was being sloppy again![1] I meant CPU mmapped. 

No worries, just wanted to check :)

> Apparently the blob in some cases creates a SAME_VA GROW_ON_GPF buffer -
> since SAME_VA means permanently mapped on the CPU this translated to
> mmapping a HEAP object. Why it does this I've no idea.

I'm not sure I follow. Conceptually, if you're permanently mapped,
there's nothing to grow, right? Is there a reason not to just disable
HEAP in this cases, i.e.:

	if (flags & SAME_VA)
		flags &= ~GROW_ON_GPF;

It may not be fully optimal, but that way the legacy code keeps working
and upstream userspace isn't held back :)

> The main use in the blob for
> this is being able to dump buffers when debugging (i.e. dump buffers
> before/after every GPU job). 

Could we disable HEAP support in userspace (not setting the flags) for
debug builds that need to dump buffers? In production the extra memory
usage matters, hence this patch, but in dev, there's plenty of memory to
spare.

> Ideally you also need a way of querying which pages have been backed
> by faults (much easier with kbase where that's always just the number
> of pages).

Is there a use case for this with one of the userland APIs? (Maybe
Vulkan?)
Rob Herring July 25, 2019, 9:11 p.m. UTC | #7
On Thu, Jul 25, 2019 at 7:08 AM Robin Murphy <robin.murphy@arm.com> wrote:
>
> Hi Rob,
>
> On 25/07/2019 02:10, Rob Herring wrote:
> [...]
> > @@ -328,6 +427,18 @@ static irqreturn_t panfrost_mmu_irq_handler(int irq, void *data)
> >               access_type = (fault_status >> 8) & 0x3;
> >               source_id = (fault_status >> 16);
> >
> > +             /* Page fault only */
> > +             if ((status & mask) == BIT(i)) {
> > +                     WARN_ON(exception_type < 0xC1 || exception_type > 0xC4);
> > +
> > +                     ret = panfrost_mmu_map_fault_addr(pfdev, i, addr);
> > +                     if (!ret) {
> > +                             mmu_write(pfdev, MMU_INT_CLEAR, BIT(i));
> > +                             status &= ~mask;
> > +                             continue;
> > +                     }
> > +             }
> > +
> >               /* terminal fault, print info about the fault */
> >               dev_err(pfdev->dev,
> >                       "Unhandled Page fault in AS%d at VA 0x%016llX\n"
> > @@ -368,8 +479,9 @@ int panfrost_mmu_init(struct panfrost_device *pfdev)
> >       if (irq <= 0)
> >               return -ENODEV;
> >
> > -     err = devm_request_irq(pfdev->dev, irq, panfrost_mmu_irq_handler,
> > -                            IRQF_SHARED, "mmu", pfdev);
> > +     err = devm_request_threaded_irq(pfdev->dev, irq, NULL,
> > +                                     panfrost_mmu_irq_handler,
> > +                                     IRQF_ONESHOT, "mmu", pfdev);
>
> The change of flags here breaks platforms using a single shared
> interrupt line.

Do they exist? I think this was largely copy-n-paste leftover from the
lima driver where utgard has a bunch of irqs and so they get combined.
While it's possible certainly, I'd like to avoid having to do further
rework either to use a workqueue or we need a single driver handler
which then dispatches the sub handlers. The problem is threaded irq
handlers don't work with shared irqs.

Rob
Rob Herring July 25, 2019, 9:28 p.m. UTC | #8
On Thu, Jul 25, 2019 at 9:35 AM Steven Price <steven.price@arm.com> wrote:
>
> On 25/07/2019 15:59, Steven Price wrote:
> [...]
> > It would appear that in the following call sgt==NULL:
> >>      ret = sg_alloc_table_from_pages(sgt, pages + page_offset,
> >>                                      NUM_FAULT_PAGES, 0, SZ_2M, GFP_KERNEL);
> >
> > Which means we've ended up with a BO with bo->sgt==NULL, bo->pages set
> > and bo->is_heap=true. My understanding is this should be impossible.
> >
> > I haven't yet figured out how this happens - it seems to be just before
> > termination, so it might be a race with cleanup?
>
> That was a red herring - it's partly my test case doing something a bit
> weird. This crash is caused by doing an mmap of a HEAP object before any
> fault has occurred.

My brother Red is always causing problems. ;)

But you were right that I need to kfree bo->sgts. Additionally, I also
need to call sg_free_table on each table.

> drm_gem_shmem_mmap() calls drm_gem_shmem_get_pages() which will populate
> bo->base.pages (even if bo->is_heap).
>
> Either we should prevent mapping of HEAP objects, or alternatively
> bo->base.pages could be allocated upfront instead of during the first
> fault. My preference would be allocating it upfront because optimising
> for the case of a HEAP BO which isn't used seems a bit weird. Although
> there's still the question of exactly what the behaviour should be of
> accessing through the CPU pages which haven't been allocated yet.
>
> Also shmem->pages_use_count needs incrementing to stop
> drm_gem_shmem_get_pages() replacing bo->base.pages. I haven't tested
> what happens if you mmap *after* the first fault.

I did say mmap had undefined/unknown behavior.

Rob
Steven Price July 26, 2019, 9:15 a.m. UTC | #9
On 25/07/2019 22:11, Rob Herring wrote:
> On Thu, Jul 25, 2019 at 7:08 AM Robin Murphy <robin.murphy@arm.com> wrote:
>>
>> Hi Rob,
>>
>> On 25/07/2019 02:10, Rob Herring wrote:
>> [...]
>>> @@ -328,6 +427,18 @@ static irqreturn_t panfrost_mmu_irq_handler(int irq, void *data)
>>>                access_type = (fault_status >> 8) & 0x3;
>>>                source_id = (fault_status >> 16);
>>>
>>> +             /* Page fault only */
>>> +             if ((status & mask) == BIT(i)) {
>>> +                     WARN_ON(exception_type < 0xC1 || exception_type > 0xC4);
>>> +
>>> +                     ret = panfrost_mmu_map_fault_addr(pfdev, i, addr);
>>> +                     if (!ret) {
>>> +                             mmu_write(pfdev, MMU_INT_CLEAR, BIT(i));
>>> +                             status &= ~mask;
>>> +                             continue;
>>> +                     }
>>> +             }
>>> +
>>>                /* terminal fault, print info about the fault */
>>>                dev_err(pfdev->dev,
>>>                        "Unhandled Page fault in AS%d at VA 0x%016llX\n"
>>> @@ -368,8 +479,9 @@ int panfrost_mmu_init(struct panfrost_device *pfdev)
>>>        if (irq <= 0)
>>>                return -ENODEV;
>>>
>>> -     err = devm_request_irq(pfdev->dev, irq, panfrost_mmu_irq_handler,
>>> -                            IRQF_SHARED, "mmu", pfdev);
>>> +     err = devm_request_threaded_irq(pfdev->dev, irq, NULL,
>>> +                                     panfrost_mmu_irq_handler,
>>> +                                     IRQF_ONESHOT, "mmu", pfdev);
>>
>> The change of flags here breaks platforms using a single shared
>> interrupt line.
> 
> Do they exist? I think this was largely copy-n-paste leftover from the

Good question - of the platforms I know about they all have separate 
IRQs, but kbase does explicitly allow shared interrupts so it's quite 
possible there is a platform out there which does.

> lima driver where utgard has a bunch of irqs and so they get combined.
> While it's possible certainly, I'd like to avoid having to do further
> rework either to use a workqueue or we need a single driver handler
> which then dispatches the sub handlers. The problem is threaded irq
> handlers don't work with shared irqs.

It seems reasonable to me to at least wait until someone identifies a 
platform which needs shared interrupts before embarking on the refactoring.

Steve
Steven Price July 26, 2019, 9:20 a.m. UTC | #10
On 25/07/2019 22:28, Rob Herring wrote:
> On Thu, Jul 25, 2019 at 9:35 AM Steven Price <steven.price@arm.com> wrote:
>>
>> On 25/07/2019 15:59, Steven Price wrote:
>> [...]
>>> It would appear that in the following call sgt==NULL:
>>>>       ret = sg_alloc_table_from_pages(sgt, pages + page_offset,
>>>>                                       NUM_FAULT_PAGES, 0, SZ_2M, GFP_KERNEL);
>>>
>>> Which means we've ended up with a BO with bo->sgt==NULL, bo->pages set
>>> and bo->is_heap=true. My understanding is this should be impossible.
>>>
>>> I haven't yet figured out how this happens - it seems to be just before
>>> termination, so it might be a race with cleanup?
>>
>> That was a red herring - it's partly my test case doing something a bit
>> weird. This crash is caused by doing an mmap of a HEAP object before any
>> fault has occurred.
> 
> My brother Red is always causing problems. ;)
> 
> But you were right that I need to kfree bo->sgts. Additionally, I also
> need to call sg_free_table on each table.
> 
>> drm_gem_shmem_mmap() calls drm_gem_shmem_get_pages() which will populate
>> bo->base.pages (even if bo->is_heap).
>>
>> Either we should prevent mapping of HEAP objects, or alternatively
>> bo->base.pages could be allocated upfront instead of during the first
>> fault. My preference would be allocating it upfront because optimising
>> for the case of a HEAP BO which isn't used seems a bit weird. Although
>> there's still the question of exactly what the behaviour should be of
>> accessing through the CPU pages which haven't been allocated yet.
>>
>> Also shmem->pages_use_count needs incrementing to stop
>> drm_gem_shmem_get_pages() replacing bo->base.pages. I haven't tested
>> what happens if you mmap *after* the first fault.
> 
> I did say mmap had undefined/unknown behavior.

True - and I was surprised to find out my test was actually doing that! 
But crashing the kernel is perhaps a bit excessive!

Avoiding the mmap of HEAP objects, everything runs fine - and the memory 
leaks are much smaller than they used to be :)

Steve
Robin Murphy July 26, 2019, 9:32 a.m. UTC | #11
On 26/07/2019 10:15, Steven Price wrote:
> On 25/07/2019 22:11, Rob Herring wrote:
>> On Thu, Jul 25, 2019 at 7:08 AM Robin Murphy <robin.murphy@arm.com> 
>> wrote:
>>>
>>> Hi Rob,
>>>
>>> On 25/07/2019 02:10, Rob Herring wrote:
>>> [...]
>>>> @@ -328,6 +427,18 @@ static irqreturn_t panfrost_mmu_irq_handler(int 
>>>> irq, void *data)
>>>>                access_type = (fault_status >> 8) & 0x3;
>>>>                source_id = (fault_status >> 16);
>>>>
>>>> +             /* Page fault only */
>>>> +             if ((status & mask) == BIT(i)) {
>>>> +                     WARN_ON(exception_type < 0xC1 || 
>>>> exception_type > 0xC4);
>>>> +
>>>> +                     ret = panfrost_mmu_map_fault_addr(pfdev, i, 
>>>> addr);
>>>> +                     if (!ret) {
>>>> +                             mmu_write(pfdev, MMU_INT_CLEAR, BIT(i));
>>>> +                             status &= ~mask;
>>>> +                             continue;
>>>> +                     }
>>>> +             }
>>>> +
>>>>                /* terminal fault, print info about the fault */
>>>>                dev_err(pfdev->dev,
>>>>                        "Unhandled Page fault in AS%d at VA 0x%016llX\n"
>>>> @@ -368,8 +479,9 @@ int panfrost_mmu_init(struct panfrost_device 
>>>> *pfdev)
>>>>        if (irq <= 0)
>>>>                return -ENODEV;
>>>>
>>>> -     err = devm_request_irq(pfdev->dev, irq, panfrost_mmu_irq_handler,
>>>> -                            IRQF_SHARED, "mmu", pfdev);
>>>> +     err = devm_request_threaded_irq(pfdev->dev, irq, NULL,
>>>> +                                     panfrost_mmu_irq_handler,
>>>> +                                     IRQF_ONESHOT, "mmu", pfdev);
>>>
>>> The change of flags here breaks platforms using a single shared
>>> interrupt line.
>>
>> Do they exist? I think this was largely copy-n-paste leftover from the
> 
> Good question - of the platforms I know about they all have separate 
> IRQs, but kbase does explicitly allow shared interrupts so it's quite 
> possible there is a platform out there which does.
> 
>> lima driver where utgard has a bunch of irqs and so they get combined.
>> While it's possible certainly, I'd like to avoid having to do further
>> rework either to use a workqueue or we need a single driver handler
>> which then dispatches the sub handlers. The problem is threaded irq
>> handlers don't work with shared irqs.
> 
> It seems reasonable to me to at least wait until someone identifies a 
> platform which needs shared interrupts before embarking on the refactoring.

I don't know about real silicon, but it's certainly true of our internal 
FPGA images that I've been playing with (the Juno board only has a 
single IRQ line for the entire FPGA site). That's obviously not a major 
priority for upstream, though, so I can hack around it if it's not 
otherwise justified.

Robin.
Steven Price July 26, 2019, 10:43 a.m. UTC | #12
On 25/07/2019 18:40, Alyssa Rosenzweig wrote:
>> Sorry, I was being sloppy again![1] I meant CPU mmapped.
> 
> No worries, just wanted to check :)
> 
>> Apparently the blob in some cases creates a SAME_VA GROW_ON_GPF buffer -
>> since SAME_VA means permanently mapped on the CPU this translated to
>> mmapping a HEAP object. Why it does this I've no idea.
> 
> I'm not sure I follow. Conceptually, if you're permanently mapped,
> there's nothing to grow, right? Is there a reason not to just disable
> HEAP in this cases, i.e.:
> 
> 	if (flags & SAME_VA)
> 		flags &= ~GROW_ON_GPF;
> 
> It may not be fully optimal, but that way the legacy code keeps working
> and upstream userspace isn't held back :)

Yes, that's my hack at the moment and it works. It looks like the driver 
might allocate a depth or stencil buffer without knowing whether it 
will be used. The buffer is then "grown" if it is needed by the GPU. The 
problem is that it is possible for the application to access it later.

>> The main use in the blob for
>> this is being able to dump buffers when debugging (i.e. dump buffers
>> before/after every GPU job).
> 
> Could we disable HEAP support in userspace (not setting the flags) for
> debug builds that need to dump buffers? In production the extra memory
> usage matters, hence this patch, but in dev, there's plenty of memory to
> spare.
> 
>> Ideally you also need a way of querying which pages have been backed
>> by faults (much easier with kbase where that's always just the number
>> of pages).
> 
> Is there a use case for this with one of the userland APIs? (Maybe
> Vulkan?)

I'm not aware of OpenGL(ES) APIs that expose functionality like this. 
But e.g. allocating a buffer ahead of time for depth/stencil "just in 
case" would need something like this because you may need CPU access to it.

Vulkan has the concept of "sparse" bindings/residency. As far as I'm 
aware there's no requirement that memory is allocated on demand, but a 
page-by-page approach to populating memory is expected. There's quite a 
bit of complexity and the actual way this is represented on the GPU 
doesn't necessarily match the user visible API. Also I believe it's an 
optional feature.

Panfrost, of course, doesn't yet have a good mechanism for supporting 
anything like SAME_VA. My hack so far is to keep allocating BOs until it 
happens to land at an address currently unused in user space.

OpenCL does require something like SAME_VA ("Shared Virtual Memory" or 
SVM). This is apparently useful because the same pointer can be used on 
both CPU and GPU.

I can see two approaches for integrating that:

* Use HMM: CPU VA==GPU VA. This nicely solves the problem, but falls 
over badly when the GPU VA size is smaller than the user space VA size - 
which is sadly true on many 64 bit integrations.

* Provide an allocation flag which causes the kernel driver to not pick 
a GPU address until the buffer is mapped on the CPU. The mmap callback 
would then need to look for a region that is free on both the CPU and GPU.

The second is obviously most similar to the kbase approach. kbase 
simplifies things because the kernel driver has the ultimate say over 
whether the buffer is SAME_VA or not. So on 64 bit user space we upgrade 
everything to be SAME_VA - which means the GPU VA space just follows the 
CPU VA (similar to HMM).

Steve
Alyssa Rosenzweig July 26, 2019, 1:57 p.m. UTC | #13
> It looks like the driver might be allocated a depth or stencil buffer
> without knowing whether it will be used. The buffer is then "grown" if
> it is needed by the GPU. The problem is it is possible for the
> application to access it later.

Hmm. I "think" you should be able to use a dummy (unbacked) Z/S buffer
when it won't be used, and as soon as the *driver* decides it will be
used (e.g. by setting the MALI_MFBD_DEPTH_WRITE bit), *that* is when you
allocate a real memory-backed BO. Neither case needs to be growable;
growable just pushes the logic into kernelspace (instead of handling it
in userspace).

The only wrinkle is if you need to give out addresses a priori, but that
could be solved by a mechanism to mmap a BO to a particular CPU address,
I think. (I recall MEM_ALIAS in kbase might be relevant?)

> * Use HMM: CPU VA==GPU VA. This nicely solves the problem, but falls over
> badly when the GPU VA size is smaller than the user space VA size - which is
> sadly true on many 64 bit integrations.
> 
> * Provide an allocation flag which causes the kernel driver to not pick a
> GPU address until the buffer is mapped on the CPU. The mmap callback would
> then need to look for a region that is free on both the CPU and GPU.
> 
> The second is obviously most similar to the kbase approach. kbase simplifies
> things because the kernel driver has the ultimate say over whether the
> buffer is SAME_VA or not. So on 64 bit user space we upgrade everything to
> be SAME_VA - which means the GPU VA space just follows the CPU VA (similar
> to HMM).

I'll let Rob chime in on this one. Thank you for the detailed write-up!

-Alyssa
Rob Herring July 26, 2019, 4:11 p.m. UTC | #14
On Fri, Jul 26, 2019 at 3:15 AM Steven Price <steven.price@arm.com> wrote:
>
> On 25/07/2019 22:11, Rob Herring wrote:
> > On Thu, Jul 25, 2019 at 7:08 AM Robin Murphy <robin.murphy@arm.com> wrote:
> >>
> >> Hi Rob,
> >>
> >> On 25/07/2019 02:10, Rob Herring wrote:
> >> [...]
> >>> @@ -328,6 +427,18 @@ static irqreturn_t panfrost_mmu_irq_handler(int irq, void *data)
> >>>                access_type = (fault_status >> 8) & 0x3;
> >>>                source_id = (fault_status >> 16);
> >>>
> >>> +             /* Page fault only */
> >>> +             if ((status & mask) == BIT(i)) {
> >>> +                     WARN_ON(exception_type < 0xC1 || exception_type > 0xC4);
> >>> +
> >>> +                     ret = panfrost_mmu_map_fault_addr(pfdev, i, addr);
> >>> +                     if (!ret) {
> >>> +                             mmu_write(pfdev, MMU_INT_CLEAR, BIT(i));
> >>> +                             status &= ~mask;
> >>> +                             continue;
> >>> +                     }
> >>> +             }
> >>> +
> >>>                /* terminal fault, print info about the fault */
> >>>                dev_err(pfdev->dev,
> >>>                        "Unhandled Page fault in AS%d at VA 0x%016llX\n"
> >>> @@ -368,8 +479,9 @@ int panfrost_mmu_init(struct panfrost_device *pfdev)
> >>>        if (irq <= 0)
> >>>                return -ENODEV;
> >>>
> >>> -     err = devm_request_irq(pfdev->dev, irq, panfrost_mmu_irq_handler,
> >>> -                            IRQF_SHARED, "mmu", pfdev);
> >>> +     err = devm_request_threaded_irq(pfdev->dev, irq, NULL,
> >>> +                                     panfrost_mmu_irq_handler,
> >>> +                                     IRQF_ONESHOT, "mmu", pfdev);
> >>
> >> The change of flags here breaks platforms using a single shared
> >> interrupt line.
> >
> > Do they exist? I think this was largely copy-n-paste leftover from the
>
> Good question - of the platforms I know about they all have separate
> IRQs, but kbase does explicitly allow shared interrupts so it's quite
> possible there is a platform out there which does.
>
> > lima driver where utgard has a bunch of irqs and so they get combined.
> > While it's possible certainly, I'd like to avoid having to do further
> > rework either to use a workqueue or we need a single driver handler
> > which then dispatches the sub handlers. The problem is threaded irq
> > handlers don't work with shared irqs.
>
> It seems reasonable to me to at least wait until someone identifies a
> platform which needs shared interrupts before embarking on the refactoring.

And now someone has on irc...

It may not be as much rework as I thought. We have to set IRQF_ONESHOT
and the requirement is all shared handlers' flags have to match. The
GPU and job ISRs are not critical paths, so they can be threaded too.
I'll test this out.
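
Something along these lines, assuming the GPU and job ISRs are converted the
same way so that all sharers' flags match (a sketch only, untested):

	err = devm_request_threaded_irq(pfdev->dev, irq, NULL,
					panfrost_mmu_irq_handler,
					IRQF_SHARED | IRQF_ONESHOT, "mmu", pfdev);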

Rob
Rob Herring July 30, 2019, 6:49 p.m. UTC | #15
On Mon, Jul 29, 2019 at 1:18 AM Steven Price <steven.price@arm.com> wrote:
>
> On 25/07/2019 18:40, Alyssa Rosenzweig wrote:
> >> Sorry, I was being sloppy again![1] I meant CPU mmapped.
> >
> > No worries, just wanted to check :)
> >
> >> Apparently the blob in some cases creates a SAME_VA GROW_ON_GPF buffer -
> >> since SAME_VA means permanently mapped on the CPU this translated to
> >> mmapping a HEAP object. Why it does this I've no idea.
> >
> > I'm not sure I follow. Conceptually, if you're permanently mapped,
> > there's nothing to grow, right? Is there a reason not to just disable
> > HEAP in this cases, i.e.:
> >
> >       if (flags & SAME_VA)
> >               flags &= ~GROW_ON_GPF;
> >
> > It may not be fully optimal, but that way the legacy code keeps working
> > and upstream userspace isn't held back :)
>
> Yes, that's my hack at the moment and it works. It looks like the driver
> might be allocated a depth or stencil buffer without knowing whether it
> will be used. The buffer is then "grown" if it is needed by the GPU. The
> problem is it is possible for the application to access it later.
>
> >> The main use in the blob for
> >> this is being able to dump buffers when debugging (i.e. dump buffers
> >> before/after every GPU job).
> >
> > Could we disable HEAP support in userspace (not setting the flags) for
> > debug builds that need to dump buffers? In production the extra memory
> > usage matters, hence this patch, but in dev, there's plenty of memory to
> > spare.
> >
> >> Ideally you also need a way of querying which pages have been backed
> >> by faults (much easier with kbase where that's always just the number
> >> of pages).
> >
> > Is there a use case for this with one of the userland APIs? (Maybe
> > Vulkan?)
>
> I'm not aware of OpenGL(ES) APIs that expose functionality like this.
> But e.g. allocating a buffer ahead of time for depth/stencil "just in
> case" would need something like this because you may need CPU access to it.
>
> Vulkan has the concept of "sparse" bindings/residency. As far as I'm
> aware there's no requirement that memory is allocated on demand, but a
> page-by-page approach to populating memory is expected. There's quite a
> bit of complexity and the actual way this is represented on the GPU
> doesn't necessarily match the user visible API. Also I believe it's an
> optional feature.
>
> Panfrost, of course, doesn't yet have a good mechanism for supporting
> anything like SAME_VA. My hack so far is to keep allocating BOs until it
> happens to land at an address currently unused in user space.
>
> OpenCL does require something like SAME_VA ("Shared Virtual Memory" or
> SVM). This is apparently useful because the same pointer can be used on
> both CPU and GPU.
>
> I can see two approaches for integrating that:
>
> * Use HMM: CPU VA==GPU VA. This nicely solves the problem, but falls
> over badly when the GPU VA size is smaller than the user space VA size -
> which is sadly true on many 64 bit integrations.

If mmap limits CPU addresses to the GPU VA size to start with,
wouldn't that solve the problem (at least to the extent it is
solvable)?

From a brief read, while HMM supports page table mirroring, it seems
more geared towards supporting discrete graphics memory. It also
mentions that it avoids pinning all pages in memory, but that's kind
of an assumption of GEM objects (I'm kind of working against that with
the heap support). Or at least all the common GEM helpers work that
way.

> * Provide an allocation flag which causes the kernel driver to not pick
> a GPU address until the buffer is mapped on the CPU. The mmap callback
> would then need to look for a region that is free on both the CPU and GPU.

Using mmap seems like a neat solution compared to how other drivers
handle this, which is yet another ioctl to set the GPU VA. In those
cases, all the GPU VA space management is in userspace, which seems
backwards to me. I'm not sure if there are any downsides to using mmap
though.

In any case, per process AS is a prerequisite to all this. That's
probably the bigger chunk of work and still lower priority than not
running out of memory. :)

Rob
Alyssa Rosenzweig July 30, 2019, 6:54 p.m. UTC | #16
> In any case, per process AS is a prerequisite to all this.

Oh, I hadn't realized that was still a todo. In the meantime, what's the
implication of shipping without it? (I.e. in which threat model are
users vulnerable without it?) Malicious userspace process snooping on
other framebuffers (on X11, they could do that anyway...)? Malicious
userspace actually interfering with operation of other processes (is
this really exploitable or just a theoretical concern)? Malicious 3D
apps breaking out of the sandbox (i.e. WebGL) via a driver bug and
snooping on other processes?

---

*wishing we could just delete webgl intensifies*
Rob Herring July 30, 2019, 7:08 p.m. UTC | #17
On Tue, Jul 30, 2019 at 12:55 PM Alyssa Rosenzweig
<alyssa.rosenzweig@collabora.com> wrote:
>
> > In any case, per process AS is a prerequisite to all this.
>
> Oh, I hadn't realized that was still a todo. In the meantime, what's the
> implication of shipping without it? (I.e. in which threat model are
> users vulnerable without it?) Malicious userspace process snooping on
> other framebuffers (on X11, they could do that anyway...)? Malicious
> userspace actually interfering with operation of other processes (is
> this really exploitable or just a theoretical concern)? Malicious 3D
> apps breaking out of the sandbox (i.e. WebGL) via a driver bug and
> snooping on other processes?

I don't know. However, it's not that uncommon. freedreno is only now
in the process of supporting it. vc4 can't. v3d doesn't yet support
separate address spaces.

Rob
Rob Herring July 30, 2019, 8:03 p.m. UTC | #18
On Thu, Jul 25, 2019 at 9:35 AM Steven Price <steven.price@arm.com> wrote:
>
> On 25/07/2019 15:59, Steven Price wrote:
> [...]
> > It would appear that in the following call sgt==NULL:
> >>      ret = sg_alloc_table_from_pages(sgt, pages + page_offset,
> >>                                      NUM_FAULT_PAGES, 0, SZ_2M, GFP_KERNEL);
> >
> > Which means we've ended up with a BO with bo->sgt==NULL, bo->pages set
> > and bo->is_heap=true. My understanding is this should be impossible.
> >
> > I haven't yet figured out how this happens - it seems to be just before
> > termination, so it might be a race with cleanup?
>
> That was a red herring - it's partly my test case doing something a bit
> weird. This crash is caused by doing an mmap of a HEAP object before any
> fault has occurred.
>
> drm_gem_shmem_mmap() calls drm_gem_shmem_get_pages() which will populate
> bo->base.pages (even if bo->is_heap).
>
> Either we should prevent mapping of HEAP objects, or alternatively
> bo->base.pages could be allocated upfront instead of during the first
> fault. My preference would be allocating it upfront because optimising
> for the case of a HEAP BO which isn't used seems a bit weird. Although
> there's still the question of exactly what the behaviour should be of
> accessing through the CPU pages which haven't been allocated yet.

Since preventing userspace from getting the mmap fake offset should be
sufficient, I'm planning on doing this change:

diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c
b/drivers/gpu/drm/panfrost/panfrost_drv.c
index 746eb4603bc2..186d5db892a9 100644
--- a/drivers/gpu/drm/panfrost/panfrost_drv.c
+++ b/drivers/gpu/drm/panfrost/panfrost_drv.c
@@ -270,6 +270,10 @@ static int panfrost_ioctl_mmap_bo(struct
drm_device *dev, void *data,
                return -ENOENT;
        }

+       /* Don't allow mmapping of heap objects as pages are not pinned. */
+       if (to_panfrost_bo(gem_obj)->is_heap)
+               return -EINVAL;
+
        ret = drm_gem_create_mmap_offset(gem_obj);
        if (ret == 0)
                args->offset = drm_vma_node_offset_addr(&gem_obj->vma_node);


> Also shmem->pages_use_count needs incrementing to stop
> drm_gem_shmem_get_pages() replacing bo->base.pages. I haven't tested
> what happens if you mmap *after* the first fault.

I set pages_use_count to 1 when we allocate pages on the first fault.
Or do you mean we need to set it on creation in case
drm_gem_shmem_get_pages() is called before any GPU faults?

Either way, that just shifts how/where we crash I think. We need to
prevent drm_gem_shmem_get_pages() from being called. Besides mmap, the
other cases are vmap and exporting. I don't think we have any paths
that will cause vmap to get called in our case. For exporting, perhaps
we need a wrapper around drm_gem_shmem_pin() to prevent it.
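
Something like this, perhaps (a sketch only; assumes the panfrost GEM object
funcs table can point .pin at such a wrapper):

	static int panfrost_gem_pin(struct drm_gem_object *obj)
	{
		if (to_panfrost_bo(obj)->is_heap)
			return -EINVAL;

		return drm_gem_shmem_pin(obj);
	}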

Rob
Steven Price July 31, 2019, 9:22 a.m. UTC | #19
On 30/07/2019 20:08, Rob Herring wrote:
> On Tue, Jul 30, 2019 at 12:55 PM Alyssa Rosenzweig
> <alyssa.rosenzweig@collabora.com> wrote:
>>
>>> In any case, per process AS is a prerequisite to all this.
>>
>> Oh, I hadn't realized that was still a todo. In the meantime, what's the
>> implication of shipping without it? (I.e. in which threat model are
>> users vulnerable without it?) Malicious userspace process snooping on
>> other framebuffers (on X11, they could do that anyway...)? Malicious
>> userspace actually interfering with operation of other processes (is
>> this really exploitable or just a theoretical concern)? Malicious 3D
>> apps breaking out of the sandbox (i.e. WebGL) via a driver bug and
>> snooping on other processes?

Having a single address space means:

Malicious userspace can: snoop and overwrite graphics buffers from other
applications. I don't believe you can directly map the buffer into the
other application, but the GPU provides plenty of functionality to
implement a memcpy-like shader which can copy to/from buffers you don't own.

WebGL: In *theory* the driver should ensure that any buffer accesses are
contained before letting the shaders run. So this shouldn't actually
allow breaking out of the sandbox. This is because the browser may use
one process for multiple tabs (and one process = one AS in most cases)
and they should be sandboxed. But obviously it only requires one driver
bug for this to break as well.


Also note that if the driver "trusts" any of the data shared with the
GPU (e.g. follows pointers stored in GPU-accessible memory or uses
values as array indices without bounds checking) then a malicious
userspace may be able to inject code into another process. So this could
be a privilege escalation route.

> I don't know. However, it's not that uncommon. freedreno is only now
> in the process of supporting it. vc4 can't. v3d doesn't yet support
> separate address spaces.

Indeed it doesn't seem to actually concern many people. For example
almost all GPUs allow any process to effectively DoS the GPU by
submitting extremely long running jobs. Most OSes just reset the GPU in
this case - which might take work from another process with it. Although
kbase actually does try fairly hard to prevent that.

Although somewhat amusingly, you might have noticed this lovely
workaround in kbase:

> 	if (kbase_hw_has_issue(kbdev, BASE_HW_ISSUE_8987)) {
> 		bool use_workaround;
> 
> 		use_workaround = DEFAULT_SECURE_BUT_LOSS_OF_PERFORMANCE;
> 		if (use_workaround) {
> 			dev_dbg(kbdev->dev, "GPU has HW ISSUE 8987, and driver configured for security workaround: 1 address space only");
> 			kbdev->nr_user_address_spaces = 1;
> 		}
> 	}

I seriously doubt anyone ever set
DEFAULT_SECURE_BUT_LOSS_OF_PERFORMANCE... But this was a hardware bug
that broke the isolation between address spaces.

Steve
Steven Price July 31, 2019, 9:30 a.m. UTC | #20
On 30/07/2019 21:03, Rob Herring wrote:
> On Thu, Jul 25, 2019 at 9:35 AM Steven Price <steven.price@arm.com> wrote:
>>
>> On 25/07/2019 15:59, Steven Price wrote:
>> [...]
>>> It would appear that in the following call sgt==NULL:
>>>>      ret = sg_alloc_table_from_pages(sgt, pages + page_offset,
>>>>                                      NUM_FAULT_PAGES, 0, SZ_2M, GFP_KERNEL);
>>>
>>> Which means we've ended up with a BO with bo->sgt==NULL, bo->pages set
>>> and bo->is_heap=true. My understanding is this should be impossible.
>>>
>>> I haven't yet figured out how this happens - it seems to be just before
>>> termination, so it might be a race with cleanup?
>>
>> That was a red herring - it's partly my test case doing something a bit
>> weird. This crash is caused by doing an mmap of a HEAP object before any
>> fault has occurred.
>>
>> drm_gem_shmem_mmap() calls drm_gem_shmem_get_pages() which will populate
>> bo->base.pages (even if bo->is_heap).
>>
>> Either we should prevent mapping of HEAP objects, or alternatively
>> bo->base.pages could be allocated upfront instead of during the first
>> fault. My preference would be allocating it upfront because optimising
>> for the case of a HEAP BO which isn't used seems a bit weird. Although
>> there's still the question of exactly what the behaviour should be of
>> accessing through the CPU pages which haven't been allocated yet.
> 
> As preventing getting the mmap fake offset should be sufficient, I'm
> planning on doing this change:
> 
> diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c
> b/drivers/gpu/drm/panfrost/panfrost_drv.c
> index 746eb4603bc2..186d5db892a9 100644
> --- a/drivers/gpu/drm/panfrost/panfrost_drv.c
> +++ b/drivers/gpu/drm/panfrost/panfrost_drv.c
> @@ -270,6 +270,10 @@ static int panfrost_ioctl_mmap_bo(struct
> drm_device *dev, void *data,
>                 return -ENOENT;
>         }
> 
> +       /* Don't allow mmapping of heap objects as pages are not pinned. */
> +       if (to_panfrost_bo(gem_obj)->is_heap)
> +               return -EINVAL;
> +
>         ret = drm_gem_create_mmap_offset(gem_obj);
>         if (ret == 0)
>                 args->offset = drm_vma_node_offset_addr(&gem_obj->vma_node);
> 

Seems reasonable to me - we can always add support for mmap()ing at a
later date if it becomes useful.

>> Also shmem->pages_use_count needs incrementing to stop
>> drm_gem_shmem_get_pages() replacing bo->base.pages. I haven't tested
>> what happens if you mmap *after* the first fault.
> 
> I set pages_use_count to 1 when we allocate pages on the first fault.
> Or do you mean we need to set it on creation in case
> drm_gem_shmem_get_pages() is called before any GPU faults?
> 
> Either way, that just shifts how/where we crash I think. We need to
> prevent drm_gem_shmem_get_pages() from being called. Besides mmap, the
> other cases are vmap and exporting. I don't think we have any paths
> that will cause vmap to get called in our case. For exporting, perhaps
> we need a wrapper around drm_gem_shmem_pin() to prevent it.

Yes, either every path to drm_gem_shmem_get_pages() needs to be blocked
for HEAP objects, or you need to set pages_use_count to 1 when you
allocate (which then means drm_gem_shmem_get_pages() will simply
increment the ref-count).

Of course if you are leaving any calls to drm_gem_shmem_get_pages()
reachable then you also need to ensure that the code that follows
understands how to deal with a sparse bo->pages array. Exporting would
be a good example - and again I suspect just preventing it is fine for now.

Steve

Patch

diff --git a/drivers/gpu/drm/panfrost/TODO b/drivers/gpu/drm/panfrost/TODO
index c2e44add37d8..64129bf73933 100644
--- a/drivers/gpu/drm/panfrost/TODO
+++ b/drivers/gpu/drm/panfrost/TODO
@@ -14,8 +14,6 @@ 
   The hard part is handling when more address spaces are needed than what
   the h/w provides.

-- Support pinning pages on demand (GPU page faults).
-
 - Support userspace controlled GPU virtual addresses. Needed for Vulkan. (Tomeu)

 - Support for madvise and a shrinker.
diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c
index 7ebd82d8d570..46a6bec7a0f2 100644
--- a/drivers/gpu/drm/panfrost/panfrost_drv.c
+++ b/drivers/gpu/drm/panfrost/panfrost_drv.c
@@ -50,7 +50,12 @@  static int panfrost_ioctl_create_bo(struct drm_device *dev, void *data,
 	struct drm_panfrost_create_bo *args = data;

 	if (!args->size || args->pad ||
-	    (args->flags & ~PANFROST_BO_NOEXEC))
+	    (args->flags & ~(PANFROST_BO_NOEXEC | PANFROST_BO_HEAP)))
+		return -EINVAL;
+
+	/* Heaps should never be executable */
+	if ((args->flags & PANFROST_BO_HEAP) &&
+	    !(args->flags & PANFROST_BO_NOEXEC))
 		return -EINVAL;

 	bo = panfrost_gem_create_with_handle(file, dev, args->size, args->flags,
diff --git a/drivers/gpu/drm/panfrost/panfrost_gem.c b/drivers/gpu/drm/panfrost/panfrost_gem.c
index 63731f6c5223..1237fb531321 100644
--- a/drivers/gpu/drm/panfrost/panfrost_gem.c
+++ b/drivers/gpu/drm/panfrost/panfrost_gem.c
@@ -27,6 +27,17 @@  static void panfrost_gem_free_object(struct drm_gem_object *obj)
 		drm_mm_remove_node(&bo->node);
 	spin_unlock(&pfdev->mm_lock);

+	if (bo->sgts) {
+		int i;
+		int n_sgt = bo->base.base.size / SZ_2M;
+
+		for (i = 0; i < n_sgt; i++) {
+			if (bo->sgts[i].sgl)
+				dma_unmap_sg(pfdev->dev, bo->sgts[i].sgl,
+					     bo->sgts[i].nents, DMA_BIDIRECTIONAL);
+		}
+	}
+
 	drm_gem_shmem_free_object(obj);
 }

@@ -87,7 +98,10 @@  static int panfrost_gem_map(struct panfrost_device *pfdev, struct panfrost_gem_o
 	if (ret)
 		return ret;

-	return panfrost_mmu_map(bo);
+	if (!bo->is_heap)
+		ret = panfrost_mmu_map(bo);
+
+	return ret;
 }

 struct panfrost_gem_object *
@@ -101,7 +115,11 @@  panfrost_gem_create_with_handle(struct drm_file *file_priv,
 	struct drm_gem_shmem_object *shmem;
 	struct panfrost_gem_object *bo;

-	size = roundup(size, PAGE_SIZE);
+	/* Round up heap allocations to 2MB to keep fault handling simple */
+	if (flags & PANFROST_BO_HEAP)
+		size = roundup(size, SZ_2M);
+	else
+		size = roundup(size, PAGE_SIZE);

 	shmem = drm_gem_shmem_create_with_handle(file_priv, dev, size, handle);
 	if (IS_ERR(shmem))
@@ -109,6 +127,7 @@  panfrost_gem_create_with_handle(struct drm_file *file_priv,

 	bo = to_panfrost_bo(&shmem->base);
 	bo->noexec = !!(flags & PANFROST_BO_NOEXEC);
+	bo->is_heap = !!(flags & PANFROST_BO_HEAP);

 	ret = panfrost_gem_map(pfdev, bo);
 	if (ret)
diff --git a/drivers/gpu/drm/panfrost/panfrost_gem.h b/drivers/gpu/drm/panfrost/panfrost_gem.h
index 132f02399b7b..b628c9b67784 100644
--- a/drivers/gpu/drm/panfrost/panfrost_gem.h
+++ b/drivers/gpu/drm/panfrost/panfrost_gem.h
@@ -9,10 +9,12 @@ 

 struct panfrost_gem_object {
 	struct drm_gem_shmem_object base;
+	struct sg_table *sgts;

 	struct drm_mm_node node;
 	bool is_mapped		:1;
 	bool noexec		:1;
+	bool is_heap		:1;
 };

 static inline
@@ -21,6 +23,12 @@  struct  panfrost_gem_object *to_panfrost_bo(struct drm_gem_object *obj)
 	return container_of(to_drm_gem_shmem_obj(obj), struct panfrost_gem_object, base);
 }

+static inline
+struct  panfrost_gem_object *drm_mm_node_to_panfrost_bo(struct drm_mm_node *node)
+{
+	return container_of(node, struct panfrost_gem_object, node);
+}
+
 struct drm_gem_object *panfrost_gem_create_object(struct drm_device *dev, size_t size);

 struct panfrost_gem_object *
diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c
index eba6ce785ef0..d164eeaf39be 100644
--- a/drivers/gpu/drm/panfrost/panfrost_mmu.c
+++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c
@@ -2,6 +2,7 @@ 
 /* Copyright 2019 Linaro, Ltd, Rob Herring <robh@kernel.org> */
 #include <linux/bitfield.h>
 #include <linux/delay.h>
+#include <linux/dma-mapping.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/iopoll.h>
@@ -9,6 +10,7 @@ 
 #include <linux/iommu.h>
 #include <linux/platform_device.h>
 #include <linux/pm_runtime.h>
+#include <linux/shmem_fs.h>
 #include <linux/sizes.h>

 #include "panfrost_device.h"
@@ -235,12 +237,12 @@  void panfrost_mmu_unmap(struct panfrost_gem_object *bo)
 		size_t unmapped_page;
 		size_t pgsize = get_pgsize(iova, len - unmapped_len);

-		unmapped_page = ops->unmap(ops, iova, pgsize);
-		if (!unmapped_page)
-			break;
-
-		iova += unmapped_page;
-		unmapped_len += unmapped_page;
+		if (ops->iova_to_phys(ops, iova)) {
+			unmapped_page = ops->unmap(ops, iova, pgsize);
+			WARN_ON(unmapped_page != pgsize);
+		}
+		iova += pgsize;
+		unmapped_len += pgsize;
 	}

 	mmu_hw_do_operation(pfdev, 0, bo->node.start << PAGE_SHIFT,
@@ -276,6 +278,105 @@  static const struct iommu_gather_ops mmu_tlb_ops = {
 	.tlb_sync	= mmu_tlb_sync_context,
 };

+static struct drm_mm_node *addr_to_drm_mm_node(struct panfrost_device *pfdev, int as, u64 addr)
+{
+	struct drm_mm_node *node;
+	u64 offset = addr >> PAGE_SHIFT;
+
+	drm_mm_for_each_node(node, &pfdev->mm) {
+		if (offset >= node->start && offset < (node->start + node->size))
+			return node;
+	}
+	return NULL;
+}
+
+#define NUM_FAULT_PAGES (SZ_2M / PAGE_SIZE)
+
+int panfrost_mmu_map_fault_addr(struct panfrost_device *pfdev, int as, u64 addr)
+{
+	int ret, i;
+	struct drm_mm_node *node;
+	struct panfrost_gem_object *bo;
+	struct address_space *mapping;
+	pgoff_t page_offset;
+	struct sg_table *sgt;
+	struct page **pages;
+
+	node = addr_to_drm_mm_node(pfdev, as, addr);
+	if (!node)
+		return -ENOENT;
+
+	bo = drm_mm_node_to_panfrost_bo(node);
+	if (!bo->is_heap) {
+		dev_WARN(pfdev->dev, "matching BO is not heap type (GPU VA = %llx)",
+			 node->start << PAGE_SHIFT);
+		return -EINVAL;
+	}
+	/* Assume 2MB alignment and size multiple */
+	addr &= ~((u64)SZ_2M - 1);
+	page_offset = addr >> PAGE_SHIFT;
+	page_offset -= node->start;
+
+	mutex_lock(&bo->base.pages_lock);
+
+	if (!bo->base.pages) {
+		bo->sgts = kvmalloc_array(bo->base.base.size / SZ_2M,
+				     sizeof(struct sg_table), GFP_KERNEL | __GFP_ZERO);
+		if (!bo->sgts)
+			return -ENOMEM;
+
+		pages = kvmalloc_array(bo->base.base.size >> PAGE_SHIFT,
+				       sizeof(struct page *), GFP_KERNEL | __GFP_ZERO);
+		if (!pages) {
+			kfree(bo->sgts);
+			bo->sgts = NULL;
+			return -ENOMEM;
+		}
+		bo->base.pages = pages;
+		bo->base.pages_use_count = 1;
+	} else
+		pages = bo->base.pages;
+
+	mapping = bo->base.base.filp->f_mapping;
+	mapping_set_unevictable(mapping);
+
+	for (i = page_offset; i < page_offset + NUM_FAULT_PAGES; i++) {
+		pages[i] = shmem_read_mapping_page(mapping, i);
+		if (IS_ERR(pages[i])) {
+			mutex_unlock(&bo->base.pages_lock);
+			ret = PTR_ERR(pages[i]);
+			goto err_pages;
+		}
+	}
+
+	mutex_unlock(&bo->base.pages_lock);
+
+	sgt = &bo->sgts[page_offset / (SZ_2M / PAGE_SIZE)];
+	ret = sg_alloc_table_from_pages(sgt, pages + page_offset,
+					NUM_FAULT_PAGES, 0, SZ_2M, GFP_KERNEL);
+	if (ret)
+		goto err_pages;
+
+	if (!dma_map_sg(pfdev->dev, sgt->sgl, sgt->nents, DMA_BIDIRECTIONAL)) {
+		ret = -EINVAL;
+		goto err_map;
+	}
+
+	mmu_map_sg(pfdev, addr, IOMMU_WRITE | IOMMU_READ | IOMMU_NOEXEC, sgt);
+
+	bo->is_mapped = true;
+
+	dev_dbg(pfdev->dev, "mapped page fault @ %llx", addr);
+
+	return 0;
+
+err_map:
+	sg_free_table(sgt);
+err_pages:
+	drm_gem_shmem_put_pages(&bo->base);
+	return ret;
+}
+
 static const char *access_type_name(struct panfrost_device *pfdev,
 		u32 fault_status)
 {
@@ -301,13 +402,11 @@  static irqreturn_t panfrost_mmu_irq_handler(int irq, void *data)
 {
 	struct panfrost_device *pfdev = data;
 	u32 status = mmu_read(pfdev, MMU_INT_STAT);
-	int i;
+	int i, ret;

 	if (!status)
 		return IRQ_NONE;

-	dev_err(pfdev->dev, "mmu irq status=%x\n", status);
-
 	for (i = 0; status; i++) {
 		u32 mask = BIT(i) | BIT(i + 16);
 		u64 addr;
@@ -328,6 +427,18 @@  static irqreturn_t panfrost_mmu_irq_handler(int irq, void *data)
 		access_type = (fault_status >> 8) & 0x3;
 		source_id = (fault_status >> 16);

+		/* Page fault only */
+		if ((status & mask) == BIT(i)) {
+			WARN_ON(exception_type < 0xC1 || exception_type > 0xC4);
+
+			ret = panfrost_mmu_map_fault_addr(pfdev, i, addr);
+			if (!ret) {
+				mmu_write(pfdev, MMU_INT_CLEAR, BIT(i));
+				status &= ~mask;
+				continue;
+			}
+		}
+
 		/* terminal fault, print info about the fault */
 		dev_err(pfdev->dev,
 			"Unhandled Page fault in AS%d at VA 0x%016llX\n"
@@ -368,8 +479,9 @@  int panfrost_mmu_init(struct panfrost_device *pfdev)
 	if (irq <= 0)
 		return -ENODEV;

-	err = devm_request_irq(pfdev->dev, irq, panfrost_mmu_irq_handler,
-			       IRQF_SHARED, "mmu", pfdev);
+	err = devm_request_threaded_irq(pfdev->dev, irq, NULL,
+					panfrost_mmu_irq_handler,
+					IRQF_ONESHOT, "mmu", pfdev);

 	if (err) {
 		dev_err(pfdev->dev, "failed to request mmu irq");
diff --git a/include/uapi/drm/panfrost_drm.h b/include/uapi/drm/panfrost_drm.h
index 17fb5d200f7a..9150dd75aad8 100644
--- a/include/uapi/drm/panfrost_drm.h
+++ b/include/uapi/drm/panfrost_drm.h
@@ -83,6 +83,7 @@  struct drm_panfrost_wait_bo {
 };

 #define PANFROST_BO_NOEXEC	1
+#define PANFROST_BO_HEAP	2

 /**
  * struct drm_panfrost_create_bo - ioctl argument for creating Panfrost BOs.