
[v2,1/7] drm/tegra: Add Tegra DRM allocation API

Message ID 20161214111617.24548-2-mperttunen@nvidia.com (mailing list archive)
State New, archived

Commit Message

Mikko Perttunen Dec. 14, 2016, 11:16 a.m. UTC
Add a new IO virtual memory allocation API to allow clients to
allocate non-GEM memory in the Tegra DRM IOMMU domain. This is
required e.g. for loading client firmware when clients are attached
to the IOMMU domain.

The allocator allocates contiguous physical pages that are then
mapped contiguously to the IOMMU domain using the iova_domain
library provided by the kernel. Contiguous physical pages are
used so that the same allocator works also when IOMMU support
is disabled and therefore devices access physical memory directly.

Signed-off-by: Mikko Perttunen <mperttunen@nvidia.com>
---
 drivers/gpu/drm/tegra/drm.c | 111 +++++++++++++++++++++++++++++++++++++++++---
 drivers/gpu/drm/tegra/drm.h |  11 +++++
 2 files changed, 115 insertions(+), 7 deletions(-)

Comments

Lucas Stach Dec. 14, 2016, 11:35 a.m. UTC | #1
Am Mittwoch, den 14.12.2016, 13:16 +0200 schrieb Mikko Perttunen:
> Add a new IO virtual memory allocation API to allow clients to
> allocate non-GEM memory in the Tegra DRM IOMMU domain. This is
> required e.g. for loading client firmware when clients are attached
> to the IOMMU domain.
> 
> The allocator allocates contiguous physical pages that are then
> mapped contiguously to the IOMMU domain using the iova_domain
> library provided by the kernel. Contiguous physical pages are
> used so that the same allocator works also when IOMMU support
> is disabled and therefore devices access physical memory directly.
> 
Why is this needed? If you use the DMA API for those buffers you should
end up with CMA memory in the !IOMMU case and normal paged memory with
IOMMU enabled. From my understanding this should match the requirements.
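
(For reference, a minimal sketch of the DMA-API-based path suggested here;
the helper names and the client device pointer are purely illustrative,
not the driver's actual code:)

#include <linux/dma-mapping.h>

/*
 * Hypothetical sketch: allocate the firmware buffer through the DMA API on
 * the client device. Without an IOMMU this should come out of CMA, i.e. be
 * physically contiguous; with an IOMMU attached it comes from normal paged
 * memory mapped into the device's IOVA space.
 */
static void *falcon_alloc_firmware(struct device *dev, size_t size,
                                   dma_addr_t *dma)
{
        return dma_alloc_coherent(dev, size, dma, GFP_KERNEL);
}

static void falcon_free_firmware(struct device *dev, size_t size,
                                 void *virt, dma_addr_t dma)
{
        dma_free_coherent(dev, size, virt, dma);
}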

Also how big can those firmware images get? Will __get_free_pages be
able to provide this much contig memory in a long running system with
fragmented memory space? Isn't CMA the better option here?

Regards,
Lucas

Mikko Perttunen Dec. 14, 2016, 12:36 p.m. UTC | #2
On 14.12.2016 13:35, Lucas Stach wrote:
> Am Mittwoch, den 14.12.2016, 13:16 +0200 schrieb Mikko Perttunen:
>> Add a new IO virtual memory allocation API to allow clients to
>> allocate non-GEM memory in the Tegra DRM IOMMU domain. This is
>> required e.g. for loading client firmware when clients are attached
>> to the IOMMU domain.
>>
>> The allocator allocates contiguous physical pages that are then
>> mapped contiguously to the IOMMU domain using the iova_domain
>> library provided by the kernel. Contiguous physical pages are
>> used so that the same allocator works also when IOMMU support
>> is disabled and therefore devices access physical memory directly.
>>
> Why is this needed? If you use the DMA API for those buffers you should
> end up with CMA memory in the !IOMMU case and normal paged memory with
> IOMMU enabled. From my understanding this should match the requirements.

Yes, memory allocated with the DMA API should work as well, but would 
also require passing the device pointer for the device that is 
allocating the memory, which isn't a problem, but this way we don't need 
that.

>
> Also how big can those firmware images get? Will __get_free_pages be
> able to provide this much contig memory in a long running system with
> fragmented memory space? Isn't CMA the better option here?

The largest is about 140 kilobytes. The space is also allocated when the 
device is initialized which usually happens during boot.

>
> Regards,
> Lucas

Thanks,
Mikko.

Lucas Stach Dec. 14, 2016, 12:56 p.m. UTC | #3
Am Mittwoch, den 14.12.2016, 14:36 +0200 schrieb Mikko Perttunen:
> On 14.12.2016 13:35, Lucas Stach wrote:
> > Am Mittwoch, den 14.12.2016, 13:16 +0200 schrieb Mikko Perttunen:
> >> Add a new IO virtual memory allocation API to allow clients to
> >> allocate non-GEM memory in the Tegra DRM IOMMU domain. This is
> >> required e.g. for loading client firmware when clients are attached
> >> to the IOMMU domain.
> >>
> >> The allocator allocates contiguous physical pages that are then
> >> mapped contiguously to the IOMMU domain using the iova_domain
> >> library provided by the kernel. Contiguous physical pages are
> >> used so that the same allocator works also when IOMMU support
> >> is disabled and therefore devices access physical memory directly.
> >>
> > Why is this needed? If you use the DMA API for those buffers you should
> > end up with CMA memory in the !IOMMU case and normal paged memory with
> > IOMMU enabled. From my understanding this should match the requirements.
> 
> Yes, memory allocated with the DMA API should work as well, but would 
> also require passing the device pointer for the device that is 
> allocating the memory, which isn't a problem, but this way we don't need 
> that.
> 
Which in turn would allow you to properly describe the DMA capabilities
of engines that can address more than 32 bits without the IOMMU, which
was one of the comments Thierry had the last time around IIRC.
 
> >
> > Also how big can those firmware images get? Will __get_free_pages be
> > able to provide this much contig memory in a long running system with
> > fragmented memory space? Isn't CMA the better option here?
> 
> The largest is about 140 kilobytes. The space is also allocated when the 
> device is initialized which usually happens during boot.
> 
Which is an order 6 allocation, something you almost certainly don't get
in a fragmented system. I know it may work most of the time, as you are
allocating those buffers early, but is most of the time really enough?
What happens on module unload/load at a later time?
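
(For concreteness, the arithmetic behind the order-6 figure, assuming
4 KiB pages; illustrative only:)

#include <linux/mm.h>

static unsigned int firmware_alloc_order(void)
{
        const size_t size = 140 << 10; /* largest firmware image, ~140 KiB */

        /*
         * 140 KiB needs 35 pages of 4 KiB; __get_free_pages() only hands
         * out power-of-two blocks, so this rounds up to 64 pages, i.e.
         * 256 KiB of physically contiguous memory.
         */
        return get_order(size);        /* == 6 */
}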

Mikko Perttunen Dec. 14, 2016, 1:17 p.m. UTC | #4
On 14.12.2016 14:56, Lucas Stach wrote:
> Am Mittwoch, den 14.12.2016, 14:36 +0200 schrieb Mikko Perttunen:
>> On 14.12.2016 13:35, Lucas Stach wrote:
>>> Am Mittwoch, den 14.12.2016, 13:16 +0200 schrieb Mikko Perttunen:
>>>> Add a new IO virtual memory allocation API to allow clients to
>>>> allocate non-GEM memory in the Tegra DRM IOMMU domain. This is
>>>> required e.g. for loading client firmware when clients are attached
>>>> to the IOMMU domain.
>>>>
>>>> The allocator allocates contiguous physical pages that are then
>>>> mapped contiguously to the IOMMU domain using the iova_domain
>>>> library provided by the kernel. Contiguous physical pages are
>>>> used so that the same allocator works also when IOMMU support
>>>> is disabled and therefore devices access physical memory directly.
>>>>
>>> Why is this needed? If you use the DMA API for those buffers you should
>>> end up with CMA memory in the !IOMMU case and normal paged memory with
>>> IOMMU enabled. From my understanding this should match the requirements.
>>
>> Yes, memory allocated with the DMA API should work as well, but would
>> also require passing the device pointer for the device that is
>> allocating the memory, which isn't a problem, but this way we don't need
>> that.
>>
> Which in turn would allow you to properly describe the DMA capabilities
> of engines that can address more than 32 bits without the IOMMU, which
> was one of the comments Thierry had the last time around IIRC.

In this case, the device's DMA mask alone wouldn't allow that: chips 
after Tegra132 do have units that generally support more than 32 bits, 
but the firmware must still reside in the lower 4G.

I guess this means that we would need to set the DMA mask to the lowest 
common denominator, which is 32 bits, even though we want the full 
34 bits for the GEM objects. However, this should only be an issue when 
the IOMMU is disabled, so it isn't really that much of a problem.
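
(A sketch of what that lowest-common-denominator mask would look like at
probe time; illustrative only, not the actual driver code:)

#include <linux/dma-mapping.h>

static int falcon_set_dma_mask(struct device *dev)
{
        /*
         * Illustrative assumption: without an IOMMU the firmware has to
         * sit below 4 GiB, so use 32 bits as the lowest common denominator
         * even though other transfers could cope with 34-bit addresses.
         */
        return dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32));
}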

>
>>>
>>> Also how big can those firmware images get? Will __get_free_pages be
>>> able to provide this much contig memory in a long running system with
>>> fragmented memory space? Isn't CMA the better option here?
>>
>> The largest is about 140 kilobytes. The space is also allocated when the
>> device is initialized which usually happens during boot.
>>
> Which is an order 6 allocation, something you almost certainly don't get
> in a fragmented system. I know it may work most of the time, as you are
> allocating those buffers early, but is most of the time really enough?
> What happens on module unload/load at a later time?

Yeah, it's probably worth fixing.

Thanks,
Mikko.

Thierry Reding Dec. 14, 2016, 1:48 p.m. UTC | #5
On Wed, Dec 14, 2016 at 12:35:31PM +0100, Lucas Stach wrote:
> Am Mittwoch, den 14.12.2016, 13:16 +0200 schrieb Mikko Perttunen:
> > Add a new IO virtual memory allocation API to allow clients to
> > allocate non-GEM memory in the Tegra DRM IOMMU domain. This is
> > required e.g. for loading client firmware when clients are attached
> > to the IOMMU domain.
> > 
> > The allocator allocates contiguous physical pages that are then
> > mapped contiguously to the IOMMU domain using the iova_domain
> > library provided by the kernel. Contiguous physical pages are
> > used so that the same allocator works also when IOMMU support
> > is disabled and therefore devices access physical memory directly.
> > 
> Why is this needed? If you use the DMA API for those buffers you should
> end up with CMA memory in the !IOMMU case and normal paged memory with
> IOMMU enabled. From my understanding this should match the requirements.

We can't currently use the DMA API for these allocations because it
doesn't allow (or at least didn't back when this was first implemented)
us to share a mapping between two devices.

The reason why we need these patches is that when IOMMU is enabled, then
the units' falcons will read memory through the IOMMU, so we must have
allocations for GEM buffers and the firmware go through the same
mechanism.

Thierry
Lucas Stach Dec. 14, 2016, 2:01 p.m. UTC | #6
Am Mittwoch, den 14.12.2016, 14:48 +0100 schrieb Thierry Reding:
> On Wed, Dec 14, 2016 at 12:35:31PM +0100, Lucas Stach wrote:
> > Am Mittwoch, den 14.12.2016, 13:16 +0200 schrieb Mikko Perttunen:
> > > Add a new IO virtual memory allocation API to allow clients to
> > > allocate non-GEM memory in the Tegra DRM IOMMU domain. This is
> > > required e.g. for loading client firmware when clients are attached
> > > to the IOMMU domain.
> > > 
> > > The allocator allocates contiguous physical pages that are then
> > > mapped contiguously to the IOMMU domain using the iova_domain
> > > library provided by the kernel. Contiguous physical pages are
> > > used so that the same allocator works also when IOMMU support
> > > is disabled and therefore devices access physical memory directly.
> > > 
> > Why is this needed? If you use the DMA API for those buffers you should
> > end up with CMA memory in the !IOMMU case and normal paged memory with
> > IOMMU enabled. From my understanding this should match the requirements.
> 
> We can't currently use the DMA API for these allocations because it
> doesn't allow (or at least didn't back when this was first implemented)
> us to share a mapping between two devices.

Hm, maybe I'm overlooking something, but isn't this just a matter of
allocating on one device, then constructing a SG list (dma_get_sgtable)
from the buffer you got and use that to dma_map_sg() the buffer on the
other device?

Maybe doing the firmware buffer allocation on host1x (with a 4GB upper
bound) and then sharing the SG list to the devices?
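
(A rough sketch of the allocate-then-share flow described above; the helper
name is made up and error handling is abbreviated:)

#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>

/*
 * Hypothetical sketch: allocate the firmware buffer on one device (e.g.
 * host1x), export an SG table for it and map that SG table for the client
 * device that will actually read the firmware.
 */
static int share_firmware_buffer(struct device *alloc_dev,
                                 struct device *client_dev, size_t size)
{
        struct sg_table sgt;
        dma_addr_t dma;
        void *virt;
        int err;

        /* bounded by alloc_dev's DMA mask, e.g. 32 bits */
        virt = dma_alloc_coherent(alloc_dev, size, &dma, GFP_KERNEL);
        if (!virt)
                return -ENOMEM;

        err = dma_get_sgtable(alloc_dev, &sgt, virt, dma, size);
        if (err < 0)
                goto free;

        /* map the same pages into the client device's DMA address space */
        if (dma_map_sg(client_dev, sgt.sgl, sgt.nents, DMA_TO_DEVICE) == 0) {
                err = -ENOMEM;
                goto free_table;
        }

        /* ... hand sg_dma_address(sgt.sgl) to the client ... */
        return 0;

free_table:
        sg_free_table(&sgt);
free:
        dma_free_coherent(alloc_dev, size, virt, dma);
        return err;
}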

> The reason why we need these patches is that when IOMMU is enabled, then
> the units' falcons will read memory through the IOMMU, so we must have
> allocations for GEM buffers and the firmware go through the same
> mechanism.

Sorry for maybe dumb questions.

Do you have any engines other than the GPU that can handle SG
themselves? Wouldn't you want the GEM objects to be backed by CMA in
the !MMU case? How are ordinary GEM objects different from the falcon
firmware?

Regards,
Lucas
Thierry Reding Dec. 14, 2016, 2:39 p.m. UTC | #7
On Wed, Dec 14, 2016 at 03:01:56PM +0100, Lucas Stach wrote:
> Am Mittwoch, den 14.12.2016, 14:48 +0100 schrieb Thierry Reding:
> > On Wed, Dec 14, 2016 at 12:35:31PM +0100, Lucas Stach wrote:
> > > Am Mittwoch, den 14.12.2016, 13:16 +0200 schrieb Mikko Perttunen:
> > > > Add a new IO virtual memory allocation API to allow clients to
> > > > allocate non-GEM memory in the Tegra DRM IOMMU domain. This is
> > > > required e.g. for loading client firmware when clients are attached
> > > > to the IOMMU domain.
> > > > 
> > > > The allocator allocates contiguous physical pages that are then
> > > > mapped contiguously to the IOMMU domain using the iova_domain
> > > > library provided by the kernel. Contiguous physical pages are
> > > > used so that the same allocator works also when IOMMU support
> > > > is disabled and therefore devices access physical memory directly.
> > > > 
> > > Why is this needed? If you use the DMA API for those buffers you should
> > > end up with CMA memory in the !IOMMU case and normal paged memory with
> > > IOMMU enabled. From my understanding this should match the requirements.
> > 
> > We can't currently use the DMA API for these allocations because it
> > doesn't allow (or at least didn't back when this was first implemented)
> > us to share a mapping between two devices.
> 
> Hm, maybe I'm overlooking something, but isn't this just a matter of
> allocating on one device, then constructing a SG list (dma_get_sgtable)
> from the buffer you got and use that to dma_map_sg() the buffer on the
> other device?

Yes, that would work. What I was referring to is sharing framebuffers
between multiple CRTCs. Back at the time when IOMMU support was first
added, I tried to use the DMA API. However, the problem with that was
that we would've had to effectively dma_map_sg() on every page-flip
since the buffer is imported into the DRM device, but there's no call
that would import it for each CRTC only once. So when the framebuffer
is added to a plane, you have to map it to the corresponding display
controller. And the dma_map_sg() was, if I recall correctly, on the
order of 5-10 ms, which is prohibitively expensive to do per frame.

It's also completely unnecessary because all CRTCs in a DRM device can
simply share the same IOMMU domain. I can't think of a reason why you
would want or need to use separate domains.

Back at the time this was something that the DMA API couldn't do; it
would simply assign a separate IOMMU domain per device. It's possible
that this has changed by now, given that many others must've run into
the same problem in the meantime.

> Maybe doing the firmware buffer allocation on host1x (with a 4GB upper
> bound) and then sharing the SG list to the devices?

That's pretty much what this API is doing. Only it's the other way
around: we don't share the SG list with other devices for mapping, we
simply reuse the same mapping across multiple devices, since they're
all in the same IOMMU domain.
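
(In code terms, that sharing boils down to attaching all client devices to
one IOMMU domain, so a single mapping is visible to every client; sketch
only, names illustrative:)

#include <linux/iommu.h>

static int attach_clients(struct iommu_domain *domain,
                          struct device **clients, unsigned int count)
{
        unsigned int i;
        int err;

        for (i = 0; i < count; i++) {
                err = iommu_attach_device(domain, clients[i]);
                if (err < 0)
                        return err;
        }

        /*
         * After this, one iommu_map(domain, iova, phys, size, prot) makes
         * the buffer reachable at the same IOVA from every attached client.
         */
        return 0;
}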

> > The reason why we need these patches is that when IOMMU is enabled, then
> > the units' falcons will read memory through the IOMMU, so we must have
> > allocations for GEM buffers and the firmware go through the same
> > mechanism.
> 
> Sorry for maybe dumb questions.
> 
> Do you have any engines other than the GPU that can handle SG
> themselves?

No, I don't think so.

> Wouldn't you want the GEM objects to be backed by CMA in the !MMU
> case?

That's exactly what's happening already. If no IOMMU is available we
allocate buffer objects backing store with dma_alloc_wc().

> How are ordinary GEM objects different from the falcon firmware?

They're not. I think we could probably reuse more of the BO allocation
functions for the firmware as well. I think Mikko already agreed to look
into that. We might have to add some special cases, or split up the
helpers a little differently to avoid creating GEM objects from the
firmware buffers. We wouldn't want userspace to start mmap()'ing those.

Thierry
Mikko Perttunen Jan. 27, 2017, 10:45 p.m. UTC | #8
So with the userspace question resolved, only this question of memory 
allocation remains. I see the following options:

- Keep the current __get_free_pages-based allocation. This means that 
firmware loading may fail when memory is fragmented. The function can be 
augmented by adding vmalloc support when the IOMMU is enabled, which 
would eliminate those failures (a sketch of that variant follows the 
list below).

- Move to doing CMA allocation using the DMA API. This way allocations 
would be more likely to succeed, at least if the user has enough CMA 
memory. In this case we'll need to pass an additional struct device * to 
the alloc/free functions. This also requires that the CMA memory is 
always allocated from the lower 4GB physical address range, as firmware 
can only be loaded from there (other operations support 34 bits); I don't 
know how the default CMA region is allocated, maybe I should find out.

This option would also require adding a separate path for when the IOMMU 
is enabled and CMA is not available - AIUI, on Tegra, the DMA API always 
allocates from CMA, whether the IOMMU is enabled or not. (Or, we could 
make CMA mandatory.)
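
(A minimal sketch of the vmalloc-based variant from the first option; the
IOVA is assumed to have been reserved already and the size to be
page-aligned, so this is illustrative only:)

#include <linux/err.h>
#include <linux/iommu.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

static void *carveout_alloc_vmalloc(struct iommu_domain *domain,
                                    unsigned long iova, size_t size)
{
        size_t offset;
        void *virt;
        int err;

        virt = vzalloc(size); /* zeroed, page-granular, not physically contiguous */
        if (!virt)
                return ERR_PTR(-ENOMEM);

        /* map each backing page contiguously into the IOMMU domain */
        for (offset = 0; offset < size; offset += PAGE_SIZE) {
                struct page *page = vmalloc_to_page(virt + offset);

                err = iommu_map(domain, iova + offset, page_to_phys(page),
                                PAGE_SIZE, IOMMU_READ | IOMMU_WRITE);
                if (err < 0) {
                        iommu_unmap(domain, iova, offset);
                        vfree(virt);
                        return ERR_PTR(err);
                }
        }

        return virt;
}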

For me, either option works. Also, sorry if I made some stupid mistake, 
my brain is currently too tired to handle multiple address spaces at the 
same time :)

Cheers,
Mikko.


Patch

diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c
index b8be3ee..cf714c6 100644
--- a/drivers/gpu/drm/tegra/drm.c
+++ b/drivers/gpu/drm/tegra/drm.c
@@ -1,12 +1,13 @@ 
 /*
  * Copyright (C) 2012 Avionic Design GmbH
- * Copyright (C) 2012-2013 NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (C) 2012-2016 NVIDIA CORPORATION.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
 
+#include <linux/bitops.h>
 #include <linux/host1x.h>
 #include <linux/iommu.h>
 
@@ -23,6 +24,8 @@ 
 #define DRIVER_MINOR 0
 #define DRIVER_PATCHLEVEL 0
 
+#define CARVEOUT_SZ SZ_64M
+
 struct tegra_drm_file {
 	struct list_head contexts;
 };
@@ -127,7 +130,8 @@  static int tegra_drm_load(struct drm_device *drm, unsigned long flags)
 
 	if (iommu_present(&platform_bus_type)) {
 		struct iommu_domain_geometry *geometry;
-		u64 start, end;
+		unsigned long order;
+		u64 carveout_start, carveout_end, gem_start, gem_end;
 
 		tegra->domain = iommu_domain_alloc(&platform_bus_type);
 		if (!tegra->domain) {
@@ -136,12 +140,25 @@  static int tegra_drm_load(struct drm_device *drm, unsigned long flags)
 		}
 
 		geometry = &tegra->domain->geometry;
-		start = geometry->aperture_start;
-		end = geometry->aperture_end;
+		gem_start = geometry->aperture_start;
+		gem_end = geometry->aperture_end - CARVEOUT_SZ;
+		carveout_start = gem_end + 1;
+		carveout_end = geometry->aperture_end;
+
+		order = __ffs(tegra->domain->pgsize_bitmap);
+		init_iova_domain(&tegra->carveout.domain, 1UL << order,
+				 carveout_start >> order,
+				 carveout_end >> order);
+
+		tegra->carveout.shift = iova_shift(&tegra->carveout.domain);
+		tegra->carveout.limit = carveout_end >> tegra->carveout.shift;
+
+		drm_mm_init(&tegra->mm, gem_start, gem_end - gem_start + 1);
 
-		DRM_DEBUG_DRIVER("IOMMU aperture initialized (%#llx-%#llx)\n",
-				 start, end);
-		drm_mm_init(&tegra->mm, start, end - start + 1);
+		DRM_DEBUG("IOMMU apertures:\n");
+		DRM_DEBUG("  GEM: %#llx-%#llx\n", gem_start, gem_end);
+		DRM_DEBUG("  Carveout: %#llx-%#llx\n", carveout_start,
+			  carveout_end);
 	}
 
 	mutex_init(&tegra->clients_lock);
@@ -208,6 +225,7 @@  static int tegra_drm_load(struct drm_device *drm, unsigned long flags)
 	if (tegra->domain) {
 		iommu_domain_free(tegra->domain);
 		drm_mm_takedown(&tegra->mm);
+		put_iova_domain(&tegra->carveout.domain);
 	}
 free:
 	kfree(tegra);
@@ -232,6 +250,7 @@  static int tegra_drm_unload(struct drm_device *drm)
 	if (tegra->domain) {
 		iommu_domain_free(tegra->domain);
 		drm_mm_takedown(&tegra->mm);
+		put_iova_domain(&tegra->carveout.domain);
 	}
 
 	kfree(tegra);
@@ -975,6 +994,84 @@  int tegra_drm_unregister_client(struct tegra_drm *tegra,
 	return 0;
 }
 
+void *tegra_drm_alloc(struct tegra_drm *tegra, size_t size,
+			      dma_addr_t *dma)
+{
+	struct iova *alloc;
+	void *virt;
+	gfp_t gfp;
+	int err;
+
+	if (tegra->domain)
+		size = iova_align(&tegra->carveout.domain, size);
+	else
+		size = PAGE_ALIGN(size);
+
+	gfp = GFP_KERNEL | __GFP_ZERO;
+	if (!tegra->domain) {
+		/*
+		 * Many units only support 32-bit addresses, even on 64-bit
+		 * SoCs. If there is no IOMMU to translate into a 32-bit IO
+		 * virtual address space, force allocations to be in the
+		 * lower 32-bit range.
+		 */
+		gfp |= GFP_DMA;
+	}
+
+	virt = (void *)__get_free_pages(gfp, get_order(size));
+	if (!virt)
+		return ERR_PTR(-ENOMEM);
+
+	if (!tegra->domain) {
+		/*
+		 * If IOMMU is disabled, devices address physical memory
+		 * directly.
+		 */
+		*dma = virt_to_phys(virt);
+		return virt;
+	}
+
+	alloc = alloc_iova(&tegra->carveout.domain,
+			   size >> tegra->carveout.shift,
+			   tegra->carveout.limit, true);
+	if (!alloc) {
+		err = -EBUSY;
+		goto free_pages;
+	}
+
+	*dma = iova_dma_addr(&tegra->carveout.domain, alloc);
+	err = iommu_map(tegra->domain, *dma, virt_to_phys(virt),
+			size, IOMMU_READ | IOMMU_WRITE);
+	if (err < 0)
+		goto free_iova;
+
+	return virt;
+
+free_iova:
+	__free_iova(&tegra->carveout.domain, alloc);
+free_pages:
+	free_pages((unsigned long)virt, get_order(size));
+
+	return ERR_PTR(err);
+}
+
+void tegra_drm_free(struct tegra_drm *tegra, size_t size, void *virt,
+		    dma_addr_t dma)
+{
+	if (tegra->domain)
+		size = iova_align(&tegra->carveout.domain, size);
+	else
+		size = PAGE_ALIGN(size);
+
+	if (tegra->domain) {
+		iommu_unmap(tegra->domain, dma, size);
+		free_iova(&tegra->carveout.domain,
+			  iova_pfn(&tegra->carveout.domain, dma));
+	}
+
+	free_pages((unsigned long)virt, get_order(size));
+}
+
 static int host1x_drm_probe(struct host1x_device *dev)
 {
 	struct drm_driver *driver = &tegra_drm_driver;
diff --git a/drivers/gpu/drm/tegra/drm.h b/drivers/gpu/drm/tegra/drm.h
index 0ddcce1..7351dee 100644
--- a/drivers/gpu/drm/tegra/drm.h
+++ b/drivers/gpu/drm/tegra/drm.h
@@ -12,6 +12,7 @@ 
 
 #include <uapi/drm/tegra_drm.h>
 #include <linux/host1x.h>
+#include <linux/iova.h>
 #include <linux/of_gpio.h>
 
 #include <drm/drmP.h>
@@ -43,6 +44,12 @@  struct tegra_drm {
 	struct iommu_domain *domain;
 	struct drm_mm mm;
 
+	struct {
+		struct iova_domain domain;
+		unsigned long shift;
+		unsigned long limit;
+	} carveout;
+
 	struct mutex clients_lock;
 	struct list_head clients;
 
@@ -104,6 +111,10 @@  int tegra_drm_unregister_client(struct tegra_drm *tegra,
 int tegra_drm_init(struct tegra_drm *tegra, struct drm_device *drm);
 int tegra_drm_exit(struct tegra_drm *tegra);
 
+void *tegra_drm_alloc(struct tegra_drm *tegra, size_t size, dma_addr_t *iova);
+void tegra_drm_free(struct tegra_drm *tegra, size_t size, void *virt,
+		    dma_addr_t iova);
+
 struct tegra_dc_soc_info;
 struct tegra_output;