
[5/6] drm/amdgpu: use TTM_PL_FLAG_CONTIGUOUS

Message ID 1490953652-3703-5-git-send-email-deathsimple@vodafone.de (mailing list archive)
State New, archived

Commit Message

Christian König March 31, 2017, 9:47 a.m. UTC
From: Christian König <christian.koenig@amd.com>

Implement AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS using TTM_PL_FLAG_CONTIGUOUS
instead of a placement limit. That allows us to better handle CPU
accessible placements.

Signed-off-by: Christian König <christian.koenig@amd.com>
Acked-by: Michel Dänzer <michel.daenzer@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c   | 11 +++++------
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 14 ++++++++++----
 2 files changed, 15 insertions(+), 10 deletions(-)

Comments

Nicolai Hähnle April 3, 2017, 4:22 p.m. UTC | #1
On 31.03.2017 11:47, Christian König wrote:
> [...]
> @@ -145,11 +146,16 @@ static int amdgpu_vram_mgr_new(struct ttm_mem_type_manager *man,
>  		if (unlikely(r))
>  			goto error;
>
> +		/*
> +		 * Calculate a virtual BO start address to easily check if
> +		 * everything is CPU accessible.
> +		 */
> +		start = nodes[i].start + nodes[i].size - mem->num_pages;

This might wrap around (the operands are unsigned long, so the subtraction
underflows to a huge value), completely breaking the max() logic below.
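
Concretely, with hypothetical numbers, a 64-page BO whose first node lands 
at page 0 with size 16 gives:

	/* nodes[i].start = 0, nodes[i].size = 16, mem->num_pages = 64;
	 * everything is unsigned long, so the subtraction wraps to a
	 * huge value instead of going negative. */
	unsigned long start = 0 + 16 - 64;   /* ULONG_MAX - 47 */
	mem->start = max(mem->start, start); /* the bogus value always wins */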

> +		mem->start = max(mem->start, start);
>  		pages_left -= pages;
>  	}
>  	spin_unlock(&mgr->lock);
>
> -	mem->start = num_nodes == 1 ? nodes[0].start : AMDGPU_BO_INVALID_OFFSET;

If we're going to abuse mem->start anyway, might I suggest just keeping 
track of max(nodes[i].start + nodes[i].size), and then setting 
mem->start to a magic (macro'd) constant based on whether everything is 
in visible VRAM or not?

Then the check in amdgpu_ttm_io_mem_reserve could be simplified accordingly.
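
For reference, the check in amdgpu_ttm_io_mem_reserve relied on mem->start 
roughly as follows at the time (a simplified sketch, not a verbatim copy):

	case TTM_PL_VRAM:
		mem->bus.offset = mem->start << PAGE_SHIFT;
		/* With the virtual BO start address computed in
		 * amdgpu_vram_mgr_new, a single comparison rejects any
		 * BO that reaches past CPU-visible VRAM. */
		if ((mem->bus.offset + mem->bus.size) >
		    adev->mc.visible_vram_size)
			return -EINVAL;
		mem->bus.base = adev->mc.aper_base;
		mem->bus.is_iomem = true;
		break;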

Also, I think patches #6 and #5 should be exchanged, otherwise there's a 
temporary bug in handling split visible VRAM buffers.

Cheers,
Nicolai


Christian König April 4, 2017, 11:33 a.m. UTC | #2
On 03.04.2017 at 18:22, Nicolai Hähnle wrote:
> On 31.03.2017 11:47, Christian König wrote:
>> [...]
>> +        /*
>> +         * Calculate a virtual BO start address to easily check if
>> +         * everything is CPU accessible.
>> +         */
>> +        start = nodes[i].start + nodes[i].size - mem->num_pages;
>
> This might wrap around (the operands are unsigned long, so the subtraction
> underflows to a huge value), completely breaking the max() logic below.

Good point, going to fix that.
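
A wrap-safe variant might clamp before subtracting, e.g. (a sketch, not 
necessarily the fix that actually lands):

	start = nodes[i].start + nodes[i].size;
	if (start > mem->num_pages)
		start -= mem->num_pages;	/* virtual start of the BO */
	else
		start = 0;			/* node smaller than the BO */
	mem->start = max(mem->start, start);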

>
>> +        mem->start = max(mem->start, start);
>>          pages_left -= pages;
>>      }
>>      spin_unlock(&mgr->lock);
>>
>> -    mem->start = num_nodes == 1 ? nodes[0].start : AMDGPU_BO_INVALID_OFFSET;
>
> If we're going to abuse mem->start anyway, might I suggest just 
> keeping track of max(nodes[i].start + nodes[i].size), and then setting 
> mem->start to a magic (macro'd) constant based on whether everything 
> is in visible VRAM or not?
>

No, that would break in-kernel mappings.

> Then the check in amdgpu_ttm_io_mem_reserve could be simplified 
> accordingly.
>
> Also, I think patches #6 and #5 should be exchanged, otherwise there's 
> a temporary bug in handling split visible VRAM buffers.

Huh? Why? Patch #6 enables the whole thing by not making the contiguous 
flag mandatory for CPU mappings any more.

Switching those would cause problems with detecting when a BO is not in 
visible VRAM.

Regards,
Christian.
Nicolai Hähnle April 4, 2017, 2:24 p.m. UTC | #3
On 04.04.2017 13:33, Christian König wrote:
> On 03.04.2017 at 18:22, Nicolai Hähnle wrote:
>> On 31.03.2017 11:47, Christian König wrote:
>> [...]
>> Also, I think patches #6 and #5 should be exchanged, otherwise there's
>> a temporary bug in handling split visible VRAM buffers.
>
> Huh? Why? Patch #6 enables the whole thing by not making the contiguous
> flag mandatory for CPU mappings any more.

Ah, I missed the fact that it's guarded by the check in 
amdgpu_bo_fault_reserve_notify. You're right, the order of patches is good.

Cheers,
Nicolai



Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index d6b2de9..387d190 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -122,20 +122,19 @@  static void amdgpu_ttm_placement_init(struct amdgpu_device *adev,
 
 	if (domain & AMDGPU_GEM_DOMAIN_VRAM) {
 		unsigned visible_pfn = adev->mc.visible_vram_size >> PAGE_SHIFT;
-		unsigned lpfn = 0;
-
-		/* This forces a reallocation if the flag wasn't set before */
-		if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
-			lpfn = adev->mc.real_vram_size >> PAGE_SHIFT;
 
 		places[c].fpfn = 0;
-		places[c].lpfn = lpfn;
+		places[c].lpfn = 0;
 		places[c].flags = TTM_PL_FLAG_WC | TTM_PL_FLAG_UNCACHED |
 			TTM_PL_FLAG_VRAM;
+
 		if (flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)
 			places[c].lpfn = visible_pfn;
 		else
 			places[c].flags |= TTM_PL_FLAG_TOPDOWN;
+
+		if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
+			places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
 		c++;
 	}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index d710226..af2d172 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -93,7 +93,6 @@  static int amdgpu_vram_mgr_new(struct ttm_mem_type_manager *man,
 			       const struct ttm_place *place,
 			       struct ttm_mem_reg *mem)
 {
-	struct amdgpu_bo *bo = container_of(tbo, struct amdgpu_bo, tbo);
 	struct amdgpu_vram_mgr *mgr = man->priv;
 	struct drm_mm *mm = &mgr->mm;
 	struct drm_mm_node *nodes;
@@ -107,8 +106,8 @@  static int amdgpu_vram_mgr_new(struct ttm_mem_type_manager *man,
 	if (!lpfn)
 		lpfn = man->size;
 
-	if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS ||
-	    place->lpfn || amdgpu_vram_page_split == -1) {
+	if (place->flags & TTM_PL_FLAG_CONTIGUOUS ||
+	    amdgpu_vram_page_split == -1) {
 		pages_per_node = ~0ul;
 		num_nodes = 1;
 	} else {
@@ -126,12 +125,14 @@  static int amdgpu_vram_mgr_new(struct ttm_mem_type_manager *man,
 		aflags = DRM_MM_CREATE_TOP;
 	}
 
+	mem->start = 0;
 	pages_left = mem->num_pages;
 
 	spin_lock(&mgr->lock);
 	for (i = 0; i < num_nodes; ++i) {
 		unsigned long pages = min(pages_left, pages_per_node);
 		uint32_t alignment = mem->page_alignment;
+		unsigned long start;
 
 		if (pages == pages_per_node)
 			alignment = pages_per_node;
@@ -145,11 +146,16 @@  static int amdgpu_vram_mgr_new(struct ttm_mem_type_manager *man,
 		if (unlikely(r))
 			goto error;
 
+		/*
+		 * Calculate a virtual BO start address to easily check if
+		 * everything is CPU accessible.
+		 */
+		start = nodes[i].start + nodes[i].size - mem->num_pages;
+		mem->start = max(mem->start, start);
 		pages_left -= pages;
 	}
 	spin_unlock(&mgr->lock);
 
-	mem->start = num_nodes == 1 ? nodes[0].start : AMDGPU_BO_INVALID_OFFSET;
 	mem->mm_node = nodes;
 
 	return 0;
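
For completeness, the flag this patch reworks is requested at BO creation 
time through the GEM_CREATE ioctl. A minimal userspace sketch (hypothetical 
size and helper name, error handling trimmed; fd is assumed to be an open 
render node such as /dev/dri/renderD128):

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <drm/amdgpu_drm.h>

	static int create_contig_bo(int fd, uint32_t *handle)
	{
		union drm_amdgpu_gem_create args = {0};

		args.in.bo_size = 1 << 20;	/* hypothetical: 1 MiB */
		args.in.alignment = 4096;
		args.in.domains = AMDGPU_GEM_DOMAIN_VRAM;
		args.in.domain_flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS |
				       AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;

		if (ioctl(fd, DRM_IOCTL_AMDGPU_GEM_CREATE, &args))
			return -1;

		*handle = args.out.handle;	/* GEM handle of the new BO */
		return 0;
	}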