diff mbox series

[2/2] drm/ttm: remove no_retry flag

Message ID 20201002113121.47823-2-christian.koenig@amd.com (mailing list archive)
State New, archived
Headers show
Series [1/2] drm/ttm: remove need_dma32 flag | expand

Commit Message

Christian König Oct. 2, 2020, 11:31 a.m. UTC
Amdgpu was the only user of this.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 6 +++---
 drivers/gpu/drm/ttm/ttm_tt.c            | 3 ---
 include/drm/ttm/ttm_device.h            | 2 --
 3 files changed, 3 insertions(+), 8 deletions(-)

Comments

Daniel Vetter Oct. 2, 2020, 12:31 p.m. UTC | #1
On Fri, Oct 2, 2020 at 1:31 PM Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> Amdgpu was the only user of this.
>
> Signed-off-by: Christian König <christian.koenig@amd.com>

Uh this smells like a fishy band-aid. And the original commit
introducing this also doesn't shed any light on why this should
happen, and why it's specific to the amdgpu driver. Do you have some
more memories here?

I guess no retry makes sense for a "do you still have memory?" query,
but once we've committed to having that memory, I'm not seeing why we
should not try to find it? Might also tie into the lack of active
shrinking for ttm objects in the system domain.
-Daniel

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 6 +++---
>  drivers/gpu/drm/ttm/ttm_tt.c            | 3 ---
>  include/drm/ttm/ttm_device.h            | 2 --
>  3 files changed, 3 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index c5f2b4971ef7..0a4233985870 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -1298,6 +1298,9 @@ static struct ttm_tt *amdgpu_ttm_tt_create(struct ttm_buffer_object *bo,
>         }
>         gtt->gobj = &bo->base;
>
> +       /* We opt to avoid OOM on system pages allocations */
> +       page_flags |= TTM_PAGE_FLAG_NO_RETRY;
> +
>         if (dma_addressing_limited(adev->dev))
>                 page_flags |= TTM_PAGE_FLAG_DMA32;
>
> @@ -1895,9 +1898,6 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
>         }
>         adev->mman.initialized = true;
>
> -       /* We opt to avoid OOM on system pages allocations */
> -       adev->mman.bdev.no_retry = true;
> -
>         /* Initialize VRAM pool with all of VRAM divided into pages */
>         r = amdgpu_vram_mgr_init(adev);
>         if (r) {
> diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
> index e2b1e6c53a04..98514abaa939 100644
> --- a/drivers/gpu/drm/ttm/ttm_tt.c
> +++ b/drivers/gpu/drm/ttm/ttm_tt.c
> @@ -52,9 +52,6 @@ int ttm_tt_create(struct ttm_buffer_object *bo, bool zero_alloc)
>         if (bo->ttm)
>                 return 0;
>
> -       if (bdev->no_retry)
> -               page_flags |= TTM_PAGE_FLAG_NO_RETRY;
> -
>         switch (bo->type) {
>         case ttm_bo_type_device:
>                 if (zero_alloc)
> diff --git a/include/drm/ttm/ttm_device.h b/include/drm/ttm/ttm_device.h
> index bfc6dd87f2d3..e0eba36c1309 100644
> --- a/include/drm/ttm/ttm_device.h
> +++ b/include/drm/ttm/ttm_device.h
> @@ -326,8 +326,6 @@ struct ttm_device {
>          */
>
>         struct delayed_work wq;
> -
> -       bool no_retry;
>  };
>
>  static inline struct ttm_resource_manager *
> --
> 2.17.1
>
Christian König Oct. 5, 2020, 2:37 p.m. UTC | #2
Am 02.10.20 um 14:31 schrieb Daniel Vetter:
> On Fri, Oct 2, 2020 at 1:31 PM Christian König
> <ckoenig.leichtzumerken@gmail.com> wrote:
>> Amdgpu was the only user of this.
>>
>> Signed-off-by: Christian König <christian.koenig@amd.com>
> Uh this smells like a fishy band-aid. And the original commit
> introducing this also doesn't shed any light on why this should
> happen, and why it's specific to the amdgpu driver. Do you have some
> more memories here?

Nope, I only vaguely remember that we had a customer who ran into the OOM
killer and instead wanted to get -ENOMEM.

But I honestly don't remember why we approached it like that.

Christian.

>
> I guess no retry makes sense for a "do you still have memory?" query,
> but once we've committed to having that memory, I'm not seeing why we
> should not try to find it? Might also tie into the lack of active
> shrinking for ttm objects in the system domain.
> -Daniel
>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 6 +++---
>>   drivers/gpu/drm/ttm/ttm_tt.c            | 3 ---
>>   include/drm/ttm/ttm_device.h            | 2 --
>>   3 files changed, 3 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> index c5f2b4971ef7..0a4233985870 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> @@ -1298,6 +1298,9 @@ static struct ttm_tt *amdgpu_ttm_tt_create(struct ttm_buffer_object *bo,
>>          }
>>          gtt->gobj = &bo->base;
>>
>> +       /* We opt to avoid OOM on system pages allocations */
>> +       page_flags |= TTM_PAGE_FLAG_NO_RETRY;
>> +
>>          if (dma_addressing_limited(adev->dev))
>>                  page_flags |= TTM_PAGE_FLAG_DMA32;
>>
>> @@ -1895,9 +1898,6 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
>>          }
>>          adev->mman.initialized = true;
>>
>> -       /* We opt to avoid OOM on system pages allocations */
>> -       adev->mman.bdev.no_retry = true;
>> -
>>          /* Initialize VRAM pool with all of VRAM divided into pages */
>>          r = amdgpu_vram_mgr_init(adev);
>>          if (r) {
>> diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
>> index e2b1e6c53a04..98514abaa939 100644
>> --- a/drivers/gpu/drm/ttm/ttm_tt.c
>> +++ b/drivers/gpu/drm/ttm/ttm_tt.c
>> @@ -52,9 +52,6 @@ int ttm_tt_create(struct ttm_buffer_object *bo, bool zero_alloc)
>>          if (bo->ttm)
>>                  return 0;
>>
>> -       if (bdev->no_retry)
>> -               page_flags |= TTM_PAGE_FLAG_NO_RETRY;
>> -
>>          switch (bo->type) {
>>          case ttm_bo_type_device:
>>                  if (zero_alloc)
>> diff --git a/include/drm/ttm/ttm_device.h b/include/drm/ttm/ttm_device.h
>> index bfc6dd87f2d3..e0eba36c1309 100644
>> --- a/include/drm/ttm/ttm_device.h
>> +++ b/include/drm/ttm/ttm_device.h
>> @@ -326,8 +326,6 @@ struct ttm_device {
>>           */
>>
>>          struct delayed_work wq;
>> -
>> -       bool no_retry;
>>   };
>>
>>   static inline struct ttm_resource_manager *
>> --
>> 2.17.1
>>
>
Daniel Vetter Oct. 5, 2020, 2:55 p.m. UTC | #3
On Mon, Oct 5, 2020 at 4:37 PM Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> Am 02.10.20 um 14:31 schrieb Daniel Vetter:
> > On Fri, Oct 2, 2020 at 1:31 PM Christian König
> > <ckoenig.leichtzumerken@gmail.com> wrote:
> >> Amdgpu was the only user of this.
> >>
> >> Signed-off-by: Christian König <christian.koenig@amd.com>
> > Uh this smells like a fishy band-aid. And the original commit
> > introducing this also doesn't shed any light on why this should
> > happen, and why it's specific to the amdgpu driver. Do you have some
> > more memories here?
>
> Nope, I only vaguely remember that we had a customer who ran into the OOM
> killer and instead wanted to get -ENOMEM.
>
> But I honestly don't remember why we approached it like that.

Well oom killer being supremely unpopular is kinda not news. I think
what you want is that in the buffer create ioctl you don't retry, but
instead fall over if there's no memory. So that userspace knows it
can't allocate more gpu memory.

But in execbuf, not trying to find the memory that we promised is totally
there is kinda rude. So I think this should be a runtime flag, perhaps
in the ttm_operation_ctx?

The other thing which is really nasty is if we add a shrinker for
SYSTEM objects (using trylocks and all that), and maybe throw out the
swapped shrinker completely and only rely on that first one. Since
when that happens you do want to shrink excessive drag, but not too
much (but I think that should still be covered by the NO_RETRY flag,
iirc that means "shrink a bit, but don't get desperate"). But that's
kinda a bigger discussion.
-Daniel

>
> Christian.
>
> >
> > I guess no retry makes sense for a "do you still have memory?" query,
> > but once we've committed to having that memory, I'm not seeing why we
> > should not try to find it? Might also tie into the lack of active
> > shrinking for ttm objects in the system domain.
> > -Daniel
> >
> >> ---
> >>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 6 +++---
> >>   drivers/gpu/drm/ttm/ttm_tt.c            | 3 ---
> >>   include/drm/ttm/ttm_device.h            | 2 --
> >>   3 files changed, 3 insertions(+), 8 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> >> index c5f2b4971ef7..0a4233985870 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> >> @@ -1298,6 +1298,9 @@ static struct ttm_tt *amdgpu_ttm_tt_create(struct ttm_buffer_object *bo,
> >>          }
> >>          gtt->gobj = &bo->base;
> >>
> >> +       /* We opt to avoid OOM on system pages allocations */
> >> +       page_flags |= TTM_PAGE_FLAG_NO_RETRY;
> >> +
> >>          if (dma_addressing_limited(adev->dev))
> >>                  page_flags |= TTM_PAGE_FLAG_DMA32;
> >>
> >> @@ -1895,9 +1898,6 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
> >>          }
> >>          adev->mman.initialized = true;
> >>
> >> -       /* We opt to avoid OOM on system pages allocations */
> >> -       adev->mman.bdev.no_retry = true;
> >> -
> >>          /* Initialize VRAM pool with all of VRAM divided into pages */
> >>          r = amdgpu_vram_mgr_init(adev);
> >>          if (r) {
> >> diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
> >> index e2b1e6c53a04..98514abaa939 100644
> >> --- a/drivers/gpu/drm/ttm/ttm_tt.c
> >> +++ b/drivers/gpu/drm/ttm/ttm_tt.c
> >> @@ -52,9 +52,6 @@ int ttm_tt_create(struct ttm_buffer_object *bo, bool zero_alloc)
> >>          if (bo->ttm)
> >>                  return 0;
> >>
> >> -       if (bdev->no_retry)
> >> -               page_flags |= TTM_PAGE_FLAG_NO_RETRY;
> >> -
> >>          switch (bo->type) {
> >>          case ttm_bo_type_device:
> >>                  if (zero_alloc)
> >> diff --git a/include/drm/ttm/ttm_device.h b/include/drm/ttm/ttm_device.h
> >> index bfc6dd87f2d3..e0eba36c1309 100644
> >> --- a/include/drm/ttm/ttm_device.h
> >> +++ b/include/drm/ttm/ttm_device.h
> >> @@ -326,8 +326,6 @@ struct ttm_device {
> >>           */
> >>
> >>          struct delayed_work wq;
> >> -
> >> -       bool no_retry;
> >>   };
> >>
> >>   static inline struct ttm_resource_manager *
> >> --
> >> 2.17.1
> >>
> >
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index c5f2b4971ef7..0a4233985870 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1298,6 +1298,9 @@  static struct ttm_tt *amdgpu_ttm_tt_create(struct ttm_buffer_object *bo,
 	}
 	gtt->gobj = &bo->base;
 
+	/* We opt to avoid OOM on system pages allocations */
+	page_flags |= TTM_PAGE_FLAG_NO_RETRY;
+
 	if (dma_addressing_limited(adev->dev))
 		page_flags |= TTM_PAGE_FLAG_DMA32;
 
@@ -1895,9 +1898,6 @@  int amdgpu_ttm_init(struct amdgpu_device *adev)
 	}
 	adev->mman.initialized = true;
 
-	/* We opt to avoid OOM on system pages allocations */
-	adev->mman.bdev.no_retry = true;
-
 	/* Initialize VRAM pool with all of VRAM divided into pages */
 	r = amdgpu_vram_mgr_init(adev);
 	if (r) {
diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
index e2b1e6c53a04..98514abaa939 100644
--- a/drivers/gpu/drm/ttm/ttm_tt.c
+++ b/drivers/gpu/drm/ttm/ttm_tt.c
@@ -52,9 +52,6 @@  int ttm_tt_create(struct ttm_buffer_object *bo, bool zero_alloc)
 	if (bo->ttm)
 		return 0;
 
-	if (bdev->no_retry)
-		page_flags |= TTM_PAGE_FLAG_NO_RETRY;
-
 	switch (bo->type) {
 	case ttm_bo_type_device:
 		if (zero_alloc)
diff --git a/include/drm/ttm/ttm_device.h b/include/drm/ttm/ttm_device.h
index bfc6dd87f2d3..e0eba36c1309 100644
--- a/include/drm/ttm/ttm_device.h
+++ b/include/drm/ttm/ttm_device.h
@@ -326,8 +326,6 @@  struct ttm_device {
 	 */
 
 	struct delayed_work wq;
-
-	bool no_retry;
 };
 
 static inline struct ttm_resource_manager *