Message ID | 20231031134059.171277-3-ishitatsuyuki@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | drm/amdgpu: Add flag to disable implicit sync for GEM operations. | expand |
Am 31.10.23 um 14:40 schrieb Tatsuyuki Ishi: > In short, eviction never really belonged to the vm_status state machine. I strongly disagree to that. > Even when evicted, the BO could belong to either the moved or done state. > The "evicted" state needed to handle both cases, causing greater confusion. > > Additionally, there were inconsistencies in the definition of an evicted > BO. Some places are based on the `evict` parameter passed from the TTM move > callback, while the others were updated based on whether the BO got its > optimal placement. The second is more accurate for our use case. With this > refactor, the evicted state is solely determined by the second rule. That strongly sounds like you don't understand what the evicted state it good for. The evicted state is for page directories, page tables and per VM BOs which needs to move around before doing the next CS. Please further explain what you try to do here. Regards, Christian. > > Signed-off-by: Tatsuyuki Ishi <ishitatsuyuki@gmail.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 67 +++++++++-------------- > drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 1 + > drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 1 + > 3 files changed, 29 insertions(+), 40 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > index 7b9762f1cddd..dd6f72e2a1d6 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > @@ -174,19 +174,23 @@ int amdgpu_vm_set_pasid(struct amdgpu_device *adev, struct amdgpu_vm *vm, > * State for PDs/PTs and per VM BOs which are not at the location they should > * be. > */ > -static void amdgpu_vm_bo_evicted(struct amdgpu_vm_bo_base *vm_bo) > +static void amdgpu_vm_bo_set_evicted(struct amdgpu_vm_bo_base *vm_bo, bool evicted) > { > struct amdgpu_vm *vm = vm_bo->vm; > struct amdgpu_bo *bo = vm_bo->bo; > > - vm_bo->moved = true; > spin_lock(&vm_bo->vm->status_lock); > - if (bo->tbo.type == ttm_bo_type_kernel) > - list_move(&vm_bo->vm_status, &vm->evicted); > - else > - list_move_tail(&vm_bo->vm_status, &vm->evicted); > + if (evicted && bo->tbo.base.resv == vm->root.bo->tbo.base.resv) { > + if (bo->tbo.type == ttm_bo_type_kernel) > + list_move(&vm_bo->eviction_status, &vm->evicted); > + else > + list_move_tail(&vm_bo->eviction_status, &vm->evicted); > + } else { > + list_del_init(&vm_bo->eviction_status); > + } > spin_unlock(&vm_bo->vm->status_lock); > } > + > /** > * amdgpu_vm_bo_moved - vm_bo is moved > * > @@ -310,6 +314,7 @@ void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base, > base->bo = bo; > base->next = NULL; > INIT_LIST_HEAD(&base->vm_status); > + INIT_LIST_HEAD(&base->eviction_status); > > if (!bo) > return; > @@ -336,7 +341,7 @@ void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base, > * is currently evicted. add the bo to the evicted list to make sure it > * is validated on next vm use to avoid fault. > * */ > - amdgpu_vm_bo_evicted(base); > + amdgpu_vm_bo_set_evicted(base, true); > } > > /** > @@ -460,7 +465,7 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm, > while (!list_empty(&vm->evicted)) { > bo_base = list_first_entry(&vm->evicted, > struct amdgpu_vm_bo_base, > - vm_status); > + eviction_status); > spin_unlock(&vm->status_lock); > > bo = bo_base->bo; > @@ -1034,7 +1039,7 @@ void amdgpu_vm_get_memory(struct amdgpu_vm *vm, > list_for_each_entry_safe(bo_va, tmp, &vm->idle, base.vm_status) > amdgpu_vm_bo_get_memory(bo_va, stats); > > - list_for_each_entry_safe(bo_va, tmp, &vm->evicted, base.vm_status) > + list_for_each_entry_safe(bo_va, tmp, &vm->evicted, base.eviction_status) > amdgpu_vm_bo_get_memory(bo_va, stats); > > list_for_each_entry_safe(bo_va, tmp, &vm->relocated, base.vm_status) > @@ -1153,21 +1158,10 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va, > return r; > } > > - /* If the BO is not in its preferred location add it back to > - * the evicted list so that it gets validated again on the > - * next command submission. > - */ > - if (bo && bo->tbo.base.resv == vm->root.bo->tbo.base.resv) { > - uint32_t mem_type = bo->tbo.resource->mem_type; > - > - if (!(bo->preferred_domains & > - amdgpu_mem_type_to_domain(mem_type))) > - amdgpu_vm_bo_evicted(&bo_va->base); > - else > - amdgpu_vm_bo_idle(&bo_va->base); > - } else { > + if (bo && bo->tbo.base.resv == vm->root.bo->tbo.base.resv) > + amdgpu_vm_bo_idle(&bo_va->base); > + else > amdgpu_vm_bo_done(&bo_va->base); > - } > > list_splice_init(&bo_va->invalids, &bo_va->valids); > bo_va->cleared = clear; > @@ -1883,6 +1877,7 @@ void amdgpu_vm_bo_del(struct amdgpu_device *adev, > > spin_lock(&vm->status_lock); > list_del(&bo_va->base.vm_status); > + list_del(&bo_va->base.eviction_status); > spin_unlock(&vm->status_lock); > > list_for_each_entry_safe(mapping, next, &bo_va->valids, list) { > @@ -1959,13 +1954,18 @@ void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev, > if (bo->parent && (amdgpu_bo_shadowed(bo->parent) == bo)) > bo = bo->parent; > > + /* If the BO is not in its preferred location add it back to > + * the evicted list so that it gets validated again on the > + * next command submission. > + */ > + uint32_t mem_type = bo->tbo.resource->mem_type; > + bool suboptimal = !(bo->preferred_domains & > + amdgpu_mem_type_to_domain(mem_type)); > + > for (bo_base = bo->vm_bo; bo_base; bo_base = bo_base->next) { > struct amdgpu_vm *vm = bo_base->vm; > > - if (evicted && bo->tbo.base.resv == vm->root.bo->tbo.base.resv) { > - amdgpu_vm_bo_evicted(bo_base); > - continue; > - } > + amdgpu_vm_bo_set_evicted(bo_base, suboptimal); > > if (bo_base->moved) > continue; > @@ -2648,13 +2648,11 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m) > { > struct amdgpu_bo_va *bo_va, *tmp; > u64 total_idle = 0; > - u64 total_evicted = 0; > u64 total_relocated = 0; > u64 total_moved = 0; > u64 total_invalidated = 0; > u64 total_done = 0; > unsigned int total_idle_objs = 0; > - unsigned int total_evicted_objs = 0; > unsigned int total_relocated_objs = 0; > unsigned int total_moved_objs = 0; > unsigned int total_invalidated_objs = 0; > @@ -2671,15 +2669,6 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m) > total_idle_objs = id; > id = 0; > > - seq_puts(m, "\tEvicted BOs:\n"); > - list_for_each_entry_safe(bo_va, tmp, &vm->evicted, base.vm_status) { > - if (!bo_va->base.bo) > - continue; > - total_evicted += amdgpu_bo_print_info(id++, bo_va->base.bo, m); > - } > - total_evicted_objs = id; > - id = 0; > - > seq_puts(m, "\tRelocated BOs:\n"); > list_for_each_entry_safe(bo_va, tmp, &vm->relocated, base.vm_status) { > if (!bo_va->base.bo) > @@ -2718,8 +2707,6 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m) > > seq_printf(m, "\tTotal idle size: %12lld\tobjs:\t%d\n", total_idle, > total_idle_objs); > - seq_printf(m, "\tTotal evicted size: %12lld\tobjs:\t%d\n", total_evicted, > - total_evicted_objs); > seq_printf(m, "\tTotal relocated size: %12lld\tobjs:\t%d\n", total_relocated, > total_relocated_objs); > seq_printf(m, "\tTotal moved size: %12lld\tobjs:\t%d\n", total_moved, > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > index 204ab13184ed..d9ab97eabda9 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > @@ -156,6 +156,7 @@ struct amdgpu_vm_bo_base { > > /* protected by spinlock */ > struct list_head vm_status; > + struct list_head eviction_status; > > /* protected by the BO being reserved */ > bool moved; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c > index 96d601e209b8..f78f4040f466 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c > @@ -652,6 +652,7 @@ static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry) > > spin_lock(&entry->vm->status_lock); > list_del(&entry->vm_status); > + list_del(&entry->eviction_status); > spin_unlock(&entry->vm->status_lock); > amdgpu_bo_unref(&entry->bo); > }
> On Oct 31, 2023, at 22:55, Christian König <christian.koenig@amd.com> wrote: > > Am 31.10.23 um 14:40 schrieb Tatsuyuki Ishi: >> In short, eviction never really belonged to the vm_status state machine. > > I strongly disagree to that. > >> Even when evicted, the BO could belong to either the moved or done state. >> The "evicted" state needed to handle both cases, causing greater confusion. >> >> Additionally, there were inconsistencies in the definition of an evicted >> BO. Some places are based on the `evict` parameter passed from the TTM move >> callback, while the others were updated based on whether the BO got its >> optimal placement. The second is more accurate for our use case. With this >> refactor, the evicted state is solely determined by the second rule. > > That strongly sounds like you don't understand what the evicted state it good for. > > The evicted state is for page directories, page tables and per VM BOs which needs to move around before doing the next CS. > > Please further explain what you try to do here. This is mainly an attempt to address inconsistency in the definition of “eviction”. The TTM move callback sets eviction when eviction happens through ttm_bo_evict. This is however not the only way a BO might end up outside its preferred domains. amdgpu_vm_bo_update later updates the eviction state based on whether the BO is in its preferred domains. In my understanding this includes all cases where the BO is evicted through ttm_bo_evict. Therefore we should apply this definition right from the move callback, not only after amdgpu_vm_bo_update has been called at least once. Tatsuyuki. > > Regards, > Christian. > >> >> Signed-off-by: Tatsuyuki Ishi <ishitatsuyuki@gmail.com> >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 67 +++++++++-------------- >> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 1 + >> drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 1 + >> 3 files changed, 29 insertions(+), 40 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >> index 7b9762f1cddd..dd6f72e2a1d6 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >> @@ -174,19 +174,23 @@ int amdgpu_vm_set_pasid(struct amdgpu_device *adev, struct amdgpu_vm *vm, >> * State for PDs/PTs and per VM BOs which are not at the location they should >> * be. >> */ >> -static void amdgpu_vm_bo_evicted(struct amdgpu_vm_bo_base *vm_bo) >> +static void amdgpu_vm_bo_set_evicted(struct amdgpu_vm_bo_base *vm_bo, bool evicted) >> { >> struct amdgpu_vm *vm = vm_bo->vm; >> struct amdgpu_bo *bo = vm_bo->bo; >> - vm_bo->moved = true; >> spin_lock(&vm_bo->vm->status_lock); >> - if (bo->tbo.type == ttm_bo_type_kernel) >> - list_move(&vm_bo->vm_status, &vm->evicted); >> - else >> - list_move_tail(&vm_bo->vm_status, &vm->evicted); >> + if (evicted && bo->tbo.base.resv == vm->root.bo->tbo.base.resv) { >> + if (bo->tbo.type == ttm_bo_type_kernel) >> + list_move(&vm_bo->eviction_status, &vm->evicted); >> + else >> + list_move_tail(&vm_bo->eviction_status, &vm->evicted); >> + } else { >> + list_del_init(&vm_bo->eviction_status); >> + } >> spin_unlock(&vm_bo->vm->status_lock); >> } >> + >> /** >> * amdgpu_vm_bo_moved - vm_bo is moved >> * >> @@ -310,6 +314,7 @@ void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base, >> base->bo = bo; >> base->next = NULL; >> INIT_LIST_HEAD(&base->vm_status); >> + INIT_LIST_HEAD(&base->eviction_status); >> if (!bo) >> return; >> @@ -336,7 +341,7 @@ void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base, >> * is currently evicted. add the bo to the evicted list to make sure it >> * is validated on next vm use to avoid fault. >> * */ >> - amdgpu_vm_bo_evicted(base); >> + amdgpu_vm_bo_set_evicted(base, true); >> } >> /** >> @@ -460,7 +465,7 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm, >> while (!list_empty(&vm->evicted)) { >> bo_base = list_first_entry(&vm->evicted, >> struct amdgpu_vm_bo_base, >> - vm_status); >> + eviction_status); >> spin_unlock(&vm->status_lock); >> bo = bo_base->bo; >> @@ -1034,7 +1039,7 @@ void amdgpu_vm_get_memory(struct amdgpu_vm *vm, >> list_for_each_entry_safe(bo_va, tmp, &vm->idle, base.vm_status) >> amdgpu_vm_bo_get_memory(bo_va, stats); >> - list_for_each_entry_safe(bo_va, tmp, &vm->evicted, base.vm_status) >> + list_for_each_entry_safe(bo_va, tmp, &vm->evicted, base.eviction_status) >> amdgpu_vm_bo_get_memory(bo_va, stats); >> list_for_each_entry_safe(bo_va, tmp, &vm->relocated, base.vm_status) >> @@ -1153,21 +1158,10 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va, >> return r; >> } >> - /* If the BO is not in its preferred location add it back to >> - * the evicted list so that it gets validated again on the >> - * next command submission. >> - */ >> - if (bo && bo->tbo.base.resv == vm->root.bo->tbo.base.resv) { >> - uint32_t mem_type = bo->tbo.resource->mem_type; >> - >> - if (!(bo->preferred_domains & >> - amdgpu_mem_type_to_domain(mem_type))) >> - amdgpu_vm_bo_evicted(&bo_va->base); >> - else >> - amdgpu_vm_bo_idle(&bo_va->base); >> - } else { >> + if (bo && bo->tbo.base.resv == vm->root.bo->tbo.base.resv) >> + amdgpu_vm_bo_idle(&bo_va->base); >> + else >> amdgpu_vm_bo_done(&bo_va->base); >> - } >> list_splice_init(&bo_va->invalids, &bo_va->valids); >> bo_va->cleared = clear; >> @@ -1883,6 +1877,7 @@ void amdgpu_vm_bo_del(struct amdgpu_device *adev, >> spin_lock(&vm->status_lock); >> list_del(&bo_va->base.vm_status); >> + list_del(&bo_va->base.eviction_status); >> spin_unlock(&vm->status_lock); >> list_for_each_entry_safe(mapping, next, &bo_va->valids, list) { >> @@ -1959,13 +1954,18 @@ void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev, >> if (bo->parent && (amdgpu_bo_shadowed(bo->parent) == bo)) >> bo = bo->parent; >> + /* If the BO is not in its preferred location add it back to >> + * the evicted list so that it gets validated again on the >> + * next command submission. >> + */ >> + uint32_t mem_type = bo->tbo.resource->mem_type; >> + bool suboptimal = !(bo->preferred_domains & >> + amdgpu_mem_type_to_domain(mem_type)); >> + >> for (bo_base = bo->vm_bo; bo_base; bo_base = bo_base->next) { >> struct amdgpu_vm *vm = bo_base->vm; >> - if (evicted && bo->tbo.base.resv == vm->root.bo->tbo.base.resv) { >> - amdgpu_vm_bo_evicted(bo_base); >> - continue; >> - } >> + amdgpu_vm_bo_set_evicted(bo_base, suboptimal); >> if (bo_base->moved) >> continue; >> @@ -2648,13 +2648,11 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m) >> { >> struct amdgpu_bo_va *bo_va, *tmp; >> u64 total_idle = 0; >> - u64 total_evicted = 0; >> u64 total_relocated = 0; >> u64 total_moved = 0; >> u64 total_invalidated = 0; >> u64 total_done = 0; >> unsigned int total_idle_objs = 0; >> - unsigned int total_evicted_objs = 0; >> unsigned int total_relocated_objs = 0; >> unsigned int total_moved_objs = 0; >> unsigned int total_invalidated_objs = 0; >> @@ -2671,15 +2669,6 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m) >> total_idle_objs = id; >> id = 0; >> - seq_puts(m, "\tEvicted BOs:\n"); >> - list_for_each_entry_safe(bo_va, tmp, &vm->evicted, base.vm_status) { >> - if (!bo_va->base.bo) >> - continue; >> - total_evicted += amdgpu_bo_print_info(id++, bo_va->base.bo, m); >> - } >> - total_evicted_objs = id; >> - id = 0; >> - >> seq_puts(m, "\tRelocated BOs:\n"); >> list_for_each_entry_safe(bo_va, tmp, &vm->relocated, base.vm_status) { >> if (!bo_va->base.bo) >> @@ -2718,8 +2707,6 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m) >> seq_printf(m, "\tTotal idle size: %12lld\tobjs:\t%d\n", total_idle, >> total_idle_objs); >> - seq_printf(m, "\tTotal evicted size: %12lld\tobjs:\t%d\n", total_evicted, >> - total_evicted_objs); >> seq_printf(m, "\tTotal relocated size: %12lld\tobjs:\t%d\n", total_relocated, >> total_relocated_objs); >> seq_printf(m, "\tTotal moved size: %12lld\tobjs:\t%d\n", total_moved, >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h >> index 204ab13184ed..d9ab97eabda9 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h >> @@ -156,6 +156,7 @@ struct amdgpu_vm_bo_base { >> /* protected by spinlock */ >> struct list_head vm_status; >> + struct list_head eviction_status; >> /* protected by the BO being reserved */ >> bool moved; >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c >> index 96d601e209b8..f78f4040f466 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c >> @@ -652,6 +652,7 @@ static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry) >> spin_lock(&entry->vm->status_lock); >> list_del(&entry->vm_status); >> + list_del(&entry->eviction_status); >> spin_unlock(&entry->vm->status_lock); >> amdgpu_bo_unref(&entry->bo); >> } >
Am 31.10.23 um 15:39 schrieb Tatsuyuki Ishi: > >> On Oct 31, 2023, at 22:55, Christian König <christian.koenig@amd.com> wrote: >> >> Am 31.10.23 um 14:40 schrieb Tatsuyuki Ishi: >>> In short, eviction never really belonged to the vm_status state machine. >> I strongly disagree to that. >> >>> Even when evicted, the BO could belong to either the moved or done state. >>> The "evicted" state needed to handle both cases, causing greater confusion. >>> >>> Additionally, there were inconsistencies in the definition of an evicted >>> BO. Some places are based on the `evict` parameter passed from the TTM move >>> callback, while the others were updated based on whether the BO got its >>> optimal placement. The second is more accurate for our use case. With this >>> refactor, the evicted state is solely determined by the second rule. >> That strongly sounds like you don't understand what the evicted state it good for. >> >> The evicted state is for page directories, page tables and per VM BOs which needs to move around before doing the next CS. >> >> Please further explain what you try to do here. > This is mainly an attempt to address inconsistency in the definition of “eviction”. The TTM move callback sets eviction when eviction happens through ttm_bo_evict. This is however not the only way a BO might end up outside its preferred domains. > > amdgpu_vm_bo_update later updates the eviction state based on whether the BO is in its preferred domains. In my understanding this includes all cases where the BO is evicted through ttm_bo_evict. Therefore we should apply this definition right from the move callback, not only after amdgpu_vm_bo_update has been called at least once. No, that is something completely separated. The evicted state just means that we need to re-validate the BO. One cause of this is that TTM moved the BO. But a different cause is that TTM moved the BO, we tried to validated it but fallen back to GTT for now and called amdgpu_vm_bo_update(). amdgpu_vm_bo_update() then moves the BO into the evicted state again so that we try to move it into VRAM on the next command submission. This is purely an optimization done to create enough pressure so that TTM can do it's work. Christian. > > Tatsuyuki. > >> Regards, >> Christian. >> >>> Signed-off-by: Tatsuyuki Ishi <ishitatsuyuki@gmail.com> >>> --- >>> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 67 +++++++++-------------- >>> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 1 + >>> drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 1 + >>> 3 files changed, 29 insertions(+), 40 deletions(-) >>> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>> index 7b9762f1cddd..dd6f72e2a1d6 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>> @@ -174,19 +174,23 @@ int amdgpu_vm_set_pasid(struct amdgpu_device *adev, struct amdgpu_vm *vm, >>> * State for PDs/PTs and per VM BOs which are not at the location they should >>> * be. >>> */ >>> -static void amdgpu_vm_bo_evicted(struct amdgpu_vm_bo_base *vm_bo) >>> +static void amdgpu_vm_bo_set_evicted(struct amdgpu_vm_bo_base *vm_bo, bool evicted) >>> { >>> struct amdgpu_vm *vm = vm_bo->vm; >>> struct amdgpu_bo *bo = vm_bo->bo; >>> - vm_bo->moved = true; >>> spin_lock(&vm_bo->vm->status_lock); >>> - if (bo->tbo.type == ttm_bo_type_kernel) >>> - list_move(&vm_bo->vm_status, &vm->evicted); >>> - else >>> - list_move_tail(&vm_bo->vm_status, &vm->evicted); >>> + if (evicted && bo->tbo.base.resv == vm->root.bo->tbo.base.resv) { >>> + if (bo->tbo.type == ttm_bo_type_kernel) >>> + list_move(&vm_bo->eviction_status, &vm->evicted); >>> + else >>> + list_move_tail(&vm_bo->eviction_status, &vm->evicted); >>> + } else { >>> + list_del_init(&vm_bo->eviction_status); >>> + } >>> spin_unlock(&vm_bo->vm->status_lock); >>> } >>> + >>> /** >>> * amdgpu_vm_bo_moved - vm_bo is moved >>> * >>> @@ -310,6 +314,7 @@ void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base, >>> base->bo = bo; >>> base->next = NULL; >>> INIT_LIST_HEAD(&base->vm_status); >>> + INIT_LIST_HEAD(&base->eviction_status); >>> if (!bo) >>> return; >>> @@ -336,7 +341,7 @@ void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base, >>> * is currently evicted. add the bo to the evicted list to make sure it >>> * is validated on next vm use to avoid fault. >>> * */ >>> - amdgpu_vm_bo_evicted(base); >>> + amdgpu_vm_bo_set_evicted(base, true); >>> } >>> /** >>> @@ -460,7 +465,7 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm, >>> while (!list_empty(&vm->evicted)) { >>> bo_base = list_first_entry(&vm->evicted, >>> struct amdgpu_vm_bo_base, >>> - vm_status); >>> + eviction_status); >>> spin_unlock(&vm->status_lock); >>> bo = bo_base->bo; >>> @@ -1034,7 +1039,7 @@ void amdgpu_vm_get_memory(struct amdgpu_vm *vm, >>> list_for_each_entry_safe(bo_va, tmp, &vm->idle, base.vm_status) >>> amdgpu_vm_bo_get_memory(bo_va, stats); >>> - list_for_each_entry_safe(bo_va, tmp, &vm->evicted, base.vm_status) >>> + list_for_each_entry_safe(bo_va, tmp, &vm->evicted, base.eviction_status) >>> amdgpu_vm_bo_get_memory(bo_va, stats); >>> list_for_each_entry_safe(bo_va, tmp, &vm->relocated, base.vm_status) >>> @@ -1153,21 +1158,10 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va, >>> return r; >>> } >>> - /* If the BO is not in its preferred location add it back to >>> - * the evicted list so that it gets validated again on the >>> - * next command submission. >>> - */ >>> - if (bo && bo->tbo.base.resv == vm->root.bo->tbo.base.resv) { >>> - uint32_t mem_type = bo->tbo.resource->mem_type; >>> - >>> - if (!(bo->preferred_domains & >>> - amdgpu_mem_type_to_domain(mem_type))) >>> - amdgpu_vm_bo_evicted(&bo_va->base); >>> - else >>> - amdgpu_vm_bo_idle(&bo_va->base); >>> - } else { >>> + if (bo && bo->tbo.base.resv == vm->root.bo->tbo.base.resv) >>> + amdgpu_vm_bo_idle(&bo_va->base); >>> + else >>> amdgpu_vm_bo_done(&bo_va->base); >>> - } >>> list_splice_init(&bo_va->invalids, &bo_va->valids); >>> bo_va->cleared = clear; >>> @@ -1883,6 +1877,7 @@ void amdgpu_vm_bo_del(struct amdgpu_device *adev, >>> spin_lock(&vm->status_lock); >>> list_del(&bo_va->base.vm_status); >>> + list_del(&bo_va->base.eviction_status); >>> spin_unlock(&vm->status_lock); >>> list_for_each_entry_safe(mapping, next, &bo_va->valids, list) { >>> @@ -1959,13 +1954,18 @@ void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev, >>> if (bo->parent && (amdgpu_bo_shadowed(bo->parent) == bo)) >>> bo = bo->parent; >>> + /* If the BO is not in its preferred location add it back to >>> + * the evicted list so that it gets validated again on the >>> + * next command submission. >>> + */ >>> + uint32_t mem_type = bo->tbo.resource->mem_type; >>> + bool suboptimal = !(bo->preferred_domains & >>> + amdgpu_mem_type_to_domain(mem_type)); >>> + >>> for (bo_base = bo->vm_bo; bo_base; bo_base = bo_base->next) { >>> struct amdgpu_vm *vm = bo_base->vm; >>> - if (evicted && bo->tbo.base.resv == vm->root.bo->tbo.base.resv) { >>> - amdgpu_vm_bo_evicted(bo_base); >>> - continue; >>> - } >>> + amdgpu_vm_bo_set_evicted(bo_base, suboptimal); >>> if (bo_base->moved) >>> continue; >>> @@ -2648,13 +2648,11 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m) >>> { >>> struct amdgpu_bo_va *bo_va, *tmp; >>> u64 total_idle = 0; >>> - u64 total_evicted = 0; >>> u64 total_relocated = 0; >>> u64 total_moved = 0; >>> u64 total_invalidated = 0; >>> u64 total_done = 0; >>> unsigned int total_idle_objs = 0; >>> - unsigned int total_evicted_objs = 0; >>> unsigned int total_relocated_objs = 0; >>> unsigned int total_moved_objs = 0; >>> unsigned int total_invalidated_objs = 0; >>> @@ -2671,15 +2669,6 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m) >>> total_idle_objs = id; >>> id = 0; >>> - seq_puts(m, "\tEvicted BOs:\n"); >>> - list_for_each_entry_safe(bo_va, tmp, &vm->evicted, base.vm_status) { >>> - if (!bo_va->base.bo) >>> - continue; >>> - total_evicted += amdgpu_bo_print_info(id++, bo_va->base.bo, m); >>> - } >>> - total_evicted_objs = id; >>> - id = 0; >>> - >>> seq_puts(m, "\tRelocated BOs:\n"); >>> list_for_each_entry_safe(bo_va, tmp, &vm->relocated, base.vm_status) { >>> if (!bo_va->base.bo) >>> @@ -2718,8 +2707,6 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m) >>> seq_printf(m, "\tTotal idle size: %12lld\tobjs:\t%d\n", total_idle, >>> total_idle_objs); >>> - seq_printf(m, "\tTotal evicted size: %12lld\tobjs:\t%d\n", total_evicted, >>> - total_evicted_objs); >>> seq_printf(m, "\tTotal relocated size: %12lld\tobjs:\t%d\n", total_relocated, >>> total_relocated_objs); >>> seq_printf(m, "\tTotal moved size: %12lld\tobjs:\t%d\n", total_moved, >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h >>> index 204ab13184ed..d9ab97eabda9 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h >>> @@ -156,6 +156,7 @@ struct amdgpu_vm_bo_base { >>> /* protected by spinlock */ >>> struct list_head vm_status; >>> + struct list_head eviction_status; >>> /* protected by the BO being reserved */ >>> bool moved; >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c >>> index 96d601e209b8..f78f4040f466 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c >>> @@ -652,6 +652,7 @@ static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry) >>> spin_lock(&entry->vm->status_lock); >>> list_del(&entry->vm_status); >>> + list_del(&entry->eviction_status); >>> spin_unlock(&entry->vm->status_lock); >>> amdgpu_bo_unref(&entry->bo); >>> }
Hi Tatsuyuki, kernel test robot noticed the following build warnings: [auto build test WARNING on drm-misc/drm-misc-next] [also build test WARNING on drm/drm-next drm-exynos/exynos-drm-next drm-intel/for-linux-next drm-intel/for-linux-next-fixes drm-tip/drm-tip linus/master v6.6 next-20231031] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/Tatsuyuki-Ishi/drm-amdgpu-Don-t-implicit-sync-PRT-maps/20231031-224530 base: git://anongit.freedesktop.org/drm/drm-misc drm-misc-next patch link: https://lore.kernel.org/r/20231031134059.171277-3-ishitatsuyuki%40gmail.com patch subject: [PATCH 2/6] drm/amdgpu: Separate eviction from VM status. config: arc-randconfig-001-20231101 (https://download.01.org/0day-ci/archive/20231101/202311010709.XbwKjVaq-lkp@intel.com/config) compiler: arceb-elf-gcc (GCC) 13.2.0 reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231101/202311010709.XbwKjVaq-lkp@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <lkp@intel.com> | Closes: https://lore.kernel.org/oe-kbuild-all/202311010709.XbwKjVaq-lkp@intel.com/ All warnings (new ones prefixed by >>): >> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c:178: warning: Function parameter or member 'evicted' not described in 'amdgpu_vm_bo_set_evicted' >> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c:178: warning: expecting prototype for amdgpu_vm_bo_evicted(). Prototype was for amdgpu_vm_bo_set_evicted() instead vim +178 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c dcb388eddb5f1b Nirmoy Das 2021-06-28 168 bcdc9fd634d1f0 Christian König 2018-08-30 169 /** bcdc9fd634d1f0 Christian König 2018-08-30 170 * amdgpu_vm_bo_evicted - vm_bo is evicted bcdc9fd634d1f0 Christian König 2018-08-30 171 * bcdc9fd634d1f0 Christian König 2018-08-30 172 * @vm_bo: vm_bo which is evicted bcdc9fd634d1f0 Christian König 2018-08-30 173 * bcdc9fd634d1f0 Christian König 2018-08-30 174 * State for PDs/PTs and per VM BOs which are not at the location they should bcdc9fd634d1f0 Christian König 2018-08-30 175 * be. bcdc9fd634d1f0 Christian König 2018-08-30 176 */ cac82290238e47 Tatsuyuki Ishi 2023-10-31 177 static void amdgpu_vm_bo_set_evicted(struct amdgpu_vm_bo_base *vm_bo, bool evicted) bcdc9fd634d1f0 Christian König 2018-08-30 @178 { bcdc9fd634d1f0 Christian König 2018-08-30 179 struct amdgpu_vm *vm = vm_bo->vm; bcdc9fd634d1f0 Christian König 2018-08-30 180 struct amdgpu_bo *bo = vm_bo->bo; bcdc9fd634d1f0 Christian König 2018-08-30 181 757eb2bedd08a1 Philip Yang 2022-09-15 182 spin_lock(&vm_bo->vm->status_lock); cac82290238e47 Tatsuyuki Ishi 2023-10-31 183 if (evicted && bo->tbo.base.resv == vm->root.bo->tbo.base.resv) { bcdc9fd634d1f0 Christian König 2018-08-30 184 if (bo->tbo.type == ttm_bo_type_kernel) cac82290238e47 Tatsuyuki Ishi 2023-10-31 185 list_move(&vm_bo->eviction_status, &vm->evicted); bcdc9fd634d1f0 Christian König 2018-08-30 186 else cac82290238e47 Tatsuyuki Ishi 2023-10-31 187 list_move_tail(&vm_bo->eviction_status, &vm->evicted); cac82290238e47 Tatsuyuki Ishi 2023-10-31 188 } else { cac82290238e47 Tatsuyuki Ishi 2023-10-31 189 list_del_init(&vm_bo->eviction_status); cac82290238e47 Tatsuyuki Ishi 2023-10-31 190 } 757eb2bedd08a1 Philip Yang 2022-09-15 191 spin_unlock(&vm_bo->vm->status_lock); bcdc9fd634d1f0 Christian König 2018-08-30 192 } cac82290238e47 Tatsuyuki Ishi 2023-10-31 193
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 7b9762f1cddd..dd6f72e2a1d6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -174,19 +174,23 @@ int amdgpu_vm_set_pasid(struct amdgpu_device *adev, struct amdgpu_vm *vm, * State for PDs/PTs and per VM BOs which are not at the location they should * be. */ -static void amdgpu_vm_bo_evicted(struct amdgpu_vm_bo_base *vm_bo) +static void amdgpu_vm_bo_set_evicted(struct amdgpu_vm_bo_base *vm_bo, bool evicted) { struct amdgpu_vm *vm = vm_bo->vm; struct amdgpu_bo *bo = vm_bo->bo; - vm_bo->moved = true; spin_lock(&vm_bo->vm->status_lock); - if (bo->tbo.type == ttm_bo_type_kernel) - list_move(&vm_bo->vm_status, &vm->evicted); - else - list_move_tail(&vm_bo->vm_status, &vm->evicted); + if (evicted && bo->tbo.base.resv == vm->root.bo->tbo.base.resv) { + if (bo->tbo.type == ttm_bo_type_kernel) + list_move(&vm_bo->eviction_status, &vm->evicted); + else + list_move_tail(&vm_bo->eviction_status, &vm->evicted); + } else { + list_del_init(&vm_bo->eviction_status); + } spin_unlock(&vm_bo->vm->status_lock); } + /** * amdgpu_vm_bo_moved - vm_bo is moved * @@ -310,6 +314,7 @@ void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base, base->bo = bo; base->next = NULL; INIT_LIST_HEAD(&base->vm_status); + INIT_LIST_HEAD(&base->eviction_status); if (!bo) return; @@ -336,7 +341,7 @@ void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base, * is currently evicted. add the bo to the evicted list to make sure it * is validated on next vm use to avoid fault. * */ - amdgpu_vm_bo_evicted(base); + amdgpu_vm_bo_set_evicted(base, true); } /** @@ -460,7 +465,7 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm, while (!list_empty(&vm->evicted)) { bo_base = list_first_entry(&vm->evicted, struct amdgpu_vm_bo_base, - vm_status); + eviction_status); spin_unlock(&vm->status_lock); bo = bo_base->bo; @@ -1034,7 +1039,7 @@ void amdgpu_vm_get_memory(struct amdgpu_vm *vm, list_for_each_entry_safe(bo_va, tmp, &vm->idle, base.vm_status) amdgpu_vm_bo_get_memory(bo_va, stats); - list_for_each_entry_safe(bo_va, tmp, &vm->evicted, base.vm_status) + list_for_each_entry_safe(bo_va, tmp, &vm->evicted, base.eviction_status) amdgpu_vm_bo_get_memory(bo_va, stats); list_for_each_entry_safe(bo_va, tmp, &vm->relocated, base.vm_status) @@ -1153,21 +1158,10 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va, return r; } - /* If the BO is not in its preferred location add it back to - * the evicted list so that it gets validated again on the - * next command submission. - */ - if (bo && bo->tbo.base.resv == vm->root.bo->tbo.base.resv) { - uint32_t mem_type = bo->tbo.resource->mem_type; - - if (!(bo->preferred_domains & - amdgpu_mem_type_to_domain(mem_type))) - amdgpu_vm_bo_evicted(&bo_va->base); - else - amdgpu_vm_bo_idle(&bo_va->base); - } else { + if (bo && bo->tbo.base.resv == vm->root.bo->tbo.base.resv) + amdgpu_vm_bo_idle(&bo_va->base); + else amdgpu_vm_bo_done(&bo_va->base); - } list_splice_init(&bo_va->invalids, &bo_va->valids); bo_va->cleared = clear; @@ -1883,6 +1877,7 @@ void amdgpu_vm_bo_del(struct amdgpu_device *adev, spin_lock(&vm->status_lock); list_del(&bo_va->base.vm_status); + list_del(&bo_va->base.eviction_status); spin_unlock(&vm->status_lock); list_for_each_entry_safe(mapping, next, &bo_va->valids, list) { @@ -1959,13 +1954,18 @@ void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev, if (bo->parent && (amdgpu_bo_shadowed(bo->parent) == bo)) bo = bo->parent; + /* If the BO is not in its preferred location add it back to + * the evicted list so that it gets validated again on the + * next command submission. + */ + uint32_t mem_type = bo->tbo.resource->mem_type; + bool suboptimal = !(bo->preferred_domains & + amdgpu_mem_type_to_domain(mem_type)); + for (bo_base = bo->vm_bo; bo_base; bo_base = bo_base->next) { struct amdgpu_vm *vm = bo_base->vm; - if (evicted && bo->tbo.base.resv == vm->root.bo->tbo.base.resv) { - amdgpu_vm_bo_evicted(bo_base); - continue; - } + amdgpu_vm_bo_set_evicted(bo_base, suboptimal); if (bo_base->moved) continue; @@ -2648,13 +2648,11 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m) { struct amdgpu_bo_va *bo_va, *tmp; u64 total_idle = 0; - u64 total_evicted = 0; u64 total_relocated = 0; u64 total_moved = 0; u64 total_invalidated = 0; u64 total_done = 0; unsigned int total_idle_objs = 0; - unsigned int total_evicted_objs = 0; unsigned int total_relocated_objs = 0; unsigned int total_moved_objs = 0; unsigned int total_invalidated_objs = 0; @@ -2671,15 +2669,6 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m) total_idle_objs = id; id = 0; - seq_puts(m, "\tEvicted BOs:\n"); - list_for_each_entry_safe(bo_va, tmp, &vm->evicted, base.vm_status) { - if (!bo_va->base.bo) - continue; - total_evicted += amdgpu_bo_print_info(id++, bo_va->base.bo, m); - } - total_evicted_objs = id; - id = 0; - seq_puts(m, "\tRelocated BOs:\n"); list_for_each_entry_safe(bo_va, tmp, &vm->relocated, base.vm_status) { if (!bo_va->base.bo) @@ -2718,8 +2707,6 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m) seq_printf(m, "\tTotal idle size: %12lld\tobjs:\t%d\n", total_idle, total_idle_objs); - seq_printf(m, "\tTotal evicted size: %12lld\tobjs:\t%d\n", total_evicted, - total_evicted_objs); seq_printf(m, "\tTotal relocated size: %12lld\tobjs:\t%d\n", total_relocated, total_relocated_objs); seq_printf(m, "\tTotal moved size: %12lld\tobjs:\t%d\n", total_moved, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 204ab13184ed..d9ab97eabda9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -156,6 +156,7 @@ struct amdgpu_vm_bo_base { /* protected by spinlock */ struct list_head vm_status; + struct list_head eviction_status; /* protected by the BO being reserved */ bool moved; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c index 96d601e209b8..f78f4040f466 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c @@ -652,6 +652,7 @@ static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry) spin_lock(&entry->vm->status_lock); list_del(&entry->vm_status); + list_del(&entry->eviction_status); spin_unlock(&entry->vm->status_lock); amdgpu_bo_unref(&entry->bo); }
In short, eviction never really belonged to the vm_status state machine. Even when evicted, the BO could belong to either the moved or done state. The "evicted" state needed to handle both cases, causing greater confusion. Additionally, there were inconsistencies in the definition of an evicted BO. Some places are based on the `evict` parameter passed from the TTM move callback, while the others were updated based on whether the BO got its optimal placement. The second is more accurate for our use case. With this refactor, the evicted state is solely determined by the second rule. Signed-off-by: Tatsuyuki Ishi <ishitatsuyuki@gmail.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 67 +++++++++-------------- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 1 + 3 files changed, 29 insertions(+), 40 deletions(-)