diff mbox series

[v5,6/6] drm/amdgpu: Avoid HW reset if guilty job already signaled.

Message ID 1555599624-12285-6-git-send-email-andrey.grodzovsky@amd.com (mailing list archive)
State New, archived
Headers show
Series [v5,1/6] drm/amd/display: wait for fence without holding reservation lock | expand

Commit Message

Andrey Grodzovsky April 18, 2019, 3 p.m. UTC
Also reject TDRs if another one is already running.

v2:
Stop all schedulers across device and entire XGMI hive before
force signaling HW fences.
Avoid passing job_signaled to helper functions to keep all the decision
making about skipping HW reset in one place.

v3:
Fix SW sched. hang after non HW reset. sched.hw_rq_count has to be balanced
against its decrement in drm_sched_stop in the non HW reset case.
v4: rebase
v5: Revert v3 as we do it now in scheduler code.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 143 +++++++++++++++++++----------
 1 file changed, 95 insertions(+), 48 deletions(-)

Comments

Andrey Grodzovsky April 22, 2019, 11:54 a.m. UTC | #1
Ping for patches 3, new patch 5 and patch 6.

Andrey

On 4/18/19 11:00 AM, Andrey Grodzovsky wrote:
> Also reject TDRs if another one already running.
>
> v2:
> Stop all schedulers across device and entire XGMI hive before
> force signaling HW fences.
> Avoid passing job_signaled to helper fnctions to keep all the decision
> making about skipping HW reset in one place.
>
> v3:
> Fix SW sched. hang after non HW reset. sched.hw_rq_count has to be balanced
> against it's decrement in drm_sched_stop in non HW reset case.
> v4: rebase
> v5: Revert v3 as we do it now in sceduler code.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 143 +++++++++++++++++++----------
>   1 file changed, 95 insertions(+), 48 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index a0e165c..85f8792 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3334,8 +3334,6 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>   		if (!ring || !ring->sched.thread)
>   			continue;
>   
> -		drm_sched_stop(&ring->sched, &job->base);
> -
>   		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>   		amdgpu_fence_driver_force_completion(ring);
>   	}
> @@ -3343,6 +3341,7 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>   	if(job)
>   		drm_sched_increase_karma(&job->base);
>   
> +	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
>   	if (!amdgpu_sriov_vf(adev)) {
>   
>   		if (!need_full_reset)
> @@ -3480,37 +3479,21 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>   	return r;
>   }
>   
> -static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
> +static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
>   {
> -	int i;
> -
> -	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> -		struct amdgpu_ring *ring = adev->rings[i];
> -
> -		if (!ring || !ring->sched.thread)
> -			continue;
> -
> -		if (!adev->asic_reset_res)
> -			drm_sched_resubmit_jobs(&ring->sched);
> +	if (trylock) {
> +		if (!mutex_trylock(&adev->lock_reset))
> +			return false;
> +	} else
> +		mutex_lock(&adev->lock_reset);
>   
> -		drm_sched_start(&ring->sched, !adev->asic_reset_res);
> -	}
> -
> -	if (!amdgpu_device_has_dc_support(adev)) {
> -		drm_helper_resume_force_mode(adev->ddev);
> -	}
> -
> -	adev->asic_reset_res = 0;
> -}
> -
> -static void amdgpu_device_lock_adev(struct amdgpu_device *adev)
> -{
> -	mutex_lock(&adev->lock_reset);
>   	atomic_inc(&adev->gpu_reset_counter);
>   	adev->in_gpu_reset = 1;
>   	/* Block kfd: SRIOV would do it separately */
>   	if (!amdgpu_sriov_vf(adev))
>                   amdgpu_amdkfd_pre_reset(adev);
> +
> +	return true;
>   }
>   
>   static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
> @@ -3538,40 +3521,42 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>   int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   			      struct amdgpu_job *job)
>   {
> -	int r;
> +	struct list_head device_list, *device_list_handle =  NULL;
> +	bool need_full_reset, job_signaled;
>   	struct amdgpu_hive_info *hive = NULL;
> -	bool need_full_reset = false;
>   	struct amdgpu_device *tmp_adev = NULL;
> -	struct list_head device_list, *device_list_handle =  NULL;
> +	int i, r = 0;
>   
> +	need_full_reset = job_signaled = false;
>   	INIT_LIST_HEAD(&device_list);
>   
>   	dev_info(adev->dev, "GPU reset begin!\n");
>   
> +	hive = amdgpu_get_xgmi_hive(adev, false);
> +
>   	/*
> -	 * In case of XGMI hive disallow concurrent resets to be triggered
> -	 * by different nodes. No point also since the one node already executing
> -	 * reset will also reset all the other nodes in the hive.
> +	 * Here we trylock to avoid chain of resets executing from
> +	 * either trigger by jobs on different adevs in XGMI hive or jobs on
> +	 * different schedulers for same device while this TO handler is running.
> +	 * We always reset all schedulers for device and all devices for XGMI
> +	 * hive so that should take care of them too.
>   	 */
> -	hive = amdgpu_get_xgmi_hive(adev, 0);
> -	if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
> -	    !mutex_trylock(&hive->reset_lock))
> +
> +	if (hive && !mutex_trylock(&hive->reset_lock)) {
> +		DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
> +			 job->base.id, hive->hive_id);
>   		return 0;
> +	}
>   
>   	/* Start with adev pre asic reset first for soft reset check.*/
> -	amdgpu_device_lock_adev(adev);
> -	r = amdgpu_device_pre_asic_reset(adev,
> -					 job,
> -					 &need_full_reset);
> -	if (r) {
> -		/*TODO Should we stop ?*/
> -		DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
> -			  r, adev->ddev->unique);
> -		adev->asic_reset_res = r;
> +	if (!amdgpu_device_lock_adev(adev, !hive)) {
> +		DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
> +					 job->base.id);
> +		return 0;
>   	}
>   
>   	/* Build list of devices to reset */
> -	if  (need_full_reset && adev->gmc.xgmi.num_physical_nodes > 1) {
> +	if  (adev->gmc.xgmi.num_physical_nodes > 1) {
>   		if (!hive) {
>   			amdgpu_device_unlock_adev(adev);
>   			return -ENODEV;
> @@ -3588,13 +3573,56 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   		device_list_handle = &device_list;
>   	}
>   
> +	/* block all schedulers and reset given job's ring */
> +	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
> +		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> +			struct amdgpu_ring *ring = tmp_adev->rings[i];
> +
> +			if (!ring || !ring->sched.thread)
> +				continue;
> +
> +			drm_sched_stop(&ring->sched, &job->base);
> +		}
> +	}
> +
> +
> +	/*
> +	 * Must check guilty signal here since after this point all old
> +	 * HW fences are force signaled.
> +	 *
> +	 * job->base holds a reference to parent fence
> +	 */
> +	if (job && job->base.s_fence->parent &&
> +	    dma_fence_is_signaled(job->base.s_fence->parent))
> +		job_signaled = true;
> +
> +	if (!amdgpu_device_ip_need_full_reset(adev))
> +		device_list_handle = &device_list;
> +
> +	if (job_signaled) {
> +		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
> +		goto skip_hw_reset;
> +	}
> +
> +
> +	/* Guilty job will be freed after this*/
> +	r = amdgpu_device_pre_asic_reset(adev,
> +					 job,
> +					 &need_full_reset);
> +	if (r) {
> +		/*TODO Should we stop ?*/
> +		DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
> +			  r, adev->ddev->unique);
> +		adev->asic_reset_res = r;
> +	}
> +
>   retry:	/* Rest of adevs pre asic reset from XGMI hive. */
>   	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>   
>   		if (tmp_adev == adev)
>   			continue;
>   
> -		amdgpu_device_lock_adev(tmp_adev);
> +		amdgpu_device_lock_adev(tmp_adev, false);
>   		r = amdgpu_device_pre_asic_reset(tmp_adev,
>   						 NULL,
>   						 &need_full_reset);
> @@ -3618,9 +3646,28 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   			goto retry;
>   	}
>   
> +skip_hw_reset:
> +
>   	/* Post ASIC reset for all devs .*/
>   	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
> -		amdgpu_device_post_asic_reset(tmp_adev);
> +		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> +			struct amdgpu_ring *ring = tmp_adev->rings[i];
> +
> +			if (!ring || !ring->sched.thread)
> +				continue;
> +
> +			/* No point to resubmit jobs if we didn't HW reset*/
> +			if (!tmp_adev->asic_reset_res && !job_signaled)
> +				drm_sched_resubmit_jobs(&ring->sched);
> +
> +			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
> +		}
> +
> +		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
> +			drm_helper_resume_force_mode(tmp_adev->ddev);
> +		}
> +
> +		tmp_adev->asic_reset_res = 0;
>   
>   		if (r) {
>   			/* bad news, how to tell it to userspace ? */
> @@ -3633,7 +3680,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   		amdgpu_device_unlock_adev(tmp_adev);
>   	}
>   
> -	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
> +	if (hive)
>   		mutex_unlock(&hive->reset_lock);
>   
>   	if (r)
Chunming Zhou April 22, 2019, 1:09 p.m. UTC | #2
+Monk.

GPU reset is used widely in SRIOV, so we need a virtualization guy to take a look.

But out of curiosity, why can the guilty job still signal if it is already
set to guilty? Was it set wrongly?


-David

在 2019/4/18 23:00, Andrey Grodzovsky 写道:
> Also reject TDRs if another one already running.
>
> v2:
> Stop all schedulers across device and entire XGMI hive before
> force signaling HW fences.
> Avoid passing job_signaled to helper fnctions to keep all the decision
> making about skipping HW reset in one place.
>
> v3:
> Fix SW sched. hang after non HW reset. sched.hw_rq_count has to be balanced
> against it's decrement in drm_sched_stop in non HW reset case.
> v4: rebase
> v5: Revert v3 as we do it now in sceduler code.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 143 +++++++++++++++++++----------
>   1 file changed, 95 insertions(+), 48 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index a0e165c..85f8792 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3334,8 +3334,6 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>   		if (!ring || !ring->sched.thread)
>   			continue;
>   
> -		drm_sched_stop(&ring->sched, &job->base);
> -
>   		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>   		amdgpu_fence_driver_force_completion(ring);
>   	}
> @@ -3343,6 +3341,7 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>   	if(job)
>   		drm_sched_increase_karma(&job->base);
>   
> +	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
>   	if (!amdgpu_sriov_vf(adev)) {
>   
>   		if (!need_full_reset)
> @@ -3480,37 +3479,21 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>   	return r;
>   }
>   
> -static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
> +static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
>   {
> -	int i;
> -
> -	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> -		struct amdgpu_ring *ring = adev->rings[i];
> -
> -		if (!ring || !ring->sched.thread)
> -			continue;
> -
> -		if (!adev->asic_reset_res)
> -			drm_sched_resubmit_jobs(&ring->sched);
> +	if (trylock) {
> +		if (!mutex_trylock(&adev->lock_reset))
> +			return false;
> +	} else
> +		mutex_lock(&adev->lock_reset);
>   
> -		drm_sched_start(&ring->sched, !adev->asic_reset_res);
> -	}
> -
> -	if (!amdgpu_device_has_dc_support(adev)) {
> -		drm_helper_resume_force_mode(adev->ddev);
> -	}
> -
> -	adev->asic_reset_res = 0;
> -}
> -
> -static void amdgpu_device_lock_adev(struct amdgpu_device *adev)
> -{
> -	mutex_lock(&adev->lock_reset);
>   	atomic_inc(&adev->gpu_reset_counter);
>   	adev->in_gpu_reset = 1;
>   	/* Block kfd: SRIOV would do it separately */
>   	if (!amdgpu_sriov_vf(adev))
>                   amdgpu_amdkfd_pre_reset(adev);
> +
> +	return true;
>   }
>   
>   static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
> @@ -3538,40 +3521,42 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>   int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   			      struct amdgpu_job *job)
>   {
> -	int r;
> +	struct list_head device_list, *device_list_handle =  NULL;
> +	bool need_full_reset, job_signaled;
>   	struct amdgpu_hive_info *hive = NULL;
> -	bool need_full_reset = false;
>   	struct amdgpu_device *tmp_adev = NULL;
> -	struct list_head device_list, *device_list_handle =  NULL;
> +	int i, r = 0;
>   
> +	need_full_reset = job_signaled = false;
>   	INIT_LIST_HEAD(&device_list);
>   
>   	dev_info(adev->dev, "GPU reset begin!\n");
>   
> +	hive = amdgpu_get_xgmi_hive(adev, false);
> +
>   	/*
> -	 * In case of XGMI hive disallow concurrent resets to be triggered
> -	 * by different nodes. No point also since the one node already executing
> -	 * reset will also reset all the other nodes in the hive.
> +	 * Here we trylock to avoid chain of resets executing from
> +	 * either trigger by jobs on different adevs in XGMI hive or jobs on
> +	 * different schedulers for same device while this TO handler is running.
> +	 * We always reset all schedulers for device and all devices for XGMI
> +	 * hive so that should take care of them too.
>   	 */
> -	hive = amdgpu_get_xgmi_hive(adev, 0);
> -	if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
> -	    !mutex_trylock(&hive->reset_lock))
> +
> +	if (hive && !mutex_trylock(&hive->reset_lock)) {
> +		DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
> +			 job->base.id, hive->hive_id);
>   		return 0;
> +	}
>   
>   	/* Start with adev pre asic reset first for soft reset check.*/
> -	amdgpu_device_lock_adev(adev);
> -	r = amdgpu_device_pre_asic_reset(adev,
> -					 job,
> -					 &need_full_reset);
> -	if (r) {
> -		/*TODO Should we stop ?*/
> -		DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
> -			  r, adev->ddev->unique);
> -		adev->asic_reset_res = r;
> +	if (!amdgpu_device_lock_adev(adev, !hive)) {
> +		DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
> +					 job->base.id);
> +		return 0;
>   	}
>   
>   	/* Build list of devices to reset */
> -	if  (need_full_reset && adev->gmc.xgmi.num_physical_nodes > 1) {
> +	if  (adev->gmc.xgmi.num_physical_nodes > 1) {
>   		if (!hive) {
>   			amdgpu_device_unlock_adev(adev);
>   			return -ENODEV;
> @@ -3588,13 +3573,56 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   		device_list_handle = &device_list;
>   	}
>   
> +	/* block all schedulers and reset given job's ring */
> +	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
> +		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> +			struct amdgpu_ring *ring = tmp_adev->rings[i];
> +
> +			if (!ring || !ring->sched.thread)
> +				continue;
> +
> +			drm_sched_stop(&ring->sched, &job->base);
> +		}
> +	}
> +
> +
> +	/*
> +	 * Must check guilty signal here since after this point all old
> +	 * HW fences are force signaled.
> +	 *
> +	 * job->base holds a reference to parent fence
> +	 */
> +	if (job && job->base.s_fence->parent &&
> +	    dma_fence_is_signaled(job->base.s_fence->parent))
> +		job_signaled = true;
> +
> +	if (!amdgpu_device_ip_need_full_reset(adev))
> +		device_list_handle = &device_list;
> +
> +	if (job_signaled) {
> +		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
> +		goto skip_hw_reset;
> +	}
> +
> +
> +	/* Guilty job will be freed after this*/
> +	r = amdgpu_device_pre_asic_reset(adev,
> +					 job,
> +					 &need_full_reset);
> +	if (r) {
> +		/*TODO Should we stop ?*/
> +		DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
> +			  r, adev->ddev->unique);
> +		adev->asic_reset_res = r;
> +	}
> +
>   retry:	/* Rest of adevs pre asic reset from XGMI hive. */
>   	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>   
>   		if (tmp_adev == adev)
>   			continue;
>   
> -		amdgpu_device_lock_adev(tmp_adev);
> +		amdgpu_device_lock_adev(tmp_adev, false);
>   		r = amdgpu_device_pre_asic_reset(tmp_adev,
>   						 NULL,
>   						 &need_full_reset);
> @@ -3618,9 +3646,28 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   			goto retry;
>   	}
>   
> +skip_hw_reset:
> +
>   	/* Post ASIC reset for all devs .*/
>   	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
> -		amdgpu_device_post_asic_reset(tmp_adev);
> +		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> +			struct amdgpu_ring *ring = tmp_adev->rings[i];
> +
> +			if (!ring || !ring->sched.thread)
> +				continue;
> +
> +			/* No point to resubmit jobs if we didn't HW reset*/
> +			if (!tmp_adev->asic_reset_res && !job_signaled)
> +				drm_sched_resubmit_jobs(&ring->sched);
> +
> +			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
> +		}
> +
> +		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
> +			drm_helper_resume_force_mode(tmp_adev->ddev);
> +		}
> +
> +		tmp_adev->asic_reset_res = 0;
>   
>   		if (r) {
>   			/* bad news, how to tell it to userspace ? */
> @@ -3633,7 +3680,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   		amdgpu_device_unlock_adev(tmp_adev);
>   	}
>   
> -	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
> +	if (hive)
>   		mutex_unlock(&hive->reset_lock);
>   
>   	if (r)
Christian König April 23, 2019, 12:32 p.m. UTC | #3
Well you at least have to give me time till after the holidays to get 
going again :)

Not sure exactly jet why we need patch number 5.

And we should probably commit patch #1 and #2.

Christian.

Am 22.04.19 um 13:54 schrieb Grodzovsky, Andrey:
> Ping for patches 3, new patch 5 and patch 6.
>
> Andrey
>
> On 4/18/19 11:00 AM, Andrey Grodzovsky wrote:
>> Also reject TDRs if another one already running.
>>
>> v2:
>> Stop all schedulers across device and entire XGMI hive before
>> force signaling HW fences.
>> Avoid passing job_signaled to helper fnctions to keep all the decision
>> making about skipping HW reset in one place.
>>
>> v3:
>> Fix SW sched. hang after non HW reset. sched.hw_rq_count has to be balanced
>> against it's decrement in drm_sched_stop in non HW reset case.
>> v4: rebase
>> v5: Revert v3 as we do it now in sceduler code.
>>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 143 +++++++++++++++++++----------
>>    1 file changed, 95 insertions(+), 48 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index a0e165c..85f8792 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3334,8 +3334,6 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>    		if (!ring || !ring->sched.thread)
>>    			continue;
>>    
>> -		drm_sched_stop(&ring->sched, &job->base);
>> -
>>    		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>>    		amdgpu_fence_driver_force_completion(ring);
>>    	}
>> @@ -3343,6 +3341,7 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>    	if(job)
>>    		drm_sched_increase_karma(&job->base);
>>    
>> +	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
>>    	if (!amdgpu_sriov_vf(adev)) {
>>    
>>    		if (!need_full_reset)
>> @@ -3480,37 +3479,21 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>    	return r;
>>    }
>>    
>> -static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
>> +static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
>>    {
>> -	int i;
>> -
>> -	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> -		struct amdgpu_ring *ring = adev->rings[i];
>> -
>> -		if (!ring || !ring->sched.thread)
>> -			continue;
>> -
>> -		if (!adev->asic_reset_res)
>> -			drm_sched_resubmit_jobs(&ring->sched);
>> +	if (trylock) {
>> +		if (!mutex_trylock(&adev->lock_reset))
>> +			return false;
>> +	} else
>> +		mutex_lock(&adev->lock_reset);
>>    
>> -		drm_sched_start(&ring->sched, !adev->asic_reset_res);
>> -	}
>> -
>> -	if (!amdgpu_device_has_dc_support(adev)) {
>> -		drm_helper_resume_force_mode(adev->ddev);
>> -	}
>> -
>> -	adev->asic_reset_res = 0;
>> -}
>> -
>> -static void amdgpu_device_lock_adev(struct amdgpu_device *adev)
>> -{
>> -	mutex_lock(&adev->lock_reset);
>>    	atomic_inc(&adev->gpu_reset_counter);
>>    	adev->in_gpu_reset = 1;
>>    	/* Block kfd: SRIOV would do it separately */
>>    	if (!amdgpu_sriov_vf(adev))
>>                    amdgpu_amdkfd_pre_reset(adev);
>> +
>> +	return true;
>>    }
>>    
>>    static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>> @@ -3538,40 +3521,42 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>>    int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>    			      struct amdgpu_job *job)
>>    {
>> -	int r;
>> +	struct list_head device_list, *device_list_handle =  NULL;
>> +	bool need_full_reset, job_signaled;
>>    	struct amdgpu_hive_info *hive = NULL;
>> -	bool need_full_reset = false;
>>    	struct amdgpu_device *tmp_adev = NULL;
>> -	struct list_head device_list, *device_list_handle =  NULL;
>> +	int i, r = 0;
>>    
>> +	need_full_reset = job_signaled = false;
>>    	INIT_LIST_HEAD(&device_list);
>>    
>>    	dev_info(adev->dev, "GPU reset begin!\n");
>>    
>> +	hive = amdgpu_get_xgmi_hive(adev, false);
>> +
>>    	/*
>> -	 * In case of XGMI hive disallow concurrent resets to be triggered
>> -	 * by different nodes. No point also since the one node already executing
>> -	 * reset will also reset all the other nodes in the hive.
>> +	 * Here we trylock to avoid chain of resets executing from
>> +	 * either trigger by jobs on different adevs in XGMI hive or jobs on
>> +	 * different schedulers for same device while this TO handler is running.
>> +	 * We always reset all schedulers for device and all devices for XGMI
>> +	 * hive so that should take care of them too.
>>    	 */
>> -	hive = amdgpu_get_xgmi_hive(adev, 0);
>> -	if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
>> -	    !mutex_trylock(&hive->reset_lock))
>> +
>> +	if (hive && !mutex_trylock(&hive->reset_lock)) {
>> +		DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
>> +			 job->base.id, hive->hive_id);
>>    		return 0;
>> +	}
>>    
>>    	/* Start with adev pre asic reset first for soft reset check.*/
>> -	amdgpu_device_lock_adev(adev);
>> -	r = amdgpu_device_pre_asic_reset(adev,
>> -					 job,
>> -					 &need_full_reset);
>> -	if (r) {
>> -		/*TODO Should we stop ?*/
>> -		DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
>> -			  r, adev->ddev->unique);
>> -		adev->asic_reset_res = r;
>> +	if (!amdgpu_device_lock_adev(adev, !hive)) {
>> +		DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
>> +					 job->base.id);
>> +		return 0;
>>    	}
>>    
>>    	/* Build list of devices to reset */
>> -	if  (need_full_reset && adev->gmc.xgmi.num_physical_nodes > 1) {
>> +	if  (adev->gmc.xgmi.num_physical_nodes > 1) {
>>    		if (!hive) {
>>    			amdgpu_device_unlock_adev(adev);
>>    			return -ENODEV;
>> @@ -3588,13 +3573,56 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>    		device_list_handle = &device_list;
>>    	}
>>    
>> +	/* block all schedulers and reset given job's ring */
>> +	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>> +		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> +			struct amdgpu_ring *ring = tmp_adev->rings[i];
>> +
>> +			if (!ring || !ring->sched.thread)
>> +				continue;
>> +
>> +			drm_sched_stop(&ring->sched, &job->base);
>> +		}
>> +	}
>> +
>> +
>> +	/*
>> +	 * Must check guilty signal here since after this point all old
>> +	 * HW fences are force signaled.
>> +	 *
>> +	 * job->base holds a reference to parent fence
>> +	 */
>> +	if (job && job->base.s_fence->parent &&
>> +	    dma_fence_is_signaled(job->base.s_fence->parent))
>> +		job_signaled = true;
>> +
>> +	if (!amdgpu_device_ip_need_full_reset(adev))
>> +		device_list_handle = &device_list;
>> +
>> +	if (job_signaled) {
>> +		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
>> +		goto skip_hw_reset;
>> +	}
>> +
>> +
>> +	/* Guilty job will be freed after this*/
>> +	r = amdgpu_device_pre_asic_reset(adev,
>> +					 job,
>> +					 &need_full_reset);
>> +	if (r) {
>> +		/*TODO Should we stop ?*/
>> +		DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
>> +			  r, adev->ddev->unique);
>> +		adev->asic_reset_res = r;
>> +	}
>> +
>>    retry:	/* Rest of adevs pre asic reset from XGMI hive. */
>>    	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>    
>>    		if (tmp_adev == adev)
>>    			continue;
>>    
>> -		amdgpu_device_lock_adev(tmp_adev);
>> +		amdgpu_device_lock_adev(tmp_adev, false);
>>    		r = amdgpu_device_pre_asic_reset(tmp_adev,
>>    						 NULL,
>>    						 &need_full_reset);
>> @@ -3618,9 +3646,28 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>    			goto retry;
>>    	}
>>    
>> +skip_hw_reset:
>> +
>>    	/* Post ASIC reset for all devs .*/
>>    	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>> -		amdgpu_device_post_asic_reset(tmp_adev);
>> +		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> +			struct amdgpu_ring *ring = tmp_adev->rings[i];
>> +
>> +			if (!ring || !ring->sched.thread)
>> +				continue;
>> +
>> +			/* No point to resubmit jobs if we didn't HW reset*/
>> +			if (!tmp_adev->asic_reset_res && !job_signaled)
>> +				drm_sched_resubmit_jobs(&ring->sched);
>> +
>> +			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>> +		}
>> +
>> +		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
>> +			drm_helper_resume_force_mode(tmp_adev->ddev);
>> +		}
>> +
>> +		tmp_adev->asic_reset_res = 0;
>>    
>>    		if (r) {
>>    			/* bad news, how to tell it to userspace ? */
>> @@ -3633,7 +3680,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>    		amdgpu_device_unlock_adev(tmp_adev);
>>    	}
>>    
>> -	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
>> +	if (hive)
>>    		mutex_unlock(&hive->reset_lock);
>>    
>>    	if (r)
Kazlauskas, Nicholas April 23, 2019, 1:14 p.m. UTC | #4
Feel free to merge 1+2 since they don't really depend on any other work 
in the series and they were previously reviewed.

Nicholas Kazlauskas

On 4/23/19 8:32 AM, Koenig, Christian wrote:
> Well you at least have to give me time till after the holidays to get
> going again :)
> 
> Not sure exactly jet why we need patch number 5.
> 
> And we should probably commit patch #1 and #2.
> 
> Christian.
> 
> Am 22.04.19 um 13:54 schrieb Grodzovsky, Andrey:
>> Ping for patches 3, new patch 5 and patch 6.
>>
>> Andrey
>>
>> On 4/18/19 11:00 AM, Andrey Grodzovsky wrote:
>>> Also reject TDRs if another one already running.
>>>
>>> v2:
>>> Stop all schedulers across device and entire XGMI hive before
>>> force signaling HW fences.
>>> Avoid passing job_signaled to helper fnctions to keep all the decision
>>> making about skipping HW reset in one place.
>>>
>>> v3:
>>> Fix SW sched. hang after non HW reset. sched.hw_rq_count has to be balanced
>>> against it's decrement in drm_sched_stop in non HW reset case.
>>> v4: rebase
>>> v5: Revert v3 as we do it now in sceduler code.
>>>
>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 143 +++++++++++++++++++----------
>>>     1 file changed, 95 insertions(+), 48 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index a0e165c..85f8792 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -3334,8 +3334,6 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>     		if (!ring || !ring->sched.thread)
>>>     			continue;
>>>     
>>> -		drm_sched_stop(&ring->sched, &job->base);
>>> -
>>>     		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>>>     		amdgpu_fence_driver_force_completion(ring);
>>>     	}
>>> @@ -3343,6 +3341,7 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>     	if(job)
>>>     		drm_sched_increase_karma(&job->base);
>>>     
>>> +	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
>>>     	if (!amdgpu_sriov_vf(adev)) {
>>>     
>>>     		if (!need_full_reset)
>>> @@ -3480,37 +3479,21 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>     	return r;
>>>     }
>>>     
>>> -static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
>>> +static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
>>>     {
>>> -	int i;
>>> -
>>> -	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> -		struct amdgpu_ring *ring = adev->rings[i];
>>> -
>>> -		if (!ring || !ring->sched.thread)
>>> -			continue;
>>> -
>>> -		if (!adev->asic_reset_res)
>>> -			drm_sched_resubmit_jobs(&ring->sched);
>>> +	if (trylock) {
>>> +		if (!mutex_trylock(&adev->lock_reset))
>>> +			return false;
>>> +	} else
>>> +		mutex_lock(&adev->lock_reset);
>>>     
>>> -		drm_sched_start(&ring->sched, !adev->asic_reset_res);
>>> -	}
>>> -
>>> -	if (!amdgpu_device_has_dc_support(adev)) {
>>> -		drm_helper_resume_force_mode(adev->ddev);
>>> -	}
>>> -
>>> -	adev->asic_reset_res = 0;
>>> -}
>>> -
>>> -static void amdgpu_device_lock_adev(struct amdgpu_device *adev)
>>> -{
>>> -	mutex_lock(&adev->lock_reset);
>>>     	atomic_inc(&adev->gpu_reset_counter);
>>>     	adev->in_gpu_reset = 1;
>>>     	/* Block kfd: SRIOV would do it separately */
>>>     	if (!amdgpu_sriov_vf(adev))
>>>                     amdgpu_amdkfd_pre_reset(adev);
>>> +
>>> +	return true;
>>>     }
>>>     
>>>     static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>>> @@ -3538,40 +3521,42 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>>>     int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>     			      struct amdgpu_job *job)
>>>     {
>>> -	int r;
>>> +	struct list_head device_list, *device_list_handle =  NULL;
>>> +	bool need_full_reset, job_signaled;
>>>     	struct amdgpu_hive_info *hive = NULL;
>>> -	bool need_full_reset = false;
>>>     	struct amdgpu_device *tmp_adev = NULL;
>>> -	struct list_head device_list, *device_list_handle =  NULL;
>>> +	int i, r = 0;
>>>     
>>> +	need_full_reset = job_signaled = false;
>>>     	INIT_LIST_HEAD(&device_list);
>>>     
>>>     	dev_info(adev->dev, "GPU reset begin!\n");
>>>     
>>> +	hive = amdgpu_get_xgmi_hive(adev, false);
>>> +
>>>     	/*
>>> -	 * In case of XGMI hive disallow concurrent resets to be triggered
>>> -	 * by different nodes. No point also since the one node already executing
>>> -	 * reset will also reset all the other nodes in the hive.
>>> +	 * Here we trylock to avoid chain of resets executing from
>>> +	 * either trigger by jobs on different adevs in XGMI hive or jobs on
>>> +	 * different schedulers for same device while this TO handler is running.
>>> +	 * We always reset all schedulers for device and all devices for XGMI
>>> +	 * hive so that should take care of them too.
>>>     	 */
>>> -	hive = amdgpu_get_xgmi_hive(adev, 0);
>>> -	if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
>>> -	    !mutex_trylock(&hive->reset_lock))
>>> +
>>> +	if (hive && !mutex_trylock(&hive->reset_lock)) {
>>> +		DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
>>> +			 job->base.id, hive->hive_id);
>>>     		return 0;
>>> +	}
>>>     
>>>     	/* Start with adev pre asic reset first for soft reset check.*/
>>> -	amdgpu_device_lock_adev(adev);
>>> -	r = amdgpu_device_pre_asic_reset(adev,
>>> -					 job,
>>> -					 &need_full_reset);
>>> -	if (r) {
>>> -		/*TODO Should we stop ?*/
>>> -		DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
>>> -			  r, adev->ddev->unique);
>>> -		adev->asic_reset_res = r;
>>> +	if (!amdgpu_device_lock_adev(adev, !hive)) {
>>> +		DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
>>> +					 job->base.id);
>>> +		return 0;
>>>     	}
>>>     
>>>     	/* Build list of devices to reset */
>>> -	if  (need_full_reset && adev->gmc.xgmi.num_physical_nodes > 1) {
>>> +	if  (adev->gmc.xgmi.num_physical_nodes > 1) {
>>>     		if (!hive) {
>>>     			amdgpu_device_unlock_adev(adev);
>>>     			return -ENODEV;
>>> @@ -3588,13 +3573,56 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>     		device_list_handle = &device_list;
>>>     	}
>>>     
>>> +	/* block all schedulers and reset given job's ring */
>>> +	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>> +		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> +			struct amdgpu_ring *ring = tmp_adev->rings[i];
>>> +
>>> +			if (!ring || !ring->sched.thread)
>>> +				continue;
>>> +
>>> +			drm_sched_stop(&ring->sched, &job->base);
>>> +		}
>>> +	}
>>> +
>>> +
>>> +	/*
>>> +	 * Must check guilty signal here since after this point all old
>>> +	 * HW fences are force signaled.
>>> +	 *
>>> +	 * job->base holds a reference to parent fence
>>> +	 */
>>> +	if (job && job->base.s_fence->parent &&
>>> +	    dma_fence_is_signaled(job->base.s_fence->parent))
>>> +		job_signaled = true;
>>> +
>>> +	if (!amdgpu_device_ip_need_full_reset(adev))
>>> +		device_list_handle = &device_list;
>>> +
>>> +	if (job_signaled) {
>>> +		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
>>> +		goto skip_hw_reset;
>>> +	}
>>> +
>>> +
>>> +	/* Guilty job will be freed after this*/
>>> +	r = amdgpu_device_pre_asic_reset(adev,
>>> +					 job,
>>> +					 &need_full_reset);
>>> +	if (r) {
>>> +		/*TODO Should we stop ?*/
>>> +		DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
>>> +			  r, adev->ddev->unique);
>>> +		adev->asic_reset_res = r;
>>> +	}
>>> +
>>>     retry:	/* Rest of adevs pre asic reset from XGMI hive. */
>>>     	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>>     
>>>     		if (tmp_adev == adev)
>>>     			continue;
>>>     
>>> -		amdgpu_device_lock_adev(tmp_adev);
>>> +		amdgpu_device_lock_adev(tmp_adev, false);
>>>     		r = amdgpu_device_pre_asic_reset(tmp_adev,
>>>     						 NULL,
>>>     						 &need_full_reset);
>>> @@ -3618,9 +3646,28 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>     			goto retry;
>>>     	}
>>>     
>>> +skip_hw_reset:
>>> +
>>>     	/* Post ASIC reset for all devs .*/
>>>     	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>> -		amdgpu_device_post_asic_reset(tmp_adev);
>>> +		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> +			struct amdgpu_ring *ring = tmp_adev->rings[i];
>>> +
>>> +			if (!ring || !ring->sched.thread)
>>> +				continue;
>>> +
>>> +			/* No point to resubmit jobs if we didn't HW reset*/
>>> +			if (!tmp_adev->asic_reset_res && !job_signaled)
>>> +				drm_sched_resubmit_jobs(&ring->sched);
>>> +
>>> +			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>>> +		}
>>> +
>>> +		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
>>> +			drm_helper_resume_force_mode(tmp_adev->ddev);
>>> +		}
>>> +
>>> +		tmp_adev->asic_reset_res = 0;
>>>     
>>>     		if (r) {
>>>     			/* bad news, how to tell it to userspace ? */
>>> @@ -3633,7 +3680,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>     		amdgpu_device_unlock_adev(tmp_adev);
>>>     	}
>>>     
>>> -	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
>>> +	if (hive)
>>>     		mutex_unlock(&hive->reset_lock);
>>>     
>>>     	if (r)
> 
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel
>
Andrey Grodzovsky April 23, 2019, 2:03 p.m. UTC | #5
OK, I will merge them into amd-staging drm-next.

Andrey

On 4/23/19 9:14 AM, Kazlauskas, Nicholas wrote:
> Feel free to merge 1+2 since they don't really depend on any other work
> in the series and they were previously reviewed.
>
> Nicholas Kazlauskas
>
> On 4/23/19 8:32 AM, Koenig, Christian wrote:
>> Well you at least have to give me time till after the holidays to get
>> going again :)
>>
>> Not sure exactly jet why we need patch number 5.
>>
>> And we should probably commit patch #1 and #2.
>>
>> Christian.
>>
>> Am 22.04.19 um 13:54 schrieb Grodzovsky, Andrey:
>>> Ping for patches 3, new patch 5 and patch 6.
>>>
>>> Andrey
>>>
>>> On 4/18/19 11:00 AM, Andrey Grodzovsky wrote:
>>>> Also reject TDRs if another one already running.
>>>>
>>>> v2:
>>>> Stop all schedulers across device and entire XGMI hive before
>>>> force signaling HW fences.
>>>> Avoid passing job_signaled to helper functions to keep all the decision
>>>> making about skipping HW reset in one place.
>>>>
>>>> v3:
>>>> Fix SW sched. hang after non HW reset. sched.hw_rq_count has to be balanced
>>>> against it's decrement in drm_sched_stop in non HW reset case.
>>>> v4: rebase
>>>> v5: Revert v3 as we do it now in scheduler code.
>>>>
>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>> ---
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 143 +++++++++++++++++++----------
>>>>      1 file changed, 95 insertions(+), 48 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> index a0e165c..85f8792 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> @@ -3334,8 +3334,6 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>>      		if (!ring || !ring->sched.thread)
>>>>      			continue;
>>>>      
>>>> -		drm_sched_stop(&ring->sched, &job->base);
>>>> -
>>>>      		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>>>>      		amdgpu_fence_driver_force_completion(ring);
>>>>      	}
>>>> @@ -3343,6 +3341,7 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>>      	if(job)
>>>>      		drm_sched_increase_karma(&job->base);
>>>>      
>>>> +	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
>>>>      	if (!amdgpu_sriov_vf(adev)) {
>>>>      
>>>>      		if (!need_full_reset)
>>>> @@ -3480,37 +3479,21 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>>      	return r;
>>>>      }
>>>>      
>>>> -static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
>>>> +static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
>>>>      {
>>>> -	int i;
>>>> -
>>>> -	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>>> -		struct amdgpu_ring *ring = adev->rings[i];
>>>> -
>>>> -		if (!ring || !ring->sched.thread)
>>>> -			continue;
>>>> -
>>>> -		if (!adev->asic_reset_res)
>>>> -			drm_sched_resubmit_jobs(&ring->sched);
>>>> +	if (trylock) {
>>>> +		if (!mutex_trylock(&adev->lock_reset))
>>>> +			return false;
>>>> +	} else
>>>> +		mutex_lock(&adev->lock_reset);
>>>>      
>>>> -		drm_sched_start(&ring->sched, !adev->asic_reset_res);
>>>> -	}
>>>> -
>>>> -	if (!amdgpu_device_has_dc_support(adev)) {
>>>> -		drm_helper_resume_force_mode(adev->ddev);
>>>> -	}
>>>> -
>>>> -	adev->asic_reset_res = 0;
>>>> -}
>>>> -
>>>> -static void amdgpu_device_lock_adev(struct amdgpu_device *adev)
>>>> -{
>>>> -	mutex_lock(&adev->lock_reset);
>>>>      	atomic_inc(&adev->gpu_reset_counter);
>>>>      	adev->in_gpu_reset = 1;
>>>>      	/* Block kfd: SRIOV would do it separately */
>>>>      	if (!amdgpu_sriov_vf(adev))
>>>>                      amdgpu_amdkfd_pre_reset(adev);
>>>> +
>>>> +	return true;
>>>>      }
>>>>      
>>>>      static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>>>> @@ -3538,40 +3521,42 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>>>>      int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>>      			      struct amdgpu_job *job)
>>>>      {
>>>> -	int r;
>>>> +	struct list_head device_list, *device_list_handle =  NULL;
>>>> +	bool need_full_reset, job_signaled;
>>>>      	struct amdgpu_hive_info *hive = NULL;
>>>> -	bool need_full_reset = false;
>>>>      	struct amdgpu_device *tmp_adev = NULL;
>>>> -	struct list_head device_list, *device_list_handle =  NULL;
>>>> +	int i, r = 0;
>>>>      
>>>> +	need_full_reset = job_signaled = false;
>>>>      	INIT_LIST_HEAD(&device_list);
>>>>      
>>>>      	dev_info(adev->dev, "GPU reset begin!\n");
>>>>      
>>>> +	hive = amdgpu_get_xgmi_hive(adev, false);
>>>> +
>>>>      	/*
>>>> -	 * In case of XGMI hive disallow concurrent resets to be triggered
>>>> -	 * by different nodes. No point also since the one node already executing
>>>> -	 * reset will also reset all the other nodes in the hive.
>>>> +	 * Here we trylock to avoid chain of resets executing from
>>>> +	 * either trigger by jobs on different adevs in XGMI hive or jobs on
>>>> +	 * different schedulers for same device while this TO handler is running.
>>>> +	 * We always reset all schedulers for device and all devices for XGMI
>>>> +	 * hive so that should take care of them too.
>>>>      	 */
>>>> -	hive = amdgpu_get_xgmi_hive(adev, 0);
>>>> -	if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
>>>> -	    !mutex_trylock(&hive->reset_lock))
>>>> +
>>>> +	if (hive && !mutex_trylock(&hive->reset_lock)) {
>>>> +		DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
>>>> +			 job->base.id, hive->hive_id);
>>>>      		return 0;
>>>> +	}
>>>>      
>>>>      	/* Start with adev pre asic reset first for soft reset check.*/
>>>> -	amdgpu_device_lock_adev(adev);
>>>> -	r = amdgpu_device_pre_asic_reset(adev,
>>>> -					 job,
>>>> -					 &need_full_reset);
>>>> -	if (r) {
>>>> -		/*TODO Should we stop ?*/
>>>> -		DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
>>>> -			  r, adev->ddev->unique);
>>>> -		adev->asic_reset_res = r;
>>>> +	if (!amdgpu_device_lock_adev(adev, !hive)) {
>>>> +		DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
>>>> +					 job->base.id);
>>>> +		return 0;
>>>>      	}
>>>>      
>>>>      	/* Build list of devices to reset */
>>>> -	if  (need_full_reset && adev->gmc.xgmi.num_physical_nodes > 1) {
>>>> +	if  (adev->gmc.xgmi.num_physical_nodes > 1) {
>>>>      		if (!hive) {
>>>>      			amdgpu_device_unlock_adev(adev);
>>>>      			return -ENODEV;
>>>> @@ -3588,13 +3573,56 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>>      		device_list_handle = &device_list;
>>>>      	}
>>>>      
>>>> +	/* block all schedulers and reset given job's ring */
>>>> +	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>>> +		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>>> +			struct amdgpu_ring *ring = tmp_adev->rings[i];
>>>> +
>>>> +			if (!ring || !ring->sched.thread)
>>>> +				continue;
>>>> +
>>>> +			drm_sched_stop(&ring->sched, &job->base);
>>>> +		}
>>>> +	}
>>>> +
>>>> +
>>>> +	/*
>>>> +	 * Must check guilty signal here since after this point all old
>>>> +	 * HW fences are force signaled.
>>>> +	 *
>>>> +	 * job->base holds a reference to parent fence
>>>> +	 */
>>>> +	if (job && job->base.s_fence->parent &&
>>>> +	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>> +		job_signaled = true;
>>>> +
>>>> +	if (!amdgpu_device_ip_need_full_reset(adev))
>>>> +		device_list_handle = &device_list;
>>>> +
>>>> +	if (job_signaled) {
>>>> +		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
>>>> +		goto skip_hw_reset;
>>>> +	}
>>>> +
>>>> +
>>>> +	/* Guilty job will be freed after this*/
>>>> +	r = amdgpu_device_pre_asic_reset(adev,
>>>> +					 job,
>>>> +					 &need_full_reset);
>>>> +	if (r) {
>>>> +		/*TODO Should we stop ?*/
>>>> +		DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
>>>> +			  r, adev->ddev->unique);
>>>> +		adev->asic_reset_res = r;
>>>> +	}
>>>> +
>>>>      retry:	/* Rest of adevs pre asic reset from XGMI hive. */
>>>>      	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>>>      
>>>>      		if (tmp_adev == adev)
>>>>      			continue;
>>>>      
>>>> -		amdgpu_device_lock_adev(tmp_adev);
>>>> +		amdgpu_device_lock_adev(tmp_adev, false);
>>>>      		r = amdgpu_device_pre_asic_reset(tmp_adev,
>>>>      						 NULL,
>>>>      						 &need_full_reset);
>>>> @@ -3618,9 +3646,28 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>>      			goto retry;
>>>>      	}
>>>>      
>>>> +skip_hw_reset:
>>>> +
>>>>      	/* Post ASIC reset for all devs .*/
>>>>      	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>>> -		amdgpu_device_post_asic_reset(tmp_adev);
>>>> +		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>>> +			struct amdgpu_ring *ring = tmp_adev->rings[i];
>>>> +
>>>> +			if (!ring || !ring->sched.thread)
>>>> +				continue;
>>>> +
>>>> +			/* No point to resubmit jobs if we didn't HW reset*/
>>>> +			if (!tmp_adev->asic_reset_res && !job_signaled)
>>>> +				drm_sched_resubmit_jobs(&ring->sched);
>>>> +
>>>> +			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>>>> +		}
>>>> +
>>>> +		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
>>>> +			drm_helper_resume_force_mode(tmp_adev->ddev);
>>>> +		}
>>>> +
>>>> +		tmp_adev->asic_reset_res = 0;
>>>>      
>>>>      		if (r) {
>>>>      			/* bad news, how to tell it to userspace ? */
>>>> @@ -3633,7 +3680,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>>      		amdgpu_device_unlock_adev(tmp_adev);
>>>>      	}
>>>>      
>>>> -	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
>>>> +	if (hive)
>>>>      		mutex_unlock(&hive->reset_lock);
>>>>      
>>>>      	if (r)
>> _______________________________________________
>> dri-devel mailing list
>> dri-devel@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/dri-devel
>>
Andrey Grodzovsky April 23, 2019, 2:12 p.m. UTC | #6
On 4/23/19 8:32 AM, Koenig, Christian wrote:

> Well you at least have to give me time till after the holidays to get
> going again :)
>
> Not sure exactly jet why we need patch number 5.

Probably you missed the mail where I pointed out a bug I found during 
testing - I am reattaching the mail and the KASAN dump.

Andrey


>
> And we should probably commit patch #1 and #2.
>
> Christian.
>
> Am 22.04.19 um 13:54 schrieb Grodzovsky, Andrey:
>> Ping for patches 3, new patch 5 and patch 6.
>>
>> Andrey
>>
>> On 4/18/19 11:00 AM, Andrey Grodzovsky wrote:
>>> Also reject TDRs if another one already running.
>>>
>>> v2:
>>> Stop all schedulers across device and entire XGMI hive before
>>> force signaling HW fences.
>>> Avoid passing job_signaled to helper functions to keep all the decision
>>> making about skipping HW reset in one place.
>>>
>>> v3:
>>> Fix SW sched. hang after non HW reset. sched.hw_rq_count has to be balanced
>>> against it's decrement in drm_sched_stop in non HW reset case.
>>> v4: rebase
>>> v5: Revert v3 as we do it now in scheduler code.
>>>
>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 143 +++++++++++++++++++----------
>>>     1 file changed, 95 insertions(+), 48 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index a0e165c..85f8792 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -3334,8 +3334,6 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>     		if (!ring || !ring->sched.thread)
>>>     			continue;
>>>     
>>> -		drm_sched_stop(&ring->sched, &job->base);
>>> -
>>>     		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>>>     		amdgpu_fence_driver_force_completion(ring);
>>>     	}
>>> @@ -3343,6 +3341,7 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>     	if(job)
>>>     		drm_sched_increase_karma(&job->base);
>>>     
>>> +	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
>>>     	if (!amdgpu_sriov_vf(adev)) {
>>>     
>>>     		if (!need_full_reset)
>>> @@ -3480,37 +3479,21 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>     	return r;
>>>     }
>>>     
>>> -static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
>>> +static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
>>>     {
>>> -	int i;
>>> -
>>> -	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> -		struct amdgpu_ring *ring = adev->rings[i];
>>> -
>>> -		if (!ring || !ring->sched.thread)
>>> -			continue;
>>> -
>>> -		if (!adev->asic_reset_res)
>>> -			drm_sched_resubmit_jobs(&ring->sched);
>>> +	if (trylock) {
>>> +		if (!mutex_trylock(&adev->lock_reset))
>>> +			return false;
>>> +	} else
>>> +		mutex_lock(&adev->lock_reset);
>>>     
>>> -		drm_sched_start(&ring->sched, !adev->asic_reset_res);
>>> -	}
>>> -
>>> -	if (!amdgpu_device_has_dc_support(adev)) {
>>> -		drm_helper_resume_force_mode(adev->ddev);
>>> -	}
>>> -
>>> -	adev->asic_reset_res = 0;
>>> -}
>>> -
>>> -static void amdgpu_device_lock_adev(struct amdgpu_device *adev)
>>> -{
>>> -	mutex_lock(&adev->lock_reset);
>>>     	atomic_inc(&adev->gpu_reset_counter);
>>>     	adev->in_gpu_reset = 1;
>>>     	/* Block kfd: SRIOV would do it separately */
>>>     	if (!amdgpu_sriov_vf(adev))
>>>                     amdgpu_amdkfd_pre_reset(adev);
>>> +
>>> +	return true;
>>>     }
>>>     
>>>     static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>>> @@ -3538,40 +3521,42 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>>>     int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>     			      struct amdgpu_job *job)
>>>     {
>>> -	int r;
>>> +	struct list_head device_list, *device_list_handle =  NULL;
>>> +	bool need_full_reset, job_signaled;
>>>     	struct amdgpu_hive_info *hive = NULL;
>>> -	bool need_full_reset = false;
>>>     	struct amdgpu_device *tmp_adev = NULL;
>>> -	struct list_head device_list, *device_list_handle =  NULL;
>>> +	int i, r = 0;
>>>     
>>> +	need_full_reset = job_signaled = false;
>>>     	INIT_LIST_HEAD(&device_list);
>>>     
>>>     	dev_info(adev->dev, "GPU reset begin!\n");
>>>     
>>> +	hive = amdgpu_get_xgmi_hive(adev, false);
>>> +
>>>     	/*
>>> -	 * In case of XGMI hive disallow concurrent resets to be triggered
>>> -	 * by different nodes. No point also since the one node already executing
>>> -	 * reset will also reset all the other nodes in the hive.
>>> +	 * Here we trylock to avoid chain of resets executing from
>>> +	 * either trigger by jobs on different adevs in XGMI hive or jobs on
>>> +	 * different schedulers for same device while this TO handler is running.
>>> +	 * We always reset all schedulers for device and all devices for XGMI
>>> +	 * hive so that should take care of them too.
>>>     	 */
>>> -	hive = amdgpu_get_xgmi_hive(adev, 0);
>>> -	if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
>>> -	    !mutex_trylock(&hive->reset_lock))
>>> +
>>> +	if (hive && !mutex_trylock(&hive->reset_lock)) {
>>> +		DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
>>> +			 job->base.id, hive->hive_id);
>>>     		return 0;
>>> +	}
>>>     
>>>     	/* Start with adev pre asic reset first for soft reset check.*/
>>> -	amdgpu_device_lock_adev(adev);
>>> -	r = amdgpu_device_pre_asic_reset(adev,
>>> -					 job,
>>> -					 &need_full_reset);
>>> -	if (r) {
>>> -		/*TODO Should we stop ?*/
>>> -		DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
>>> -			  r, adev->ddev->unique);
>>> -		adev->asic_reset_res = r;
>>> +	if (!amdgpu_device_lock_adev(adev, !hive)) {
>>> +		DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
>>> +					 job->base.id);
>>> +		return 0;
>>>     	}
>>>     
>>>     	/* Build list of devices to reset */
>>> -	if  (need_full_reset && adev->gmc.xgmi.num_physical_nodes > 1) {
>>> +	if  (adev->gmc.xgmi.num_physical_nodes > 1) {
>>>     		if (!hive) {
>>>     			amdgpu_device_unlock_adev(adev);
>>>     			return -ENODEV;
>>> @@ -3588,13 +3573,56 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>     		device_list_handle = &device_list;
>>>     	}
>>>     
>>> +	/* block all schedulers and reset given job's ring */
>>> +	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>> +		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> +			struct amdgpu_ring *ring = tmp_adev->rings[i];
>>> +
>>> +			if (!ring || !ring->sched.thread)
>>> +				continue;
>>> +
>>> +			drm_sched_stop(&ring->sched, &job->base);
>>> +		}
>>> +	}
>>> +
>>> +
>>> +	/*
>>> +	 * Must check guilty signal here since after this point all old
>>> +	 * HW fences are force signaled.
>>> +	 *
>>> +	 * job->base holds a reference to parent fence
>>> +	 */
>>> +	if (job && job->base.s_fence->parent &&
>>> +	    dma_fence_is_signaled(job->base.s_fence->parent))
>>> +		job_signaled = true;
>>> +
>>> +	if (!amdgpu_device_ip_need_full_reset(adev))
>>> +		device_list_handle = &device_list;
>>> +
>>> +	if (job_signaled) {
>>> +		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
>>> +		goto skip_hw_reset;
>>> +	}
>>> +
>>> +
>>> +	/* Guilty job will be freed after this*/
>>> +	r = amdgpu_device_pre_asic_reset(adev,
>>> +					 job,
>>> +					 &need_full_reset);
>>> +	if (r) {
>>> +		/*TODO Should we stop ?*/
>>> +		DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
>>> +			  r, adev->ddev->unique);
>>> +		adev->asic_reset_res = r;
>>> +	}
>>> +
>>>     retry:	/* Rest of adevs pre asic reset from XGMI hive. */
>>>     	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>>     
>>>     		if (tmp_adev == adev)
>>>     			continue;
>>>     
>>> -		amdgpu_device_lock_adev(tmp_adev);
>>> +		amdgpu_device_lock_adev(tmp_adev, false);
>>>     		r = amdgpu_device_pre_asic_reset(tmp_adev,
>>>     						 NULL,
>>>     						 &need_full_reset);
>>> @@ -3618,9 +3646,28 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>     			goto retry;
>>>     	}
>>>     
>>> +skip_hw_reset:
>>> +
>>>     	/* Post ASIC reset for all devs .*/
>>>     	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>> -		amdgpu_device_post_asic_reset(tmp_adev);
>>> +		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> +			struct amdgpu_ring *ring = tmp_adev->rings[i];
>>> +
>>> +			if (!ring || !ring->sched.thread)
>>> +				continue;
>>> +
>>> +			/* No point to resubmit jobs if we didn't HW reset*/
>>> +			if (!tmp_adev->asic_reset_res && !job_signaled)
>>> +				drm_sched_resubmit_jobs(&ring->sched);
>>> +
>>> +			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>>> +		}
>>> +
>>> +		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
>>> +			drm_helper_resume_force_mode(tmp_adev->ddev);
>>> +		}
>>> +
>>> +		tmp_adev->asic_reset_res = 0;
>>>     
>>>     		if (r) {
>>>     			/* bad news, how to tell it to userspace ? */
>>> @@ -3633,7 +3680,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>     		amdgpu_device_unlock_adev(tmp_adev);
>>>     	}
>>>     
>>> -	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
>>> +	if (hive)
>>>     		mutex_unlock(&hive->reset_lock);
>>>     
>>>     	if (r)
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
121.189757 <    0.000171>] amdgpu 0000:01:00.0: GPU reset(5) succeeded!
passed[  121.189894 <    0.000137>] ==================================================================


[  121.189951 <    0.000057>] BUG: KASAN: use-after-free in drm_sched_job_timedout+0x7a/0xf0 [gpu_sched]
Run Summary:    Type  Total    Ran Passed Failed Inactive
              suites      8      0    n/a      0        0
               tests     39      1      1      0        0
             asserts      8      8      8      0      n/a

Elapsed time =    0.001 seconds[  121.189956 <    0.000005>] Read of size 8 at addr ffff88840389a8b0 by task kworker/2:2/1140


[  121.189969 <    0.000013>] CPU: 2 PID: 1140 Comm: kworker/2:2 Tainted: G           OE     5.1.0-rc2-misc+ #1
[  121.189972 <    0.000003>] Hardware name: System manufacturer System Product Name/Z170-PRO, BIOS 1902 06/27/2016
[  121.189977 <    0.000005>] Workqueue: events drm_sched_job_timedout [gpu_sched]
[  121.189980 <    0.000003>] Call Trace:
[  121.189985 <    0.000005>]  dump_stack+0x9b/0xf5
[  121.189992 <    0.000007>]  print_address_description+0x70/0x290
[  121.189997 <    0.000005>]  ? drm_sched_job_timedout+0x7a/0xf0 [gpu_sched]
[  121.190002 <    0.000005>]  kasan_report+0x134/0x191
[  121.190006 <    0.000004>]  ? drm_sched_job_timedout+0x7a/0xf0 [gpu_sched]
[  121.190014 <    0.000008>]  ? drm_sched_job_timedout+0x7a/0xf0 [gpu_sched]
[  121.190019 <    0.000005>]  __asan_load8+0x54/0x90
[  121.190024 <    0.000005>]  drm_sched_job_timedout+0x7a/0xf0 [gpu_sched]
[  121.190034 <    0.000010>]  process_one_work+0x466/0xb00
[  121.190046 <    0.000012>]  ? queue_work_node+0x180/0x180
[  121.190061 <    0.000015>]  worker_thread+0x83/0x6c0
[  121.190075 <    0.000014>]  kthread+0x1a9/0x1f0
[  121.190079 <    0.000004>]  ? rescuer_thread+0x760/0x760
[  121.190081 <    0.000002>]  ? kthread_cancel_delayed_work_sync+0x20/0x20
[  121.190088 <    0.000007>]  ret_from_fork+0x3a/0x50

[  121.190105 <    0.000017>] Allocated by task 1421:
[  121.190110 <    0.000005>]  save_stack+0x46/0xd0
[  121.190112 <    0.000002>]  __kasan_kmalloc+0xab/0xe0
[  121.190115 <    0.000003>]  kasan_kmalloc+0xf/0x20
[  121.190117 <    0.000002>]  __kmalloc+0x167/0x390
[  121.190210 <    0.000093>]  amdgpu_job_alloc+0x47/0x170 [amdgpu]
[  121.190289 <    0.000079>]  amdgpu_cs_ioctl+0x9bd/0x2e70 [amdgpu]
[  121.190312 <    0.000023>]  drm_ioctl_kernel+0x17e/0x1d0 [drm]
[  121.190334 <    0.000022>]  drm_ioctl+0x5e1/0x640 [drm]
[  121.190409 <    0.000075>]  amdgpu_drm_ioctl+0x78/0xd0 [amdgpu]
[  121.190413 <    0.000004>]  do_vfs_ioctl+0x152/0xa30
[  121.190415 <    0.000002>]  ksys_ioctl+0x6d/0x80
[  121.190418 <    0.000003>]  __x64_sys_ioctl+0x43/0x50
[  121.190425 <    0.000007>]  do_syscall_64+0x7d/0x240
[  121.190430 <    0.000005>]  entry_SYSCALL_64_after_hwframe+0x49/0xbe

[  121.190440 <    0.000010>] Freed by task 1242:
[  121.190448 <    0.000008>]  save_stack+0x46/0xd0
[  121.190453 <    0.000005>]  __kasan_slab_free+0x13c/0x1a0
[  121.190458 <    0.000005>]  kasan_slab_free+0xe/0x10
[  121.190462 <    0.000004>]  kfree+0xfa/0x2e0
[  121.190584 <    0.000122>]  amdgpu_job_free_cb+0x7f/0x90 [amdgpu]
[  121.190589 <    0.000005>]  drm_sched_cleanup_jobs.part.10+0xcf/0x1a0 [gpu_sched]
[  121.190594 <    0.000005>]  drm_sched_main+0x38a/0x430 [gpu_sched]
[  121.190596 <    0.000002>]  kthread+0x1a9/0x1f0
[  121.190599 <    0.000003>]  ret_from_fork+0x3a/0x50
On 4/16/19 12:00 PM, Koenig, Christian wrote:
> Am 16.04.19 um 17:42 schrieb Grodzovsky, Andrey:
>> On 4/16/19 10:58 AM, Grodzovsky, Andrey wrote:
>>> On 4/16/19 10:43 AM, Koenig, Christian wrote:
>>>> Am 16.04.19 um 16:36 schrieb Grodzovsky, Andrey:
>>>>> On 4/16/19 5:47 AM, Christian König wrote:
>>>>>> Am 15.04.19 um 23:17 schrieb Eric Anholt:
>>>>>>> Andrey Grodzovsky <andrey.grodzovsky@amd.com> writes:
>>>>>>>
>>>>>>>> From: Christian König <christian.koenig@amd.com>
>>>>>>>>
>>>>>>>> We now destroy finished jobs from the worker thread to make sure that
>>>>>>>> we never destroy a job currently in timeout processing.
>>>>>>>> By this we avoid holding lock around ring mirror list in drm_sched_stop
>>>>>>>> which should solve a deadlock reported by a user.
>>>>>>>>
>>>>>>>> v2: Remove unused variable.
>>>>>>>>
>>>>>>>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109692
>>>>>>>>
>>>>>>>> Signed-off-by: Christian König <christian.koenig@amd.com>
>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>>>> ---
>>>>>>>>        drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  17 ++--
>>>>>>>>        drivers/gpu/drm/etnaviv/etnaviv_dump.c     |   4 -
>>>>>>>>        drivers/gpu/drm/etnaviv/etnaviv_sched.c    |   9 +-
>>>>>>>>        drivers/gpu/drm/scheduler/sched_main.c     | 138
>>>>>>>> +++++++++++++++++------------
>>>>>>>>        drivers/gpu/drm/v3d/v3d_sched.c            |   9 +-
>>>>>>> Missing corresponding panfrost and lima updates.  You should probably
>>>>>>> pull in drm-misc for hacking on the scheduler.
>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/v3d/v3d_sched.c
>>>>>>>> b/drivers/gpu/drm/v3d/v3d_sched.c
>>>>>>>> index ce7c737b..8efb091 100644
>>>>>>>> --- a/drivers/gpu/drm/v3d/v3d_sched.c
>>>>>>>> +++ b/drivers/gpu/drm/v3d/v3d_sched.c
>>>>>>>> @@ -232,11 +232,18 @@ v3d_gpu_reset_for_timeout(struct v3d_dev *v3d,
>>>>>>>> struct drm_sched_job *sched_job)
>>>>>>>>              /* block scheduler */
>>>>>>>>            for (q = 0; q < V3D_MAX_QUEUES; q++)
>>>>>>>> -        drm_sched_stop(&v3d->queue[q].sched);
>>>>>>>> +        drm_sched_stop(&v3d->queue[q].sched, sched_job);
>>>>>>>>              if(sched_job)
>>>>>>>>                drm_sched_increase_karma(sched_job);
>>>>>>>>        +    /*
>>>>>>>> +     * Guilty job did complete and hence needs to be manually removed
>>>>>>>> +     * See drm_sched_stop doc.
>>>>>>>> +     */
>>>>>>>> +    if (list_empty(&sched_job->node))
>>>>>>>> +        sched_job->sched->ops->free_job(sched_job);
>>>>>>> If the if (sched_job) is necessary up above, then this should clearly be
>>>>>>> under it.
>>>>>>>
>>>>>>> But, can we please have a core scheduler thing we call here instead of
>>>>>>> drivers all replicating it?
>>>>>> Yeah that's also something I noted before.
>>>>>>
>>>>>> Essential problem is that we remove finished jobs from the mirror list
>>>>>> and so need to destruct them because we otherwise leak them.
>>>>>>
>>>>>> Alternative approach here would be to keep the jobs on the ring mirror
>>>>>> list, but not submit them again.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>> I really prefer to avoid this, it means adding extra flag to sched_job
>>>>> to check in each iteration of the ring mirror list.
>>>> Mhm, why actually? We just need to check if the scheduler fence is signaled.
>>> OK, i see it's equivalent but this still en extra check for all the
>>> iterations.
>>>
>>>>> What about changing
>>>>> signature of drm_sched_backend_ops.timedout_job to return drm_sched_job*
>>>>> instead of void, this way we can return the guilty job back from the
>>>>> driver specific handler to the generic drm_sched_job_timedout and
>>>>> release it there.
>>>> Well the timeout handler already has the job, so returning it doesn't
>>>> make much sense.
>>>>
>>>> The problem is rather that the timeout handler doesn't know if it should
>>>> destroy the job or not.
>>> But the driver specific handler does, and actually returning back either
>>> the pointer to the job or null will give an indication of that. We can
>>> even return bool.
>>>
>>> Andrey
>> Thinking a bit more about this - the way this check is done now "if
>> (list_empty(&sched_job->node)) then free the sched_job" actually makes
>> it possible to just move this as is from driver specific callbacks into
>> drm_sched_job_timeout without any other changes.
> Oh, well that sounds like a good idea off hand.
>
> Need to see the final code, but at least the best idea so far.
>
> Christian.

Unfortunately, it looks like it's not such a good idea after all — take a look 
at the attached KASAN print. The scheduler thread's cleanup function races 
against the TDR handler and removes the guilty job from the mirror list, and we 
have no way of differentiating whether the job was removed from within the 
TDR handler or from the sched. thread's clean-up function. So it looks like 
we either need to 'keep the jobs on the ring mirror list, but not submit 
them again' as you suggested before, or add a flag to sched_job to hint 
to drm_sched_job_timedout that the guilty job requires manual removal. Your 
suggestion implies we will need an extra check in almost every place of 
traversal of the ring mirror list to avoid handling signaled jobs, while mine 
requires an extra flag in the sched_job struct. I feel that keeping completed 
jobs in the mirror list when they actually don't belong there any more 
is confusing and an opening for future bugs.

Andrey

>
>> Andrey
>>
>>>> Christian.
>>>>
>>>>> Andrey
>>>>>
>>>>>>>> +
>>>>>>>>            /* get the GPU back into the init state */
>>>>>>>>            v3d_reset(v3d);
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
121.189757 <    0.000171>] amdgpu 0000:01:00.0: GPU reset(5) succeeded!
passed[  121.189894 <    0.000137>] ==================================================================


[  121.189951 <    0.000057>] BUG: KASAN: use-after-free in drm_sched_job_timedout+0x7a/0xf0 [gpu_sched]
Run Summary:    Type  Total    Ran Passed Failed Inactive
              suites      8      0    n/a      0        0
               tests     39      1      1      0        0
             asserts      8      8      8      0      n/a

Elapsed time =    0.001 seconds[  121.189956 <    0.000005>] Read of size 8 at addr ffff88840389a8b0 by task kworker/2:2/1140


[  121.189969 <    0.000013>] CPU: 2 PID: 1140 Comm: kworker/2:2 Tainted: G           OE     5.1.0-rc2-misc+ #1
[  121.189972 <    0.000003>] Hardware name: System manufacturer System Product Name/Z170-PRO, BIOS 1902 06/27/2016
[  121.189977 <    0.000005>] Workqueue: events drm_sched_job_timedout [gpu_sched]
[  121.189980 <    0.000003>] Call Trace:
[  121.189985 <    0.000005>]  dump_stack+0x9b/0xf5
[  121.189992 <    0.000007>]  print_address_description+0x70/0x290
[  121.189997 <    0.000005>]  ? drm_sched_job_timedout+0x7a/0xf0 [gpu_sched]
[  121.190002 <    0.000005>]  kasan_report+0x134/0x191
[  121.190006 <    0.000004>]  ? drm_sched_job_timedout+0x7a/0xf0 [gpu_sched]
[  121.190014 <    0.000008>]  ? drm_sched_job_timedout+0x7a/0xf0 [gpu_sched]
[  121.190019 <    0.000005>]  __asan_load8+0x54/0x90
[  121.190024 <    0.000005>]  drm_sched_job_timedout+0x7a/0xf0 [gpu_sched]
[  121.190034 <    0.000010>]  process_one_work+0x466/0xb00
[  121.190046 <    0.000012>]  ? queue_work_node+0x180/0x180
[  121.190061 <    0.000015>]  worker_thread+0x83/0x6c0
[  121.190075 <    0.000014>]  kthread+0x1a9/0x1f0
[  121.190079 <    0.000004>]  ? rescuer_thread+0x760/0x760
[  121.190081 <    0.000002>]  ? kthread_cancel_delayed_work_sync+0x20/0x20
[  121.190088 <    0.000007>]  ret_from_fork+0x3a/0x50

[  121.190105 <    0.000017>] Allocated by task 1421:
[  121.190110 <    0.000005>]  save_stack+0x46/0xd0
[  121.190112 <    0.000002>]  __kasan_kmalloc+0xab/0xe0
[  121.190115 <    0.000003>]  kasan_kmalloc+0xf/0x20
[  121.190117 <    0.000002>]  __kmalloc+0x167/0x390
[  121.190210 <    0.000093>]  amdgpu_job_alloc+0x47/0x170 [amdgpu]
[  121.190289 <    0.000079>]  amdgpu_cs_ioctl+0x9bd/0x2e70 [amdgpu]
[  121.190312 <    0.000023>]  drm_ioctl_kernel+0x17e/0x1d0 [drm]
[  121.190334 <    0.000022>]  drm_ioctl+0x5e1/0x640 [drm]
[  121.190409 <    0.000075>]  amdgpu_drm_ioctl+0x78/0xd0 [amdgpu]
[  121.190413 <    0.000004>]  do_vfs_ioctl+0x152/0xa30
[  121.190415 <    0.000002>]  ksys_ioctl+0x6d/0x80
[  121.190418 <    0.000003>]  __x64_sys_ioctl+0x43/0x50
[  121.190425 <    0.000007>]  do_syscall_64+0x7d/0x240
[  121.190430 <    0.000005>]  entry_SYSCALL_64_after_hwframe+0x49/0xbe

[  121.190440 <    0.000010>] Freed by task 1242:
[  121.190448 <    0.000008>]  save_stack+0x46/0xd0
[  121.190453 <    0.000005>]  __kasan_slab_free+0x13c/0x1a0
[  121.190458 <    0.000005>]  kasan_slab_free+0xe/0x10
[  121.190462 <    0.000004>]  kfree+0xfa/0x2e0
[  121.190584 <    0.000122>]  amdgpu_job_free_cb+0x7f/0x90 [amdgpu]
[  121.190589 <    0.000005>]  drm_sched_cleanup_jobs.part.10+0xcf/0x1a0 [gpu_sched]
[  121.190594 <    0.000005>]  drm_sched_main+0x38a/0x430 [gpu_sched]
[  121.190596 <    0.000002>]  kthread+0x1a9/0x1f0
[  121.190599 <    0.000003>]  ret_from_fork+0x3a/0x50
Christian König April 23, 2019, 2:49 p.m. UTC | #7
Am 23.04.19 um 16:12 schrieb Grodzovsky, Andrey:
> On 4/23/19 8:32 AM, Koenig, Christian wrote:
>
>> Well you at least have to give me time till after the holidays to get
>> going again :)
>>
>> Not sure exactly jet why we need patch number 5.
> Probably you missed the mail where I pointed out a bug I found during
> testing - I am  reattaching the mail and the KASAN dump.

Ah, so the job is actually resubmitted and we race with finishing and 
destroying it.

Well that is a really ugly problem we have here, but your solution 
should work.
Christian.

>
> Andrey
>
>
>> And we should probably commit patch #1 and #2.
>>
>> Christian.
>>
>> Am 22.04.19 um 13:54 schrieb Grodzovsky, Andrey:
>>> Ping for patches 3, new patch 5 and patch 6.
>>>
>>> Andrey
>>>
>>> On 4/18/19 11:00 AM, Andrey Grodzovsky wrote:
>>>> Also reject TDRs if another one already running.
>>>>
>>>> v2:
>>>> Stop all schedulers across device and entire XGMI hive before
>>>> force signaling HW fences.
>>>> Avoid passing job_signaled to helper fnctions to keep all the decision
>>>> making about skipping HW reset in one place.
>>>>
>>>> v3:
>>>> Fix SW sched. hang after non HW reset. sched.hw_rq_count has to be balanced
>>>> against it's decrement in drm_sched_stop in non HW reset case.
>>>> v4: rebase
>>>> v5: Revert v3 as we do it now in sceduler code.
>>>>
>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>> ---
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 143 +++++++++++++++++++----------
>>>>      1 file changed, 95 insertions(+), 48 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> index a0e165c..85f8792 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> @@ -3334,8 +3334,6 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>>      		if (!ring || !ring->sched.thread)
>>>>      			continue;
>>>>      
>>>> -		drm_sched_stop(&ring->sched, &job->base);
>>>> -
>>>>      		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>>>>      		amdgpu_fence_driver_force_completion(ring);
>>>>      	}
>>>> @@ -3343,6 +3341,7 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>>      	if(job)
>>>>      		drm_sched_increase_karma(&job->base);
>>>>      
>>>> +	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
>>>>      	if (!amdgpu_sriov_vf(adev)) {
>>>>      
>>>>      		if (!need_full_reset)
>>>> @@ -3480,37 +3479,21 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>>      	return r;
>>>>      }
>>>>      
>>>> -static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
>>>> +static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
>>>>      {
>>>> -	int i;
>>>> -
>>>> -	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>>> -		struct amdgpu_ring *ring = adev->rings[i];
>>>> -
>>>> -		if (!ring || !ring->sched.thread)
>>>> -			continue;
>>>> -
>>>> -		if (!adev->asic_reset_res)
>>>> -			drm_sched_resubmit_jobs(&ring->sched);
>>>> +	if (trylock) {
>>>> +		if (!mutex_trylock(&adev->lock_reset))
>>>> +			return false;
>>>> +	} else
>>>> +		mutex_lock(&adev->lock_reset);
>>>>      
>>>> -		drm_sched_start(&ring->sched, !adev->asic_reset_res);
>>>> -	}
>>>> -
>>>> -	if (!amdgpu_device_has_dc_support(adev)) {
>>>> -		drm_helper_resume_force_mode(adev->ddev);
>>>> -	}
>>>> -
>>>> -	adev->asic_reset_res = 0;
>>>> -}
>>>> -
>>>> -static void amdgpu_device_lock_adev(struct amdgpu_device *adev)
>>>> -{
>>>> -	mutex_lock(&adev->lock_reset);
>>>>      	atomic_inc(&adev->gpu_reset_counter);
>>>>      	adev->in_gpu_reset = 1;
>>>>      	/* Block kfd: SRIOV would do it separately */
>>>>      	if (!amdgpu_sriov_vf(adev))
>>>>                      amdgpu_amdkfd_pre_reset(adev);
>>>> +
>>>> +	return true;
>>>>      }
>>>>      
>>>>      static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>>>> @@ -3538,40 +3521,42 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>>>>      int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>>      			      struct amdgpu_job *job)
>>>>      {
>>>> -	int r;
>>>> +	struct list_head device_list, *device_list_handle =  NULL;
>>>> +	bool need_full_reset, job_signaled;
>>>>      	struct amdgpu_hive_info *hive = NULL;
>>>> -	bool need_full_reset = false;
>>>>      	struct amdgpu_device *tmp_adev = NULL;
>>>> -	struct list_head device_list, *device_list_handle =  NULL;
>>>> +	int i, r = 0;
>>>>      
>>>> +	need_full_reset = job_signaled = false;
>>>>      	INIT_LIST_HEAD(&device_list);
>>>>      
>>>>      	dev_info(adev->dev, "GPU reset begin!\n");
>>>>      
>>>> +	hive = amdgpu_get_xgmi_hive(adev, false);
>>>> +
>>>>      	/*
>>>> -	 * In case of XGMI hive disallow concurrent resets to be triggered
>>>> -	 * by different nodes. No point also since the one node already executing
>>>> -	 * reset will also reset all the other nodes in the hive.
>>>> +	 * Here we trylock to avoid chain of resets executing from
>>>> +	 * either trigger by jobs on different adevs in XGMI hive or jobs on
>>>> +	 * different schedulers for same device while this TO handler is running.
>>>> +	 * We always reset all schedulers for device and all devices for XGMI
>>>> +	 * hive so that should take care of them too.
>>>>      	 */
>>>> -	hive = amdgpu_get_xgmi_hive(adev, 0);
>>>> -	if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
>>>> -	    !mutex_trylock(&hive->reset_lock))
>>>> +
>>>> +	if (hive && !mutex_trylock(&hive->reset_lock)) {
>>>> +		DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
>>>> +			 job->base.id, hive->hive_id);
>>>>      		return 0;
>>>> +	}
>>>>      
>>>>      	/* Start with adev pre asic reset first for soft reset check.*/
>>>> -	amdgpu_device_lock_adev(adev);
>>>> -	r = amdgpu_device_pre_asic_reset(adev,
>>>> -					 job,
>>>> -					 &need_full_reset);
>>>> -	if (r) {
>>>> -		/*TODO Should we stop ?*/
>>>> -		DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
>>>> -			  r, adev->ddev->unique);
>>>> -		adev->asic_reset_res = r;
>>>> +	if (!amdgpu_device_lock_adev(adev, !hive)) {
>>>> +		DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
>>>> +					 job->base.id);
>>>> +		return 0;
>>>>      	}
>>>>      
>>>>      	/* Build list of devices to reset */
>>>> -	if  (need_full_reset && adev->gmc.xgmi.num_physical_nodes > 1) {
>>>> +	if  (adev->gmc.xgmi.num_physical_nodes > 1) {
>>>>      		if (!hive) {
>>>>      			amdgpu_device_unlock_adev(adev);
>>>>      			return -ENODEV;
>>>> @@ -3588,13 +3573,56 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>>      		device_list_handle = &device_list;
>>>>      	}
>>>>      
>>>> +	/* block all schedulers and reset given job's ring */
>>>> +	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>>> +		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>>> +			struct amdgpu_ring *ring = tmp_adev->rings[i];
>>>> +
>>>> +			if (!ring || !ring->sched.thread)
>>>> +				continue;
>>>> +
>>>> +			drm_sched_stop(&ring->sched, &job->base);
>>>> +		}
>>>> +	}
>>>> +
>>>> +
>>>> +	/*
>>>> +	 * Must check guilty signal here since after this point all old
>>>> +	 * HW fences are force signaled.
>>>> +	 *
>>>> +	 * job->base holds a reference to parent fence
>>>> +	 */
>>>> +	if (job && job->base.s_fence->parent &&
>>>> +	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>> +		job_signaled = true;
>>>> +
>>>> +	if (!amdgpu_device_ip_need_full_reset(adev))
>>>> +		device_list_handle = &device_list;
>>>> +
>>>> +	if (job_signaled) {
>>>> +		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
>>>> +		goto skip_hw_reset;
>>>> +	}
>>>> +
>>>> +
>>>> +	/* Guilty job will be freed after this*/
>>>> +	r = amdgpu_device_pre_asic_reset(adev,
>>>> +					 job,
>>>> +					 &need_full_reset);
>>>> +	if (r) {
>>>> +		/*TODO Should we stop ?*/
>>>> +		DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
>>>> +			  r, adev->ddev->unique);
>>>> +		adev->asic_reset_res = r;
>>>> +	}
>>>> +
>>>>      retry:	/* Rest of adevs pre asic reset from XGMI hive. */
>>>>      	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>>>      
>>>>      		if (tmp_adev == adev)
>>>>      			continue;
>>>>      
>>>> -		amdgpu_device_lock_adev(tmp_adev);
>>>> +		amdgpu_device_lock_adev(tmp_adev, false);
>>>>      		r = amdgpu_device_pre_asic_reset(tmp_adev,
>>>>      						 NULL,
>>>>      						 &need_full_reset);
>>>> @@ -3618,9 +3646,28 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>>      			goto retry;
>>>>      	}
>>>>      
>>>> +skip_hw_reset:
>>>> +
>>>>      	/* Post ASIC reset for all devs .*/
>>>>      	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>>> -		amdgpu_device_post_asic_reset(tmp_adev);
>>>> +		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>>> +			struct amdgpu_ring *ring = tmp_adev->rings[i];
>>>> +
>>>> +			if (!ring || !ring->sched.thread)
>>>> +				continue;
>>>> +
>>>> +			/* No point to resubmit jobs if we didn't HW reset*/
>>>> +			if (!tmp_adev->asic_reset_res && !job_signaled)
>>>> +				drm_sched_resubmit_jobs(&ring->sched);
>>>> +
>>>> +			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>>>> +		}
>>>> +
>>>> +		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
>>>> +			drm_helper_resume_force_mode(tmp_adev->ddev);
>>>> +		}
>>>> +
>>>> +		tmp_adev->asic_reset_res = 0;
>>>>      
>>>>      		if (r) {
>>>>      			/* bad news, how to tell it to userspace ? */
>>>> @@ -3633,7 +3680,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>>      		amdgpu_device_unlock_adev(tmp_adev);
>>>>      	}
>>>>      
>>>> -	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
>>>> +	if (hive)
>>>>      		mutex_unlock(&hive->reset_lock);
>>>>      
>>>>      	if (r)
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Andrey Grodzovsky April 23, 2019, 2:51 p.m. UTC | #8
On 4/22/19 9:09 AM, Zhou, David(ChunMing) wrote:
> +Monk.
>
> GPU reset is used widely in SRIOV, so need virtulizatino guy take a look.
>
> But out of curious, why guilty job can signal more if the job is already
> set to guilty? set it wrongly?
>
>
> -David


It's possible that the job does complete at a later time than when its 
timeout handler started processing, so in this patch we try to protect 
against this by rechecking the HW fence after stopping all SW 
schedulers. We do it BEFORE marking guilty on the job's sched_entity, so 
at the point we check, the guilty flag is not set yet.

Andrey


>
> 在 2019/4/18 23:00, Andrey Grodzovsky 写道:
>> Also reject TDRs if another one already running.
>>
>> v2:
>> Stop all schedulers across device and entire XGMI hive before
>> force signaling HW fences.
>> Avoid passing job_signaled to helper fnctions to keep all the decision
>> making about skipping HW reset in one place.
>>
>> v3:
>> Fix SW sched. hang after non HW reset. sched.hw_rq_count has to be balanced
>> against it's decrement in drm_sched_stop in non HW reset case.
>> v4: rebase
>> v5: Revert v3 as we do it now in sceduler code.
>>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 143 +++++++++++++++++++----------
>>    1 file changed, 95 insertions(+), 48 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index a0e165c..85f8792 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3334,8 +3334,6 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>    		if (!ring || !ring->sched.thread)
>>    			continue;
>>    
>> -		drm_sched_stop(&ring->sched, &job->base);
>> -
>>    		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>>    		amdgpu_fence_driver_force_completion(ring);
>>    	}
>> @@ -3343,6 +3341,7 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>    	if(job)
>>    		drm_sched_increase_karma(&job->base);
>>    
>> +	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
>>    	if (!amdgpu_sriov_vf(adev)) {
>>    
>>    		if (!need_full_reset)
>> @@ -3480,37 +3479,21 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>    	return r;
>>    }
>>    
>> -static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
>> +static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
>>    {
>> -	int i;
>> -
>> -	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> -		struct amdgpu_ring *ring = adev->rings[i];
>> -
>> -		if (!ring || !ring->sched.thread)
>> -			continue;
>> -
>> -		if (!adev->asic_reset_res)
>> -			drm_sched_resubmit_jobs(&ring->sched);
>> +	if (trylock) {
>> +		if (!mutex_trylock(&adev->lock_reset))
>> +			return false;
>> +	} else
>> +		mutex_lock(&adev->lock_reset);
>>    
>> -		drm_sched_start(&ring->sched, !adev->asic_reset_res);
>> -	}
>> -
>> -	if (!amdgpu_device_has_dc_support(adev)) {
>> -		drm_helper_resume_force_mode(adev->ddev);
>> -	}
>> -
>> -	adev->asic_reset_res = 0;
>> -}
>> -
>> -static void amdgpu_device_lock_adev(struct amdgpu_device *adev)
>> -{
>> -	mutex_lock(&adev->lock_reset);
>>    	atomic_inc(&adev->gpu_reset_counter);
>>    	adev->in_gpu_reset = 1;
>>    	/* Block kfd: SRIOV would do it separately */
>>    	if (!amdgpu_sriov_vf(adev))
>>                    amdgpu_amdkfd_pre_reset(adev);
>> +
>> +	return true;
>>    }
>>    
>>    static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>> @@ -3538,40 +3521,42 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>>    int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>    			      struct amdgpu_job *job)
>>    {
>> -	int r;
>> +	struct list_head device_list, *device_list_handle =  NULL;
>> +	bool need_full_reset, job_signaled;
>>    	struct amdgpu_hive_info *hive = NULL;
>> -	bool need_full_reset = false;
>>    	struct amdgpu_device *tmp_adev = NULL;
>> -	struct list_head device_list, *device_list_handle =  NULL;
>> +	int i, r = 0;
>>    
>> +	need_full_reset = job_signaled = false;
>>    	INIT_LIST_HEAD(&device_list);
>>    
>>    	dev_info(adev->dev, "GPU reset begin!\n");
>>    
>> +	hive = amdgpu_get_xgmi_hive(adev, false);
>> +
>>    	/*
>> -	 * In case of XGMI hive disallow concurrent resets to be triggered
>> -	 * by different nodes. No point also since the one node already executing
>> -	 * reset will also reset all the other nodes in the hive.
>> +	 * Here we trylock to avoid chain of resets executing from
>> +	 * either trigger by jobs on different adevs in XGMI hive or jobs on
>> +	 * different schedulers for same device while this TO handler is running.
>> +	 * We always reset all schedulers for device and all devices for XGMI
>> +	 * hive so that should take care of them too.
>>    	 */
>> -	hive = amdgpu_get_xgmi_hive(adev, 0);
>> -	if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
>> -	    !mutex_trylock(&hive->reset_lock))
>> +
>> +	if (hive && !mutex_trylock(&hive->reset_lock)) {
>> +		DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
>> +			 job->base.id, hive->hive_id);
>>    		return 0;
>> +	}
>>    
>>    	/* Start with adev pre asic reset first for soft reset check.*/
>> -	amdgpu_device_lock_adev(adev);
>> -	r = amdgpu_device_pre_asic_reset(adev,
>> -					 job,
>> -					 &need_full_reset);
>> -	if (r) {
>> -		/*TODO Should we stop ?*/
>> -		DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
>> -			  r, adev->ddev->unique);
>> -		adev->asic_reset_res = r;
>> +	if (!amdgpu_device_lock_adev(adev, !hive)) {
>> +		DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
>> +					 job->base.id);
>> +		return 0;
>>    	}
>>    
>>    	/* Build list of devices to reset */
>> -	if  (need_full_reset && adev->gmc.xgmi.num_physical_nodes > 1) {
>> +	if  (adev->gmc.xgmi.num_physical_nodes > 1) {
>>    		if (!hive) {
>>    			amdgpu_device_unlock_adev(adev);
>>    			return -ENODEV;
>> @@ -3588,13 +3573,56 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>    		device_list_handle = &device_list;
>>    	}
>>    
>> +	/* block all schedulers and reset given job's ring */
>> +	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>> +		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> +			struct amdgpu_ring *ring = tmp_adev->rings[i];
>> +
>> +			if (!ring || !ring->sched.thread)
>> +				continue;
>> +
>> +			drm_sched_stop(&ring->sched, &job->base);
>> +		}
>> +	}
>> +
>> +
>> +	/*
>> +	 * Must check guilty signal here since after this point all old
>> +	 * HW fences are force signaled.
>> +	 *
>> +	 * job->base holds a reference to parent fence
>> +	 */
>> +	if (job && job->base.s_fence->parent &&
>> +	    dma_fence_is_signaled(job->base.s_fence->parent))
>> +		job_signaled = true;
>> +
>> +	if (!amdgpu_device_ip_need_full_reset(adev))
>> +		device_list_handle = &device_list;
>> +
>> +	if (job_signaled) {
>> +		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
>> +		goto skip_hw_reset;
>> +	}
>> +
>> +
>> +	/* Guilty job will be freed after this*/
>> +	r = amdgpu_device_pre_asic_reset(adev,
>> +					 job,
>> +					 &need_full_reset);
>> +	if (r) {
>> +		/*TODO Should we stop ?*/
>> +		DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
>> +			  r, adev->ddev->unique);
>> +		adev->asic_reset_res = r;
>> +	}
>> +
>>    retry:	/* Rest of adevs pre asic reset from XGMI hive. */
>>    	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>    
>>    		if (tmp_adev == adev)
>>    			continue;
>>    
>> -		amdgpu_device_lock_adev(tmp_adev);
>> +		amdgpu_device_lock_adev(tmp_adev, false);
>>    		r = amdgpu_device_pre_asic_reset(tmp_adev,
>>    						 NULL,
>>    						 &need_full_reset);
>> @@ -3618,9 +3646,28 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>    			goto retry;
>>    	}
>>    
>> +skip_hw_reset:
>> +
>>    	/* Post ASIC reset for all devs .*/
>>    	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>> -		amdgpu_device_post_asic_reset(tmp_adev);
>> +		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> +			struct amdgpu_ring *ring = tmp_adev->rings[i];
>> +
>> +			if (!ring || !ring->sched.thread)
>> +				continue;
>> +
>> +			/* No point to resubmit jobs if we didn't HW reset*/
>> +			if (!tmp_adev->asic_reset_res && !job_signaled)
>> +				drm_sched_resubmit_jobs(&ring->sched);
>> +
>> +			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>> +		}
>> +
>> +		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
>> +			drm_helper_resume_force_mode(tmp_adev->ddev);
>> +		}
>> +
>> +		tmp_adev->asic_reset_res = 0;
>>    
>>    		if (r) {
>>    			/* bad news, how to tell it to userspace ? */
>> @@ -3633,7 +3680,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>    		amdgpu_device_unlock_adev(tmp_adev);
>>    	}
>>    
>> -	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
>> +	if (hive)
>>    		mutex_unlock(&hive->reset_lock);
>>    
>>    	if (r)
Chunming Zhou April 23, 2019, 3:19 p.m. UTC | #9
Do you mean the fence timer? Why not stop it as well when stopping the scheduler for the reason of HW reset?

-------- Original Message --------
Subject: Re: [PATCH v5 6/6] drm/amdgpu: Avoid HW reset if guilty job already signaled.
From: "Grodzovsky, Andrey"
To: "Zhou, David(ChunMing)" ,dri-devel@lists.freedesktop.org,amd-gfx@lists.freedesktop.org,eric@anholt.net,etnaviv@lists.freedesktop.org,ckoenig.leichtzumerken@gmail.com
CC: "Kazlauskas, Nicholas" ,"Liu, Monk"


On 4/22/19 9:09 AM, Zhou, David(ChunMing) wrote:
> +Monk.
>
> GPU reset is used widely in SRIOV, so need virtulizatino guy take a look.
>
> But out of curious, why guilty job can signal more if the job is already
> set to guilty? set it wrongly?
>
>
> -David


It's possible that the job does complete at a later time than when its
timeout handler started processing, so in this patch we try to protect
against this by rechecking the HW fence after stopping all SW
schedulers. We do it BEFORE marking guilty on the job's sched_entity, so
at the point we check, the guilty flag is not set yet.

Andrey


>
> 在 2019/4/18 23:00, Andrey Grodzovsky 写道:
>> Also reject TDRs if another one already running.
>>
>> v2:
>> Stop all schedulers across device and entire XGMI hive before
>> force signaling HW fences.
>> Avoid passing job_signaled to helper fnctions to keep all the decision
>> making about skipping HW reset in one place.
>>
>> v3:
>> Fix SW sched. hang after non HW reset. sched.hw_rq_count has to be balanced
>> against it's decrement in drm_sched_stop in non HW reset case.
>> v4: rebase
>> v5: Revert v3 as we do it now in sceduler code.
>>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 143 +++++++++++++++++++----------
>>    1 file changed, 95 insertions(+), 48 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index a0e165c..85f8792 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3334,8 +3334,6 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>               if (!ring || !ring->sched.thread)
>>                       continue;
>>
>> -            drm_sched_stop(&ring->sched, &job->base);
>> -
>>               /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>>               amdgpu_fence_driver_force_completion(ring);
>>       }
>> @@ -3343,6 +3341,7 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>       if(job)
>>               drm_sched_increase_karma(&job->base);
>>
>> +    /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
>>       if (!amdgpu_sriov_vf(adev)) {
>>
>>               if (!need_full_reset)
>> @@ -3480,37 +3479,21 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>       return r;
>>    }
>>
>> -static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
>> +static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
>>    {
>> -    int i;
>> -
>> -    for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> -            struct amdgpu_ring *ring = adev->rings[i];
>> -
>> -            if (!ring || !ring->sched.thread)
>> -                    continue;
>> -
>> -            if (!adev->asic_reset_res)
>> -                    drm_sched_resubmit_jobs(&ring->sched);
>> +    if (trylock) {
>> +            if (!mutex_trylock(&adev->lock_reset))
>> +                    return false;
>> +    } else
>> +            mutex_lock(&adev->lock_reset);
>>
>> -            drm_sched_start(&ring->sched, !adev->asic_reset_res);
>> -    }
>> -
>> -    if (!amdgpu_device_has_dc_support(adev)) {
>> -            drm_helper_resume_force_mode(adev->ddev);
>> -    }
>> -
>> -    adev->asic_reset_res = 0;
>> -}
>> -
>> -static void amdgpu_device_lock_adev(struct amdgpu_device *adev)
>> -{
>> -    mutex_lock(&adev->lock_reset);
>>       atomic_inc(&adev->gpu_reset_counter);
>>       adev->in_gpu_reset = 1;
>>       /* Block kfd: SRIOV would do it separately */
>>       if (!amdgpu_sriov_vf(adev))
>>                    amdgpu_amdkfd_pre_reset(adev);
>> +
>> +    return true;
>>    }
>>
>>    static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>> @@ -3538,40 +3521,42 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>>    int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>                             struct amdgpu_job *job)
>>    {
>> -    int r;
>> +    struct list_head device_list, *device_list_handle =  NULL;
>> +    bool need_full_reset, job_signaled;
>>       struct amdgpu_hive_info *hive = NULL;
>> -    bool need_full_reset = false;
>>       struct amdgpu_device *tmp_adev = NULL;
>> -    struct list_head device_list, *device_list_handle =  NULL;
>> +    int i, r = 0;
>>
>> +    need_full_reset = job_signaled = false;
>>       INIT_LIST_HEAD(&device_list);
>>
>>       dev_info(adev->dev, "GPU reset begin!\n");
>>
>> +    hive = amdgpu_get_xgmi_hive(adev, false);
>> +
>>       /*
>> -     * In case of XGMI hive disallow concurrent resets to be triggered
>> -     * by different nodes. No point also since the one node already executing
>> -     * reset will also reset all the other nodes in the hive.
>> +     * Here we trylock to avoid chain of resets executing from
>> +     * either trigger by jobs on different adevs in XGMI hive or jobs on
>> +     * different schedulers for same device while this TO handler is running.
>> +     * We always reset all schedulers for device and all devices for XGMI
>> +     * hive so that should take care of them too.
>>        */
>> -    hive = amdgpu_get_xgmi_hive(adev, 0);
>> -    if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
>> -        !mutex_trylock(&hive->reset_lock))
>> +
>> +    if (hive && !mutex_trylock(&hive->reset_lock)) {
>> +            DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
>> +                     job->base.id, hive->hive_id);
>>               return 0;
>> +    }
>>
>>       /* Start with adev pre asic reset first for soft reset check.*/
>> -    amdgpu_device_lock_adev(adev);
>> -    r = amdgpu_device_pre_asic_reset(adev,
>> -                                     job,
>> -                                     &need_full_reset);
>> -    if (r) {
>> -            /*TODO Should we stop ?*/
>> -            DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
>> -                      r, adev->ddev->unique);
>> -            adev->asic_reset_res = r;
>> +    if (!amdgpu_device_lock_adev(adev, !hive)) {
>> +            DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
>> +                                     job->base.id);
>> +            return 0;
>>       }
>>
>>       /* Build list of devices to reset */
>> -    if  (need_full_reset && adev->gmc.xgmi.num_physical_nodes > 1) {
>> +    if  (adev->gmc.xgmi.num_physical_nodes > 1) {
>>               if (!hive) {
>>                       amdgpu_device_unlock_adev(adev);
>>                       return -ENODEV;
>> @@ -3588,13 +3573,56 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>               device_list_handle = &device_list;
>>       }
>>
>> +    /* block all schedulers and reset given job's ring */
>> +    list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>> +            for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> +                    struct amdgpu_ring *ring = tmp_adev->rings[i];
>> +
>> +                    if (!ring || !ring->sched.thread)
>> +                            continue;
>> +
>> +                    drm_sched_stop(&ring->sched, &job->base);
>> +            }
>> +    }
>> +
>> +
>> +    /*
>> +     * Must check guilty signal here since after this point all old
>> +     * HW fences are force signaled.
>> +     *
>> +     * job->base holds a reference to parent fence
>> +     */
>> +    if (job && job->base.s_fence->parent &&
>> +        dma_fence_is_signaled(job->base.s_fence->parent))
>> +            job_signaled = true;
>> +
>> +    if (!amdgpu_device_ip_need_full_reset(adev))
>> +            device_list_handle = &device_list;
>> +
>> +    if (job_signaled) {
>> +            dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
>> +            goto skip_hw_reset;
>> +    }
>> +
>> +
>> +    /* Guilty job will be freed after this*/
>> +    r = amdgpu_device_pre_asic_reset(adev,
>> +                                     job,
>> +                                     &need_full_reset);
>> +    if (r) {
>> +            /*TODO Should we stop ?*/
>> +            DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
>> +                      r, adev->ddev->unique);
>> +            adev->asic_reset_res = r;
>> +    }
>> +
>>    retry:    /* Rest of adevs pre asic reset from XGMI hive. */
>>       list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>
>>               if (tmp_adev == adev)
>>                       continue;
>>
>> -            amdgpu_device_lock_adev(tmp_adev);
>> +            amdgpu_device_lock_adev(tmp_adev, false);
>>               r = amdgpu_device_pre_asic_reset(tmp_adev,
>>                                                NULL,
>>                                                &need_full_reset);
>> @@ -3618,9 +3646,28 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>                       goto retry;
>>       }
>>
>> +skip_hw_reset:
>> +
>>       /* Post ASIC reset for all devs .*/
>>       list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>> -            amdgpu_device_post_asic_reset(tmp_adev);
>> +            for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> +                    struct amdgpu_ring *ring = tmp_adev->rings[i];
>> +
>> +                    if (!ring || !ring->sched.thread)
>> +                            continue;
>> +
>> +                    /* No point to resubmit jobs if we didn't HW reset*/
>> +                    if (!tmp_adev->asic_reset_res && !job_signaled)
>> +                            drm_sched_resubmit_jobs(&ring->sched);
>> +
>> +                    drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>> +            }
>> +
>> +            if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
>> +                    drm_helper_resume_force_mode(tmp_adev->ddev);
>> +            }
>> +
>> +            tmp_adev->asic_reset_res = 0;
>>
>>               if (r) {
>>                       /* bad news, how to tell it to userspace ? */
>> @@ -3633,7 +3680,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>               amdgpu_device_unlock_adev(tmp_adev);
>>       }
>>
>> -    if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
>> +    if (hive)
>>               mutex_unlock(&hive->reset_lock);
>>
>>       if (r)
Andrey Grodzovsky April 23, 2019, 3:59 p.m. UTC | #10
No, I mean the actual HW fence which signals when the job finishes execution on the HW.

Andrey

On 4/23/19 11:19 AM, Zhou, David(ChunMing) wrote:
Do you mean the fence timer? Why not stop it as well when stopping the scheduler, for the same reason of HW reset?

-------- Original Message --------
Subject: Re: [PATCH v5 6/6] drm/amdgpu: Avoid HW reset if guilty job already signaled.
From: "Grodzovsky, Andrey"
To: "Zhou, David(ChunMing)" ,dri-devel@lists.freedesktop.org,amd-gfx@lists.freedesktop.org,eric@anholt.net,etnaviv@lists.freedesktop.org,ckoenig.leichtzumerken@gmail.com<mailto:dri-devel@lists.freedesktop.org,amd-gfx@lists.freedesktop.org,eric@anholt.net,etnaviv@lists.freedesktop.org,ckoenig.leichtzumerken@gmail.com>
CC: "Kazlauskas, Nicholas" ,"Liu, Monk"


On 4/22/19 9:09 AM, Zhou, David(ChunMing) wrote:
> +Monk.
>
> GPU reset is used widely in SRIOV, so need virtulizatino guy take a look.
>
> But out of curious, why guilty job can signal more if the job is already
> set to guilty? set it wrongly?
>
>
> -David


It's possible that the job does complete at a later time than when its
timeout handler started processing, so in this patch we try to protect
against this by rechecking the HW fence after stopping all SW
schedulers. We do it BEFORE marking guilty on the job's sched_entity, so
at the point we check, the guilty flag is not set yet.

Andrey


>
> 在 2019/4/18 23:00, Andrey Grodzovsky 写道:
>> Also reject TDRs if another one already running.
>>
>> v2:
>> Stop all schedulers across device and entire XGMI hive before
>> force signaling HW fences.
>> Avoid passing job_signaled to helper fnctions to keep all the decision
>> making about skipping HW reset in one place.
>>
>> v3:
>> Fix SW sched. hang after non HW reset. sched.hw_rq_count has to be balanced
>> against it's decrement in drm_sched_stop in non HW reset case.
>> v4: rebase
>> v5: Revert v3 as we do it now in sceduler code.
>>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com><mailto:andrey.grodzovsky@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 143 +++++++++++++++++++----------
>>    1 file changed, 95 insertions(+), 48 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index a0e165c..85f8792 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3334,8 +3334,6 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>               if (!ring || !ring->sched.thread)
>>                       continue;
>>
>> -            drm_sched_stop(&ring->sched, &job->base);
>> -
>>               /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>>               amdgpu_fence_driver_force_completion(ring);
>>       }
>> @@ -3343,6 +3341,7 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>       if(job)
>>               drm_sched_increase_karma(&job->base);
>>
>> +    /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
>>       if (!amdgpu_sriov_vf(adev)) {
>>
>>               if (!need_full_reset)
>> @@ -3480,37 +3479,21 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>       return r;
>>    }
>>
>> -static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
>> +static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
>>    {
>> -    int i;
>> -
>> -    for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> -            struct amdgpu_ring *ring = adev->rings[i];
>> -
>> -            if (!ring || !ring->sched.thread)
>> -                    continue;
>> -
>> -            if (!adev->asic_reset_res)
>> -                    drm_sched_resubmit_jobs(&ring->sched);
>> +    if (trylock) {
>> +            if (!mutex_trylock(&adev->lock_reset))
>> +                    return false;
>> +    } else
>> +            mutex_lock(&adev->lock_reset);
>>
>> -            drm_sched_start(&ring->sched, !adev->asic_reset_res);
>> -    }
>> -
>> -    if (!amdgpu_device_has_dc_support(adev)) {
>> -            drm_helper_resume_force_mode(adev->ddev);
>> -    }
>> -
>> -    adev->asic_reset_res = 0;
>> -}
>> -
>> -static void amdgpu_device_lock_adev(struct amdgpu_device *adev)
>> -{
>> -    mutex_lock(&adev->lock_reset);
>>       atomic_inc(&adev->gpu_reset_counter);
>>       adev->in_gpu_reset = 1;
>>       /* Block kfd: SRIOV would do it separately */
>>       if (!amdgpu_sriov_vf(adev))
>>                    amdgpu_amdkfd_pre_reset(adev);
>> +
>> +    return true;
>>    }
>>
>>    static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>> @@ -3538,40 +3521,42 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>>    int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>                             struct amdgpu_job *job)
>>    {
>> -    int r;
>> +    struct list_head device_list, *device_list_handle =  NULL;
>> +    bool need_full_reset, job_signaled;
>>       struct amdgpu_hive_info *hive = NULL;
>> -    bool need_full_reset = false;
>>       struct amdgpu_device *tmp_adev = NULL;
>> -    struct list_head device_list, *device_list_handle =  NULL;
>> +    int i, r = 0;
>>
>> +    need_full_reset = job_signaled = false;
>>       INIT_LIST_HEAD(&device_list);
>>
>>       dev_info(adev->dev, "GPU reset begin!\n");
>>
>> +    hive = amdgpu_get_xgmi_hive(adev, false);
>> +
>>       /*
>> -     * In case of XGMI hive disallow concurrent resets to be triggered
>> -     * by different nodes. No point also since the one node already executing
>> -     * reset will also reset all the other nodes in the hive.
>> +     * Here we trylock to avoid chain of resets executing from
>> +     * either trigger by jobs on different adevs in XGMI hive or jobs on
>> +     * different schedulers for same device while this TO handler is running.
>> +     * We always reset all schedulers for device and all devices for XGMI
>> +     * hive so that should take care of them too.
>>        */
>> -    hive = amdgpu_get_xgmi_hive(adev, 0);
>> -    if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
>> -        !mutex_trylock(&hive->reset_lock))
>> +
>> +    if (hive && !mutex_trylock(&hive->reset_lock)) {
>> +            DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
>> +                     job->base.id, hive->hive_id);
>>               return 0;
>> +    }
>>
>>       /* Start with adev pre asic reset first for soft reset check.*/
>> -    amdgpu_device_lock_adev(adev);
>> -    r = amdgpu_device_pre_asic_reset(adev,
>> -                                     job,
>> -                                     &need_full_reset);
>> -    if (r) {
>> -            /*TODO Should we stop ?*/
>> -            DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
>> -                      r, adev->ddev->unique);
>> -            adev->asic_reset_res = r;
>> +    if (!amdgpu_device_lock_adev(adev, !hive)) {
>> +            DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
>> +                                     job->base.id);
>> +            return 0;
>>       }
>>
>>       /* Build list of devices to reset */
>> -    if  (need_full_reset && adev->gmc.xgmi.num_physical_nodes > 1) {
>> +    if  (adev->gmc.xgmi.num_physical_nodes > 1) {
>>               if (!hive) {
>>                       amdgpu_device_unlock_adev(adev);
>>                       return -ENODEV;
>> @@ -3588,13 +3573,56 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>               device_list_handle = &device_list;
>>       }
>>
>> +    /* block all schedulers and reset given job's ring */
>> +    list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>> +            for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> +                    struct amdgpu_ring *ring = tmp_adev->rings[i];
>> +
>> +                    if (!ring || !ring->sched.thread)
>> +                            continue;
>> +
>> +                    drm_sched_stop(&ring->sched, &job->base);
>> +            }
>> +    }
>> +
>> +
>> +    /*
>> +     * Must check guilty signal here since after this point all old
>> +     * HW fences are force signaled.
>> +     *
>> +     * job->base holds a reference to parent fence
>> +     */
>> +    if (job && job->base.s_fence->parent &&
>> +        dma_fence_is_signaled(job->base.s_fence->parent))
>> +            job_signaled = true;
>> +
>> +    if (!amdgpu_device_ip_need_full_reset(adev))
>> +            device_list_handle = &device_list;
>> +
>> +    if (job_signaled) {
>> +            dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
>> +            goto skip_hw_reset;
>> +    }
>> +
>> +
>> +    /* Guilty job will be freed after this*/
>> +    r = amdgpu_device_pre_asic_reset(adev,
>> +                                     job,
>> +                                     &need_full_reset);
>> +    if (r) {
>> +            /*TODO Should we stop ?*/
>> +            DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
>> +                      r, adev->ddev->unique);
>> +            adev->asic_reset_res = r;
>> +    }
>> +
>>    retry:    /* Rest of adevs pre asic reset from XGMI hive. */
>>       list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>
>>               if (tmp_adev == adev)
>>                       continue;
>>
>> -            amdgpu_device_lock_adev(tmp_adev);
>> +            amdgpu_device_lock_adev(tmp_adev, false);
>>               r = amdgpu_device_pre_asic_reset(tmp_adev,
>>                                                NULL,
>>                                                &need_full_reset);
>> @@ -3618,9 +3646,28 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>                       goto retry;
>>       }
>>
>> +skip_hw_reset:
>> +
>>       /* Post ASIC reset for all devs .*/
>>       list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>> -            amdgpu_device_post_asic_reset(tmp_adev);
>> +            for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> +                    struct amdgpu_ring *ring = tmp_adev->rings[i];
>> +
>> +                    if (!ring || !ring->sched.thread)
>> +                            continue;
>> +
>> +                    /* No point to resubmit jobs if we didn't HW reset*/
>> +                    if (!tmp_adev->asic_reset_res && !job_signaled)
>> +                            drm_sched_resubmit_jobs(&ring->sched);
>> +
>> +                    drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>> +            }
>> +
>> +            if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
>> +                    drm_helper_resume_force_mode(tmp_adev->ddev);
>> +            }
>> +
>> +            tmp_adev->asic_reset_res = 0;
>>
>>               if (r) {
>>                       /* bad news, how to tell it to userspace ? */
>> @@ -3633,7 +3680,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>               amdgpu_device_unlock_adev(tmp_adev);
>>       }
>>
>> -    if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
>> +    if (hive)
>>               mutex_unlock(&hive->reset_lock);
>>
>>       if (r)
Chunming Zhou April 24, 2019, 3:02 a.m. UTC | #11
>> -            drm_sched_stop(&ring->sched, &job->base);
>> -
>>               /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>>               amdgpu_fence_driver_force_completion(ring);
>>       }

HW fences are already force-completed, so I think we can just disable IRQ fence processing and ignore HW fence signals while we are trying to do a GPU reset. Otherwise this will make the logic much more complex.
If this situation happens because of long execution time, we can increase the timeout of reset detection.

-David

From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Grodzovsky, Andrey
Sent: Wednesday, April 24, 2019 12:00 AM
To: Zhou, David(ChunMing) <David1.Zhou@amd.com>; dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org; eric@anholt.net; etnaviv@lists.freedesktop.org; ckoenig.leichtzumerken@gmail.com
Cc: Kazlauskas, Nicholas <Nicholas.Kazlauskas@amd.com>; Liu, Monk <Monk.Liu@amd.com>
Subject: Re: [PATCH v5 6/6] drm/amdgpu: Avoid HW reset if guilty job already signaled.


No, i mean the actual HW fence which signals when the job finished execution on the HW.

Andrey
On 4/23/19 11:19 AM, Zhou, David(ChunMing) wrote:
do you mean fence timer? why not stop it as well when stopping sched for the reason of hw reset?

-------- Original Message --------
Subject: Re: [PATCH v5 6/6] drm/amdgpu: Avoid HW reset if guilty job already signaled.
From: "Grodzovsky, Andrey"
To: "Zhou, David(ChunMing)" ,dri-devel@lists.freedesktop.org,amd-gfx@lists.freedesktop.org,eric@anholt.net,etnaviv@lists.freedesktop.org,ckoenig.leichtzumerken@gmail.com<mailto:dri-devel@lists.freedesktop.org,amd-gfx@lists.freedesktop.org,eric@anholt.net,etnaviv@lists.freedesktop.org,ckoenig.leichtzumerken@gmail.com>
CC: "Kazlauskas, Nicholas" ,"Liu, Monk"

On 4/22/19 9:09 AM, Zhou, David(ChunMing) wrote:
> +Monk.
>
> GPU reset is used widely in SRIOV, so need virtulizatino guy take a look.
>
> But out of curious, why guilty job can signal more if the job is already
> set to guilty? set it wrongly?
>
>
> -David


It's possible that the job does completes at a later time then it's
timeout handler started processing so in this patch we try to protect
against this by rechecking the HW fence after stopping all SW
schedulers. We do it BEFORE marking guilty on the job's sched_entity so
at the point we check the guilty flag is not set yet.

Andrey


>
> 在 2019/4/18 23:00, Andrey Grodzovsky 写道:
>> Also reject TDRs if another one already running.
>>
>> v2:
>> Stop all schedulers across device and entire XGMI hive before
>> force signaling HW fences.
>> Avoid passing job_signaled to helper fnctions to keep all the decision
>> making about skipping HW reset in one place.
>>
>> v3:
>> Fix SW sched. hang after non HW reset. sched.hw_rq_count has to be balanced
>> against it's decrement in drm_sched_stop in non HW reset case.
>> v4: rebase
>> v5: Revert v3 as we do it now in sceduler code.
>>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com><mailto:andrey.grodzovsky@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 143 +++++++++++++++++++----------
>>    1 file changed, 95 insertions(+), 48 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index a0e165c..85f8792 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3334,8 +3334,6 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>               if (!ring || !ring->sched.thread)
>>                       continue;
>>
>> -            drm_sched_stop(&ring->sched, &job->base);
>> -
>>               /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>>               amdgpu_fence_driver_force_completion(ring);
>>       }
>> @@ -3343,6 +3341,7 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>       if(job)
>>               drm_sched_increase_karma(&job->base);
>>
>> +    /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
>>       if (!amdgpu_sriov_vf(adev)) {
>>
>>               if (!need_full_reset)
>> @@ -3480,37 +3479,21 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>       return r;
>>    }
>>
>> -static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
>> +static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
>>    {
>> -    int i;
>> -
>> -    for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> -            struct amdgpu_ring *ring = adev->rings[i];
>> -
>> -            if (!ring || !ring->sched.thread)
>> -                    continue;
>> -
>> -            if (!adev->asic_reset_res)
>> -                    drm_sched_resubmit_jobs(&ring->sched);
>> +    if (trylock) {
>> +            if (!mutex_trylock(&adev->lock_reset))
>> +                    return false;
>> +    } else
>> +            mutex_lock(&adev->lock_reset);
>>
>> -            drm_sched_start(&ring->sched, !adev->asic_reset_res);
>> -    }
>> -
>> -    if (!amdgpu_device_has_dc_support(adev)) {
>> -            drm_helper_resume_force_mode(adev->ddev);
>> -    }
>> -
>> -    adev->asic_reset_res = 0;
>> -}
>> -
>> -static void amdgpu_device_lock_adev(struct amdgpu_device *adev)
>> -{
>> -    mutex_lock(&adev->lock_reset);
>>       atomic_inc(&adev->gpu_reset_counter);
>>       adev->in_gpu_reset = 1;
>>       /* Block kfd: SRIOV would do it separately */
>>       if (!amdgpu_sriov_vf(adev))
>>                    amdgpu_amdkfd_pre_reset(adev);
>> +
>> +    return true;
>>    }
>>
>>    static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>> @@ -3538,40 +3521,42 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>>    int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>                             struct amdgpu_job *job)
>>    {
>> -    int r;
>> +    struct list_head device_list, *device_list_handle =  NULL;
>> +    bool need_full_reset, job_signaled;
>>       struct amdgpu_hive_info *hive = NULL;
>> -    bool need_full_reset = false;
>>       struct amdgpu_device *tmp_adev = NULL;
>> -    struct list_head device_list, *device_list_handle =  NULL;
>> +    int i, r = 0;
>>
>> +    need_full_reset = job_signaled = false;
>>       INIT_LIST_HEAD(&device_list);
>>
>>       dev_info(adev->dev, "GPU reset begin!\n");
>>
>> +    hive = amdgpu_get_xgmi_hive(adev, false);
>> +
>>       /*
>> -     * In case of XGMI hive disallow concurrent resets to be triggered
>> -     * by different nodes. No point also since the one node already executing
>> -     * reset will also reset all the other nodes in the hive.
>> +     * Here we trylock to avoid chain of resets executing from
>> +     * either trigger by jobs on different adevs in XGMI hive or jobs on
>> +     * different schedulers for same device while this TO handler is running.
>> +     * We always reset all schedulers for device and all devices for XGMI
>> +     * hive so that should take care of them too.
>>        */
>> -    hive = amdgpu_get_xgmi_hive(adev, 0);
>> -    if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
>> -        !mutex_trylock(&hive->reset_lock))
>> +
>> +    if (hive && !mutex_trylock(&hive->reset_lock)) {
>> +            DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
>> +                     job->base.id, hive->hive_id);
>>               return 0;
>> +    }
>>
>>       /* Start with adev pre asic reset first for soft reset check.*/
>> -    amdgpu_device_lock_adev(adev);
>> -    r = amdgpu_device_pre_asic_reset(adev,
>> -                                     job,
>> -                                     &need_full_reset);
>> -    if (r) {
>> -            /*TODO Should we stop ?*/
>> -            DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
>> -                      r, adev->ddev->unique);
>> -            adev->asic_reset_res = r;
>> +    if (!amdgpu_device_lock_adev(adev, !hive)) {
>> +            DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
>> +                                     job->base.id);
>> +            return 0;
>>       }
>>
>>       /* Build list of devices to reset */
>> -    if  (need_full_reset && adev->gmc.xgmi.num_physical_nodes > 1) {
>> +    if  (adev->gmc.xgmi.num_physical_nodes > 1) {
>>               if (!hive) {
>>                       amdgpu_device_unlock_adev(adev);
>>                       return -ENODEV;
>> @@ -3588,13 +3573,56 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>               device_list_handle = &device_list;
>>       }
>>
>> +    /* block all schedulers and reset given job's ring */
>> +    list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>> +            for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> +                    struct amdgpu_ring *ring = tmp_adev->rings[i];
>> +
>> +                    if (!ring || !ring->sched.thread)
>> +                            continue;
>> +
>> +                    drm_sched_stop(&ring->sched, &job->base);
>> +            }
>> +    }
>> +
>> +
>> +    /*
>> +     * Must check guilty signal here since after this point all old
>> +     * HW fences are force signaled.
>> +     *
>> +     * job->base holds a reference to parent fence
>> +     */
>> +    if (job && job->base.s_fence->parent &&
>> +        dma_fence_is_signaled(job->base.s_fence->parent))
>> +            job_signaled = true;
>> +
>> +    if (!amdgpu_device_ip_need_full_reset(adev))
>> +            device_list_handle = &device_list;
>> +
>> +    if (job_signaled) {
>> +            dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
>> +            goto skip_hw_reset;
>> +    }
>> +
>> +
>> +    /* Guilty job will be freed after this*/
>> +    r = amdgpu_device_pre_asic_reset(adev,
>> +                                     job,
>> +                                     &need_full_reset);
>> +    if (r) {
>> +            /*TODO Should we stop ?*/
>> +            DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
>> +                      r, adev->ddev->unique);
>> +            adev->asic_reset_res = r;
>> +    }
>> +
>>    retry:    /* Rest of adevs pre asic reset from XGMI hive. */
>>       list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>
>>               if (tmp_adev == adev)
>>                       continue;
>>
>> -            amdgpu_device_lock_adev(tmp_adev);
>> +            amdgpu_device_lock_adev(tmp_adev, false);
>>               r = amdgpu_device_pre_asic_reset(tmp_adev,
>>                                                NULL,
>>                                                &need_full_reset);
>> @@ -3618,9 +3646,28 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>                       goto retry;
>>       }
>>
>> +skip_hw_reset:
>> +
>>       /* Post ASIC reset for all devs .*/
>>       list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>> -            amdgpu_device_post_asic_reset(tmp_adev);
>> +            for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> +                    struct amdgpu_ring *ring = tmp_adev->rings[i];
>> +
>> +                    if (!ring || !ring->sched.thread)
>> +                            continue;
>> +
>> +                    /* No point to resubmit jobs if we didn't HW reset*/
>> +                    if (!tmp_adev->asic_reset_res && !job_signaled)
>> +                            drm_sched_resubmit_jobs(&ring->sched);
>> +
>> +                    drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>> +            }
>> +
>> +            if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
>> +                    drm_helper_resume_force_mode(tmp_adev->ddev);
>> +            }
>> +
>> +            tmp_adev->asic_reset_res = 0;
>>
>>               if (r) {
>>                       /* bad news, how to tell it to userspace ? */
>> @@ -3633,7 +3680,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>               amdgpu_device_unlock_adev(tmp_adev);
>>       }
>>
>> -    if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
>> +    if (hive)
>>               mutex_unlock(&hive->reset_lock);
>>
>>       if (r)
Christian König April 24, 2019, 7:09 a.m. UTC | #12
Am 24.04.19 um 05:02 schrieb Zhou, David(ChunMing):
>
> >> - drm_sched_stop(&ring->sched, &job->base);
> >> -
> >>               /* after all hw jobs are reset, hw fence is 
> meaningless, so force_completion */
> >> amdgpu_fence_driver_force_completion(ring);
> >>       }
>
> HW fence are already forced completion, then we can just disable irq 
> fence process and ignore hw fence signal when we are trying to do GPU 
> reset, I think. Otherwise which will make the logic much more complex.
>
> If this situation happens because of long time execution, we can 
> increase timeout of reset detection.
>

You are not thinking widely enough; forcing the hw fence to complete can 
trigger others to start other activity in the system.

We first need to stop everything and make sure that we don't do any 
processing any more and then start with our reset procedure including 
forcing all hw fences to complete.

Christian.

> -David
>
> *From:*amd-gfx <amd-gfx-bounces@lists.freedesktop.org> *On Behalf Of 
> *Grodzovsky, Andrey
> *Sent:* Wednesday, April 24, 2019 12:00 AM
> *To:* Zhou, David(ChunMing) <David1.Zhou@amd.com>; 
> dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org; 
> eric@anholt.net; etnaviv@lists.freedesktop.org; 
> ckoenig.leichtzumerken@gmail.com
> *Cc:* Kazlauskas, Nicholas <Nicholas.Kazlauskas@amd.com>; Liu, Monk 
> <Monk.Liu@amd.com>
> *Subject:* Re: [PATCH v5 6/6] drm/amdgpu: Avoid HW reset if guilty job 
> already signaled.
>
> No, i mean the actual HW fence which signals when the job finished 
> execution on the HW.
>
> Andrey
>
> On 4/23/19 11:19 AM, Zhou, David(ChunMing) wrote:
>
>     do you mean fence timer? why not stop it as well when stopping
>     sched for the reason of hw reset?
>
>     -------- Original Message --------
>     Subject: Re: [PATCH v5 6/6] drm/amdgpu: Avoid HW reset if guilty
>     job already signaled.
>     From: "Grodzovsky, Andrey"
>     To: "Zhou, David(ChunMing)"
>     ,dri-devel@lists.freedesktop.org,amd-gfx@lists.freedesktop.org,eric@anholt.net,etnaviv@lists.freedesktop.org,ckoenig.leichtzumerken@gmail.com
>     <mailto:dri-devel@lists.freedesktop.org,amd-gfx@lists.freedesktop.org,eric@anholt.net,etnaviv@lists.freedesktop.org,ckoenig.leichtzumerken@gmail.com>
>     CC: "Kazlauskas, Nicholas" ,"Liu, Monk"
>
>
>     On 4/22/19 9:09 AM, Zhou, David(ChunMing) wrote:
>     > +Monk.
>     >
>     > GPU reset is used widely in SRIOV, so need virtulizatino guy
>     take a look.
>     >
>     > But out of curious, why guilty job can signal more if the job is
>     already
>     > set to guilty? set it wrongly?
>     >
>     >
>     > -David
>
>
>     It's possible that the job does completes at a later time then it's
>     timeout handler started processing so in this patch we try to protect
>     against this by rechecking the HW fence after stopping all SW
>     schedulers. We do it BEFORE marking guilty on the job's
>     sched_entity so
>     at the point we check the guilty flag is not set yet.
>
>     Andrey
>
>
>     >
>     > 在 2019/4/18 23:00, Andrey Grodzovsky 写道:
>     >> Also reject TDRs if another one already running.
>     >>
>     >> v2:
>     >> Stop all schedulers across device and entire XGMI hive before
>     >> force signaling HW fences.
>     >> Avoid passing job_signaled to helper fnctions to keep all the
>     decision
>     >> making about skipping HW reset in one place.
>     >>
>     >> v3:
>     >> Fix SW sched. hang after non HW reset. sched.hw_rq_count has to
>     be balanced
>     >> against it's decrement in drm_sched_stop in non HW reset case.
>     >> v4: rebase
>     >> v5: Revert v3 as we do it now in sceduler code.
>     >>
>     >> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>     <mailto:andrey.grodzovsky@amd.com>
>     >> ---
>     >>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 143
>     +++++++++++++++++++----------
>     >>    1 file changed, 95 insertions(+), 48 deletions(-)
>     >>
>     >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>     b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>     >> index a0e165c..85f8792 100644
>     >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>     >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>     >> @@ -3334,8 +3334,6 @@ static int
>     amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>     >>               if (!ring || !ring->sched.thread)
>     >>                       continue;
>     >>
>     >> - drm_sched_stop(&ring->sched, &job->base);
>     >> -
>     >>               /* after all hw jobs are reset, hw fence is
>     meaningless, so force_completion */
>     >> amdgpu_fence_driver_force_completion(ring);
>     >>       }
>     >> @@ -3343,6 +3341,7 @@ static int
>     amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>     >>       if(job)
>     >> drm_sched_increase_karma(&job->base);
>     >>
>     >> +    /* Don't suspend on bare metal if we are not going to HW
>     reset the ASIC */
>     >>       if (!amdgpu_sriov_vf(adev)) {
>     >>
>     >>               if (!need_full_reset)
>     >> @@ -3480,37 +3479,21 @@ static int amdgpu_do_asic_reset(struct
>     amdgpu_hive_info *hive,
>     >>       return r;
>     >>    }
>     >>
>     >> -static void amdgpu_device_post_asic_reset(struct amdgpu_device
>     *adev)
>     >> +static bool amdgpu_device_lock_adev(struct amdgpu_device
>     *adev, bool trylock)
>     >>    {
>     >> -    int i;
>     >> -
>     >> -    for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>     >> -            struct amdgpu_ring *ring = adev->rings[i];
>     >> -
>     >> -            if (!ring || !ring->sched.thread)
>     >> -                    continue;
>     >> -
>     >> -            if (!adev->asic_reset_res)
>     >> - drm_sched_resubmit_jobs(&ring->sched);
>     >> +    if (trylock) {
>     >> +            if (!mutex_trylock(&adev->lock_reset))
>     >> +                    return false;
>     >> +    } else
>     >> + mutex_lock(&adev->lock_reset);
>     >>
>     >> - drm_sched_start(&ring->sched, !adev->asic_reset_res);
>     >> -    }
>     >> -
>     >> -    if (!amdgpu_device_has_dc_support(adev)) {
>     >> - drm_helper_resume_force_mode(adev->ddev);
>     >> -    }
>     >> -
>     >> -    adev->asic_reset_res = 0;
>     >> -}
>     >> -
>     >> -static void amdgpu_device_lock_adev(struct amdgpu_device *adev)
>     >> -{
>     >> -    mutex_lock(&adev->lock_reset);
>     >> atomic_inc(&adev->gpu_reset_counter);
>     >>       adev->in_gpu_reset = 1;
>     >>       /* Block kfd: SRIOV would do it separately */
>     >>       if (!amdgpu_sriov_vf(adev))
>     >> amdgpu_amdkfd_pre_reset(adev);
>     >> +
>     >> +    return true;
>     >>    }
>     >>
>     >>    static void amdgpu_device_unlock_adev(struct amdgpu_device
>     *adev)
>     >> @@ -3538,40 +3521,42 @@ static void
>     amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>     >>    int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>     >>                             struct amdgpu_job *job)
>     >>    {
>     >> -    int r;
>     >> +    struct list_head device_list, *device_list_handle =  NULL;
>     >> +    bool need_full_reset, job_signaled;
>     >>       struct amdgpu_hive_info *hive = NULL;
>     >> -    bool need_full_reset = false;
>     >>       struct amdgpu_device *tmp_adev = NULL;
>     >> -    struct list_head device_list, *device_list_handle =  NULL;
>     >> +    int i, r = 0;
>     >>
>     >> +    need_full_reset = job_signaled = false;
>     >>       INIT_LIST_HEAD(&device_list);
>     >>
>     >>       dev_info(adev->dev, "GPU reset begin!\n");
>     >>
>     >> +    hive = amdgpu_get_xgmi_hive(adev, false);
>     >> +
>     >>       /*
>     >> -     * In case of XGMI hive disallow concurrent resets to be
>     triggered
>     >> -     * by different nodes. No point also since the one node
>     already executing
>     >> -     * reset will also reset all the other nodes in the hive.
>     >> +     * Here we trylock to avoid chain of resets executing from
>     >> +     * either trigger by jobs on different adevs in XGMI hive
>     or jobs on
>     >> +     * different schedulers for same device while this TO
>     handler is running.
>     >> +     * We always reset all schedulers for device and all
>     devices for XGMI
>     >> +     * hive so that should take care of them too.
>     >>        */
>     >> -    hive = amdgpu_get_xgmi_hive(adev, 0);
>     >> -    if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
>     >> - !mutex_trylock(&hive->reset_lock))
>     >> +
>     >> +    if (hive && !mutex_trylock(&hive->reset_lock)) {
>     >> +            DRM_INFO("Bailing on TDR for s_job:%llx, hive:
>     %llx as another already in progress",
>     >> +                     job->base.id, hive->hive_id);
>     >>               return 0;
>     >> +    }
>     >>
>     >>       /* Start with adev pre asic reset first for soft reset
>     check.*/
>     >> -    amdgpu_device_lock_adev(adev);
>     >> -    r = amdgpu_device_pre_asic_reset(adev,
>     >> -                                     job,
>     >> - &need_full_reset);
>     >> -    if (r) {
>     >> -            /*TODO Should we stop ?*/
>     >> -            DRM_ERROR("GPU pre asic reset failed with err, %d
>     for drm dev, %s ",
>     >> -                      r, adev->ddev->unique);
>     >> -            adev->asic_reset_res = r;
>     >> +    if (!amdgpu_device_lock_adev(adev, !hive)) {
>     >> +            DRM_INFO("Bailing on TDR for s_job:%llx, as
>     another already in progress",
>     >> + job->base.id);
>     >> +            return 0;
>     >>       }
>     >>
>     >>       /* Build list of devices to reset */
>     >> -    if  (need_full_reset && adev->gmc.xgmi.num_physical_nodes
>     > 1) {
>     >> +    if  (adev->gmc.xgmi.num_physical_nodes > 1) {
>     >>               if (!hive) {
>     >> amdgpu_device_unlock_adev(adev);
>     >>                       return -ENODEV;
>     >> @@ -3588,13 +3573,56 @@ int amdgpu_device_gpu_recover(struct
>     amdgpu_device *adev,
>     >>               device_list_handle = &device_list;
>     >>       }
>     >>
>     >> +    /* block all schedulers and reset given job's ring */
>     >> +    list_for_each_entry(tmp_adev, device_list_handle,
>     gmc.xgmi.head) {
>     >> +            for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>     >> +                    struct amdgpu_ring *ring = tmp_adev->rings[i];
>     >> +
>     >> +                    if (!ring || !ring->sched.thread)
>     >> +                            continue;
>     >> +
>     >> + drm_sched_stop(&ring->sched, &job->base);
>     >> +            }
>     >> +    }
>     >> +
>     >> +
>     >> +    /*
>     >> +     * Must check guilty signal here since after this point
>     all old
>     >> +     * HW fences are force signaled.
>     >> +     *
>     >> +     * job->base holds a reference to parent fence
>     >> +     */
>     >> +    if (job && job->base.s_fence->parent &&
>     >> + dma_fence_is_signaled(job->base.s_fence->parent))
>     >> +            job_signaled = true;
>     >> +
>     >> +    if (!amdgpu_device_ip_need_full_reset(adev))
>     >> +            device_list_handle = &device_list;
>     >> +
>     >> +    if (job_signaled) {
>     >> +            dev_info(adev->dev, "Guilty job already signaled,
>     skipping HW reset");
>     >> +            goto skip_hw_reset;
>     >> +    }
>     >> +
>     >> +
>     >> +    /* Guilty job will be freed after this*/
>     >> +    r = amdgpu_device_pre_asic_reset(adev,
>     >> +                                     job,
>     >> + &need_full_reset);
>     >> +    if (r) {
>     >> +            /*TODO Should we stop ?*/
>     >> +            DRM_ERROR("GPU pre asic reset failed with err, %d
>     for drm dev, %s ",
>     >> +                      r, adev->ddev->unique);
>     >> +            adev->asic_reset_res = r;
>     >> +    }
>     >> +
>     >>    retry:    /* Rest of adevs pre asic reset from XGMI hive. */
>     >>       list_for_each_entry(tmp_adev, device_list_handle,
>     gmc.xgmi.head) {
>     >>
>     >>               if (tmp_adev == adev)
>     >>                       continue;
>     >>
>     >> -            amdgpu_device_lock_adev(tmp_adev);
>     >> +            amdgpu_device_lock_adev(tmp_adev, false);
>     >>               r = amdgpu_device_pre_asic_reset(tmp_adev,
>     >> NULL,
>     >> &need_full_reset);
>     >> @@ -3618,9 +3646,28 @@ int amdgpu_device_gpu_recover(struct
>     amdgpu_device *adev,
>     >>                       goto retry;
>     >>       }
>     >>
>     >> +skip_hw_reset:
>     >> +
>     >>       /* Post ASIC reset for all devs .*/
>     >>       list_for_each_entry(tmp_adev, device_list_handle,
>     gmc.xgmi.head) {
>     >> - amdgpu_device_post_asic_reset(tmp_adev);
>     >> +            for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>     >> +                    struct amdgpu_ring *ring = tmp_adev->rings[i];
>     >> +
>     >> +                    if (!ring || !ring->sched.thread)
>     >> +                            continue;
>     >> +
>     >> +                    /* No point to resubmit jobs if we didn't
>     HW reset*/
>     >> +                    if (!tmp_adev->asic_reset_res &&
>     !job_signaled)
>     >> + drm_sched_resubmit_jobs(&ring->sched);
>     >> +
>     >> + drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>     >> +            }
>     >> +
>     >> +            if (!amdgpu_device_has_dc_support(tmp_adev) &&
>     !job_signaled) {
>     >> + drm_helper_resume_force_mode(tmp_adev->ddev);
>     >> +            }
>     >> +
>     >> +            tmp_adev->asic_reset_res = 0;
>     >>
>     >>               if (r) {
>     >>                       /* bad news, how to tell it to userspace ? */
>     >> @@ -3633,7 +3680,7 @@ int amdgpu_device_gpu_recover(struct
>     amdgpu_device *adev,
>     >> amdgpu_device_unlock_adev(tmp_adev);
>     >>       }
>     >>
>     >> -    if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
>     >> +    if (hive)
>     >> mutex_unlock(&hive->reset_lock);
>     >>
>     >>       if (r)
>     _______________________________________________
>     amd-gfx mailing list
>     amd-gfx@lists.freedesktop.org <mailto:amd-gfx@lists.freedesktop.org>
>     https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
<html>
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
  </head>
  <body text="#000000" bgcolor="#FFFFFF">
    <div class="moz-cite-prefix">Am 24.04.19 um 05:02 schrieb Zhou,
      David(ChunMing):<br>
    </div>
    <blockquote type="cite"
cite="mid:MN2PR12MB2910C37264B8F30BAD26FA4CB43C0@MN2PR12MB2910.namprd12.prod.outlook.com">
      <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
      <meta name="Generator" content="Microsoft Word 15 (filtered
        medium)">
      <style><!--
/* Font Definitions */
@font-face
	{font-family:宋体;
	panose-1:2 1 6 0 3 1 1 1 1 1;}
@font-face
	{font-family:"Cambria Math";
	panose-1:2 4 5 3 5 4 6 3 2 4;}
@font-face
	{font-family:Calibri;
	panose-1:2 15 5 2 2 2 4 3 2 4;}
@font-face
	{font-family:"\@宋体";
	panose-1:2 1 6 0 3 1 1 1 1 1;}
@font-face
	{font-family:微软雅黑;
	panose-1:2 11 5 3 2 2 4 2 2 4;}
@font-face
	{font-family:"\@微软雅黑";
	panose-1:2 11 5 3 2 2 4 2 2 4;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
	{margin:0in;
	margin-bottom:.0001pt;
	font-size:11.0pt;
	font-family:"Calibri",sans-serif;
	color:black;}
a:link, span.MsoHyperlink
	{mso-style-priority:99;
	color:blue;
	text-decoration:underline;}
a:visited, span.MsoHyperlinkFollowed
	{mso-style-priority:99;
	color:purple;
	text-decoration:underline;}
p.msonormal0, li.msonormal0, div.msonormal0
	{mso-style-name:msonormal;
	mso-margin-top-alt:auto;
	margin-right:0in;
	mso-margin-bottom-alt:auto;
	margin-left:0in;
	font-size:11.0pt;
	font-family:"Calibri",sans-serif;
	color:black;}
p.emailquote, li.emailquote, div.emailquote
	{mso-style-name:emailquote;
	mso-margin-top-alt:auto;
	margin-right:0in;
	mso-margin-bottom-alt:auto;
	margin-left:1.0pt;
	border:none;
	padding:0in;
	font-size:11.0pt;
	font-family:"Calibri",sans-serif;
	color:black;}
span.EmailStyle20
	{mso-style-type:personal-reply;
	font-family:"Calibri",sans-serif;
	color:windowtext;}
.MsoChpDefault
	{mso-style-type:export-only;
	font-size:10.0pt;}
@page WordSection1
	{size:8.5in 11.0in;
	margin:1.0in 1.25in 1.0in 1.25in;}
div.WordSection1
	{page:WordSection1;}
--></style><!--[if gte mso 9]><xml>
<o:shapedefaults v:ext="edit" spidmax="1026" />
</xml><![endif]--><!--[if gte mso 9]><xml>
<o:shapelayout v:ext="edit">
<o:idmap v:ext="edit" data="1" />
</o:shapelayout></xml><![endif]-->
      <div class="WordSection1">
        <p class="MsoNormal">&gt;&gt; -           
          drm_sched_stop(&amp;ring-&gt;sched, &amp;job-&gt;base);<br>
          &gt;&gt; -<br>
          &gt;&gt;               /* after all hw jobs are reset, hw
          fence is meaningless, so force_completion */<br>
          &gt;&gt;              
          amdgpu_fence_driver_force_completion(ring);<br>
          &gt;&gt;       }<o:p></o:p></p>
        <p class="MsoNormal"><o:p> </o:p></p>
        <p class="MsoNormal">HW fence are already forced completion,
          then we can just disable irq fence process and ignore hw fence
          signal when we are trying to do GPU reset, I think. Otherwise
          it will make the logic much more complex.<o:p></o:p></p>
        <p class="MsoNormal"><span style="color:windowtext">If this
            situation happens because of long time execution, we can
            increase timeout of reset detection.</span></p>
      </div>
    </blockquote>
    <br>
    You are not thinking widely enough, forcing the hw fence to complete
    can trigger others to start other activity in the system.<br>
    <br>
    We first need to stop everything and make sure that we don't do any
    processing any more and then start with our reset procedure
    including forcing all hw fences to complete.<br>
    <br>
    Christian.<br>
    <br>
    <blockquote type="cite"
cite="mid:MN2PR12MB2910C37264B8F30BAD26FA4CB43C0@MN2PR12MB2910.namprd12.prod.outlook.com">
      <div class="WordSection1">
        <p class="MsoNormal"><span style="color:windowtext"><o:p></o:p></span></p>
        <p class="MsoNormal"><span style="color:windowtext"><o:p> </o:p></span></p>
        <p class="MsoNormal"><span style="color:windowtext">-David<o:p></o:p></span></p>
        <p class="MsoNormal"><span style="color:windowtext"><o:p> </o:p></span></p>
        <div style="border:none;border-left:solid blue 1.5pt;padding:0in
          0in 0in 4.0pt">
          <div>
            <div style="border:none;border-top:solid #E1E1E1
              1.0pt;padding:3.0pt 0in 0in 0in">
              <p class="MsoNormal"><b><span style="color:windowtext">From:</span></b><span
                  style="color:windowtext"> amd-gfx
                  <a class="moz-txt-link-rfc2396E" href="mailto:amd-gfx-bounces@lists.freedesktop.org">&lt;amd-gfx-bounces@lists.freedesktop.org&gt;</a>
                  <b>On Behalf Of </b>Grodzovsky, Andrey<br>
                  <b>Sent:</b> Wednesday, April 24, 2019 12:00 AM<br>
                  <b>To:</b> Zhou, David(ChunMing)
                  <a class="moz-txt-link-rfc2396E" href="mailto:David1.Zhou@amd.com">&lt;David1.Zhou@amd.com&gt;</a>;
                  <a class="moz-txt-link-abbreviated" href="mailto:dri-devel@lists.freedesktop.org">dri-devel@lists.freedesktop.org</a>;
                  <a class="moz-txt-link-abbreviated" href="mailto:amd-gfx@lists.freedesktop.org">amd-gfx@lists.freedesktop.org</a>; <a class="moz-txt-link-abbreviated" href="mailto:eric@anholt.net">eric@anholt.net</a>;
                  <a class="moz-txt-link-abbreviated" href="mailto:etnaviv@lists.freedesktop.org">etnaviv@lists.freedesktop.org</a>;
                  <a class="moz-txt-link-abbreviated" href="mailto:ckoenig.leichtzumerken@gmail.com">ckoenig.leichtzumerken@gmail.com</a><br>
                  <b>Cc:</b> Kazlauskas, Nicholas
                  <a class="moz-txt-link-rfc2396E" href="mailto:Nicholas.Kazlauskas@amd.com">&lt;Nicholas.Kazlauskas@amd.com&gt;</a>; Liu, Monk
                  <a class="moz-txt-link-rfc2396E" href="mailto:Monk.Liu@amd.com">&lt;Monk.Liu@amd.com&gt;</a><br>
                  <b>Subject:</b> Re: [PATCH v5 6/6] drm/amdgpu: Avoid
                  HW reset if guilty job already signaled.<o:p></o:p></span></p>
            </div>
          </div>
          <p class="MsoNormal"><o:p> </o:p></p>
          <p>No, i mean the actual HW fence which signals when the job
            finished execution on the HW.<o:p></o:p></p>
          <p>Andrey<o:p></o:p></p>
          <div>
            <p class="MsoNormal">On 4/23/19 11:19 AM, Zhou,
              David(ChunMing) wrote:<o:p></o:p></p>
          </div>
          <blockquote style="margin-top:5.0pt;margin-bottom:5.0pt">
            <div>
              <p class="MsoNormal" style="margin-bottom:12.0pt">do you
                mean fence timer? why not stop it as well when stopping
                sched for the reason of hw reset?<br>
                <br>
                -------- Original Message --------<br>
                Subject: Re: [PATCH v5 6/6] drm/amdgpu: Avoid HW reset
                if guilty job already signaled.<br>
                From: "Grodzovsky, Andrey" <br>
                To: "Zhou, David(ChunMing)" ,<a
href="mailto:dri-devel@lists.freedesktop.org,amd-gfx@lists.freedesktop.org,eric@anholt.net,etnaviv@lists.freedesktop.org,ckoenig.leichtzumerken@gmail.com"
                  moz-do-not-send="true">dri-devel@lists.freedesktop.org,amd-gfx@lists.freedesktop.org,eric@anholt.net,etnaviv@lists.freedesktop.org,ckoenig.leichtzumerken@gmail.com</a><br>
                CC: "Kazlauskas, Nicholas" ,"Liu, Monk" <o:p></o:p></p>
            </div>
            <div>
              <p class="MsoNormal"><br>
                On 4/22/19 9:09 AM, Zhou, David(ChunMing) wrote:<br>
                &gt; +Monk.<br>
                &gt;<br>
                &gt; GPU reset is used widely in SRIOV, so need
                virtulizatino guy take a look.<br>
                &gt;<br>
                &gt; But out of curious, why guilty job can signal more
                if the job is already<br>
                &gt; set to guilty? set it wrongly?<br>
                &gt;<br>
                &gt;<br>
                &gt; -David<br>
                <br>
                <br>
                It's possible that the job does completes at a later
                time then it's <br>
                timeout handler started processing so in this patch we
                try to protect <br>
                against this by rechecking the HW fence after stopping
                all SW <br>
                schedulers. We do it BEFORE marking guilty on the job's
                sched_entity so <br>
                at the point we check the guilty flag is not set yet.<br>
                <br>
                Andrey<br>
                <br>
                <br>
                &gt;<br>
                &gt; <span
                  style="font-family:&quot;微软雅黑&quot;,sans-serif"
                  lang="ZH-CN">在</span> 2019/4/18 23:00, Andrey
                Grodzovsky
                <span style="font-family:&quot;微软雅黑&quot;,sans-serif"
                  lang="ZH-CN">写道</span>:<br>
                &gt;&gt; Also reject TDRs if another one already
                running.<br>
                &gt;&gt;<br>
                &gt;&gt; v2:<br>
                &gt;&gt; Stop all schedulers across device and entire
                XGMI hive before<br>
                &gt;&gt; force signaling HW fences.<br>
                &gt;&gt; Avoid passing job_signaled to helper functions
                to keep all the decision<br>
                &gt;&gt; making about skipping HW reset in one place.<br>
                &gt;&gt;<br>
                &gt;&gt; v3:<br>
                &gt;&gt; Fix SW sched. hang after non HW reset.
                sched.hw_rq_count has to be balanced<br>
                &gt;&gt; against it's decrement in drm_sched_stop in non
                HW reset case.<br>
                &gt;&gt; v4: rebase<br>
                &gt;&gt; v5: Revert v3 as we do it now in scheduler code.<br>
                &gt;&gt;<br>
                &gt;&gt; Signed-off-by: Andrey Grodzovsky <a
                  href="mailto:andrey.grodzovsky@amd.com"
                  moz-do-not-send="true">&lt;andrey.grodzovsky@amd.com&gt;</a><br>
                &gt;&gt; ---<br>
                &gt;&gt;    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |
                143 +++++++++++++++++++----------<br>
                &gt;&gt;    1 file changed, 95 insertions(+), 48
                deletions(-)<br>
                &gt;&gt;<br>
                &gt;&gt; diff --git
                a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
                b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
                &gt;&gt; index a0e165c..85f8792 100644<br>
                &gt;&gt; ---
                a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
                &gt;&gt; +++
                b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
                &gt;&gt; @@ -3334,8 +3334,6 @@ static int
                amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,<br>
                &gt;&gt;               if (!ring ||
                !ring-&gt;sched.thread)<br>
                &gt;&gt;                       continue;<br>
                &gt;&gt;    <br>
                &gt;&gt; -           
                drm_sched_stop(&amp;ring-&gt;sched, &amp;job-&gt;base);<br>
                &gt;&gt; -<br>
                &gt;&gt;               /* after all hw jobs are reset,
                hw fence is meaningless, so force_completion */<br>
                &gt;&gt;              
                amdgpu_fence_driver_force_completion(ring);<br>
                &gt;&gt;       }<br>
                &gt;&gt; @@ -3343,6 +3341,7 @@ static int
                amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,<br>
                &gt;&gt;       if(job)<br>
                &gt;&gt;              
                drm_sched_increase_karma(&amp;job-&gt;base);<br>
                &gt;&gt;    <br>
                &gt;&gt; +    /* Don't suspend on bare metal if we are
                not going to HW reset the ASIC */<br>
                &gt;&gt;       if (!amdgpu_sriov_vf(adev)) {<br>
                &gt;&gt;    <br>
                &gt;&gt;               if (!need_full_reset)<br>
                &gt;&gt; @@ -3480,37 +3479,21 @@ static int
                amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,<br>
                &gt;&gt;       return r;<br>
                &gt;&gt;    }<br>
                &gt;&gt;    <br>
                &gt;&gt; -static void
                amdgpu_device_post_asic_reset(struct amdgpu_device
                *adev)<br>
                &gt;&gt; +static bool amdgpu_device_lock_adev(struct
                amdgpu_device *adev, bool trylock)<br>
                &gt;&gt;    {<br>
                &gt;&gt; -    int i;<br>
                &gt;&gt; -<br>
                &gt;&gt; -    for (i = 0; i &lt; AMDGPU_MAX_RINGS; ++i)
                {<br>
                &gt;&gt; -            struct amdgpu_ring *ring =
                adev-&gt;rings[i];<br>
                &gt;&gt; -<br>
                &gt;&gt; -            if (!ring ||
                !ring-&gt;sched.thread)<br>
                &gt;&gt; -                    continue;<br>
                &gt;&gt; -<br>
                &gt;&gt; -            if (!adev-&gt;asic_reset_res)<br>
                &gt;&gt; -                   
                drm_sched_resubmit_jobs(&amp;ring-&gt;sched);<br>
                &gt;&gt; +    if (trylock) {<br>
                &gt;&gt; +            if
                (!mutex_trylock(&amp;adev-&gt;lock_reset))<br>
                &gt;&gt; +                    return false;<br>
                &gt;&gt; +    } else<br>
                &gt;&gt; +           
                mutex_lock(&amp;adev-&gt;lock_reset);<br>
                &gt;&gt;    <br>
                &gt;&gt; -           
                drm_sched_start(&amp;ring-&gt;sched,
                !adev-&gt;asic_reset_res);<br>
                &gt;&gt; -    }<br>
                &gt;&gt; -<br>
                &gt;&gt; -    if (!amdgpu_device_has_dc_support(adev)) {<br>
                &gt;&gt; -           
                drm_helper_resume_force_mode(adev-&gt;ddev);<br>
                &gt;&gt; -    }<br>
                &gt;&gt; -<br>
                &gt;&gt; -    adev-&gt;asic_reset_res = 0;<br>
                &gt;&gt; -}<br>
                &gt;&gt; -<br>
                &gt;&gt; -static void amdgpu_device_lock_adev(struct
                amdgpu_device *adev)<br>
                &gt;&gt; -{<br>
                &gt;&gt; -    mutex_lock(&amp;adev-&gt;lock_reset);<br>
                &gt;&gt;      
                atomic_inc(&amp;adev-&gt;gpu_reset_counter);<br>
                &gt;&gt;       adev-&gt;in_gpu_reset = 1;<br>
                &gt;&gt;       /* Block kfd: SRIOV would do it
                separately */<br>
                &gt;&gt;       if (!amdgpu_sriov_vf(adev))<br>
                &gt;&gt;                   
                amdgpu_amdkfd_pre_reset(adev);<br>
                &gt;&gt; +<br>
                &gt;&gt; +    return true;<br>
                &gt;&gt;    }<br>
                &gt;&gt;    <br>
                &gt;&gt;    static void amdgpu_device_unlock_adev(struct
                amdgpu_device *adev)<br>
                &gt;&gt; @@ -3538,40 +3521,42 @@ static void
                amdgpu_device_unlock_adev(struct amdgpu_device *adev)<br>
                &gt;&gt;    int amdgpu_device_gpu_recover(struct
                amdgpu_device *adev,<br>
                &gt;&gt;                             struct amdgpu_job
                *job)<br>
                &gt;&gt;    {<br>
                &gt;&gt; -    int r;<br>
                &gt;&gt; +    struct list_head device_list,
                *device_list_handle =  NULL;<br>
                &gt;&gt; +    bool need_full_reset, job_signaled;<br>
                &gt;&gt;       struct amdgpu_hive_info *hive = NULL;<br>
                &gt;&gt; -    bool need_full_reset = false;<br>
                &gt;&gt;       struct amdgpu_device *tmp_adev = NULL;<br>
                &gt;&gt; -    struct list_head device_list,
                *device_list_handle =  NULL;<br>
                &gt;&gt; +    int i, r = 0;<br>
                &gt;&gt;    <br>
                &gt;&gt; +    need_full_reset = job_signaled = false;<br>
                &gt;&gt;       INIT_LIST_HEAD(&amp;device_list);<br>
                &gt;&gt;    <br>
                &gt;&gt;       dev_info(adev-&gt;dev, "GPU reset
                begin!\n");<br>
                &gt;&gt;    <br>
                &gt;&gt; +    hive = amdgpu_get_xgmi_hive(adev, false);<br>
                &gt;&gt; +<br>
                &gt;&gt;       /*<br>
                &gt;&gt; -     * In case of XGMI hive disallow
                concurrent resets to be triggered<br>
                &gt;&gt; -     * by different nodes. No point also since
                the one node already executing<br>
                &gt;&gt; -     * reset will also reset all the other
                nodes in the hive.<br>
                &gt;&gt; +     * Here we trylock to avoid chain of
                resets executing from<br>
                &gt;&gt; +     * either trigger by jobs on different
                adevs in XGMI hive or jobs on<br>
                &gt;&gt; +     * different schedulers for same device
                while this TO handler is running.<br>
                &gt;&gt; +     * We always reset all schedulers for
                device and all devices for XGMI<br>
                &gt;&gt; +     * hive so that should take care of them
                too.<br>
                &gt;&gt;        */<br>
                &gt;&gt; -    hive = amdgpu_get_xgmi_hive(adev, 0);<br>
                &gt;&gt; -    if (hive &amp;&amp;
                adev-&gt;gmc.xgmi.num_physical_nodes &gt; 1 &amp;&amp;<br>
                &gt;&gt; -       
                !mutex_trylock(&amp;hive-&gt;reset_lock))<br>
                &gt;&gt; +<br>
                &gt;&gt; +    if (hive &amp;&amp;
                !mutex_trylock(&amp;hive-&gt;reset_lock)) {<br>
                &gt;&gt; +            DRM_INFO("Bailing on TDR for
                s_job:%llx, hive: %llx as another already in progress",<br>
                &gt;&gt; +                     job-&gt;base.id,
                hive-&gt;hive_id);<br>
                &gt;&gt;               return 0;<br>
                &gt;&gt; +    }<br>
                &gt;&gt;    <br>
                &gt;&gt;       /* Start with adev pre asic reset first
                for soft reset check.*/<br>
                &gt;&gt; -    amdgpu_device_lock_adev(adev);<br>
                &gt;&gt; -    r = amdgpu_device_pre_asic_reset(adev,<br>
                &gt;&gt; -                                     job,<br>
                &gt;&gt; -                                    
                &amp;need_full_reset);<br>
                &gt;&gt; -    if (r) {<br>
                &gt;&gt; -            /*TODO Should we stop ?*/<br>
                &gt;&gt; -            DRM_ERROR("GPU pre asic reset
                failed with err, %d for drm dev, %s ",<br>
                &gt;&gt; -                      r,
                adev-&gt;ddev-&gt;unique);<br>
                &gt;&gt; -            adev-&gt;asic_reset_res = r;<br>
                &gt;&gt; +    if (!amdgpu_device_lock_adev(adev, !hive))
                {<br>
                &gt;&gt; +            DRM_INFO("Bailing on TDR for
                s_job:%llx, as another already in progress",<br>
                &gt;&gt; +                                    
                job-&gt;base.id);<br>
                &gt;&gt; +            return 0;<br>
                &gt;&gt;       }<br>
                &gt;&gt;    <br>
                &gt;&gt;       /* Build list of devices to reset */<br>
                &gt;&gt; -    if  (need_full_reset &amp;&amp;
                adev-&gt;gmc.xgmi.num_physical_nodes &gt; 1) {<br>
                &gt;&gt; +    if  (adev-&gt;gmc.xgmi.num_physical_nodes
                &gt; 1) {<br>
                &gt;&gt;               if (!hive) {<br>
                &gt;&gt;                      
                amdgpu_device_unlock_adev(adev);<br>
                &gt;&gt;                       return -ENODEV;<br>
                &gt;&gt; @@ -3588,13 +3573,56 @@ int
                amdgpu_device_gpu_recover(struct amdgpu_device *adev,<br>
                &gt;&gt;               device_list_handle =
                &amp;device_list;<br>
                &gt;&gt;       }<br>
                &gt;&gt;    <br>
                &gt;&gt; +    /* block all schedulers and reset given
                job's ring */<br>
                &gt;&gt; +    list_for_each_entry(tmp_adev,
                device_list_handle, gmc.xgmi.head) {<br>
                &gt;&gt; +            for (i = 0; i &lt;
                AMDGPU_MAX_RINGS; ++i) {<br>
                &gt;&gt; +                    struct amdgpu_ring *ring =
                tmp_adev-&gt;rings[i];<br>
                &gt;&gt; +<br>
                &gt;&gt; +                    if (!ring ||
                !ring-&gt;sched.thread)<br>
                &gt;&gt; +                            continue;<br>
                &gt;&gt; +<br>
                &gt;&gt; +                   
                drm_sched_stop(&amp;ring-&gt;sched, &amp;job-&gt;base);<br>
                &gt;&gt; +            }<br>
                &gt;&gt; +    }<br>
                &gt;&gt; +<br>
                &gt;&gt; +<br>
                &gt;&gt; +    /*<br>
                &gt;&gt; +     * Must check guilty signal here since
                after this point all old<br>
                &gt;&gt; +     * HW fences are force signaled.<br>
                &gt;&gt; +     *<br>
                &gt;&gt; +     * job-&gt;base holds a reference to
                parent fence<br>
                &gt;&gt; +     */<br>
                &gt;&gt; +    if (job &amp;&amp;
                job-&gt;base.s_fence-&gt;parent &amp;&amp;<br>
                &gt;&gt; +       
                dma_fence_is_signaled(job-&gt;base.s_fence-&gt;parent))<br>
                &gt;&gt; +            job_signaled = true;<br>
                &gt;&gt; +<br>
                &gt;&gt; +    if
                (!amdgpu_device_ip_need_full_reset(adev))<br>
                &gt;&gt; +            device_list_handle =
                &amp;device_list;<br>
                &gt;&gt; +<br>
                &gt;&gt; +    if (job_signaled) {<br>
                &gt;&gt; +            dev_info(adev-&gt;dev, "Guilty job
                already signaled, skipping HW reset");<br>
                &gt;&gt; +            goto skip_hw_reset;<br>
                &gt;&gt; +    }<br>
                &gt;&gt; +<br>
                &gt;&gt; +<br>
                &gt;&gt; +    /* Guilty job will be freed after this*/<br>
                &gt;&gt; +    r = amdgpu_device_pre_asic_reset(adev,<br>
                &gt;&gt; +                                     job,<br>
                &gt;&gt; +                                    
                &amp;need_full_reset);<br>
                &gt;&gt; +    if (r) {<br>
                &gt;&gt; +            /*TODO Should we stop ?*/<br>
                &gt;&gt; +            DRM_ERROR("GPU pre asic reset
                failed with err, %d for drm dev, %s ",<br>
                &gt;&gt; +                      r,
                adev-&gt;ddev-&gt;unique);<br>
                &gt;&gt; +            adev-&gt;asic_reset_res = r;<br>
                &gt;&gt; +    }<br>
                &gt;&gt; +<br>
                &gt;&gt;    retry:    /* Rest of adevs pre asic reset
                from XGMI hive. */<br>
                &gt;&gt;       list_for_each_entry(tmp_adev,
                device_list_handle, gmc.xgmi.head) {<br>
                &gt;&gt;    <br>
                &gt;&gt;               if (tmp_adev == adev)<br>
                &gt;&gt;                       continue;<br>
                &gt;&gt;    <br>
                &gt;&gt; -            amdgpu_device_lock_adev(tmp_adev);<br>
                &gt;&gt; +            amdgpu_device_lock_adev(tmp_adev,
                false);<br>
                &gt;&gt;               r =
                amdgpu_device_pre_asic_reset(tmp_adev,<br>
                &gt;&gt;                                               
                NULL,<br>
                &gt;&gt;                                               
                &amp;need_full_reset);<br>
                &gt;&gt; @@ -3618,9 +3646,28 @@ int
                amdgpu_device_gpu_recover(struct amdgpu_device *adev,<br>
                &gt;&gt;                       goto retry;<br>
                &gt;&gt;       }<br>
                &gt;&gt;    <br>
                &gt;&gt; +skip_hw_reset:<br>
                &gt;&gt; +<br>
                &gt;&gt;       /* Post ASIC reset for all devs .*/<br>
                &gt;&gt;       list_for_each_entry(tmp_adev,
                device_list_handle, gmc.xgmi.head) {<br>
                &gt;&gt; -           
                amdgpu_device_post_asic_reset(tmp_adev);<br>
                &gt;&gt; +            for (i = 0; i &lt;
                AMDGPU_MAX_RINGS; ++i) {<br>
                &gt;&gt; +                    struct amdgpu_ring *ring =
                tmp_adev-&gt;rings[i];<br>
                &gt;&gt; +<br>
                &gt;&gt; +                    if (!ring ||
                !ring-&gt;sched.thread)<br>
                &gt;&gt; +                            continue;<br>
                &gt;&gt; +<br>
                &gt;&gt; +                    /* No point to resubmit
                jobs if we didn't HW reset*/<br>
                &gt;&gt; +                    if
                (!tmp_adev-&gt;asic_reset_res &amp;&amp; !job_signaled)<br>
                &gt;&gt; +                           
                drm_sched_resubmit_jobs(&amp;ring-&gt;sched);<br>
                &gt;&gt; +<br>
                &gt;&gt; +                   
                drm_sched_start(&amp;ring-&gt;sched,
                !tmp_adev-&gt;asic_reset_res);<br>
                &gt;&gt; +            }<br>
                &gt;&gt; +<br>
                &gt;&gt; +            if
                (!amdgpu_device_has_dc_support(tmp_adev) &amp;&amp;
                !job_signaled) {<br>
                &gt;&gt; +                   
                drm_helper_resume_force_mode(tmp_adev-&gt;ddev);<br>
                &gt;&gt; +            }<br>
                &gt;&gt; +<br>
                &gt;&gt; +            tmp_adev-&gt;asic_reset_res = 0;<br>
                &gt;&gt;    <br>
                &gt;&gt;               if (r) {<br>
                &gt;&gt;                       /* bad news, how to tell
                it to userspace ? */<br>
                &gt;&gt; @@ -3633,7 +3680,7 @@ int
                amdgpu_device_gpu_recover(struct amdgpu_device *adev,<br>
                &gt;&gt;              
                amdgpu_device_unlock_adev(tmp_adev);<br>
                &gt;&gt;       }<br>
                &gt;&gt;    <br>
                &gt;&gt; -    if (hive &amp;&amp;
                adev-&gt;gmc.xgmi.num_physical_nodes &gt; 1)<br>
                &gt;&gt; +    if (hive)<br>
                &gt;&gt;              
                mutex_unlock(&amp;hive-&gt;reset_lock);<br>
                &gt;&gt;    <br>
                &gt;&gt;       if (r)<br>
                _______________________________________________<br>
                amd-gfx mailing list<br>
                <a href="mailto:amd-gfx@lists.freedesktop.org"
                  moz-do-not-send="true">amd-gfx@lists.freedesktop.org</a><br>
                <a
                  href="https://lists.freedesktop.org/mailman/listinfo/amd-gfx"
                  moz-do-not-send="true">https://lists.freedesktop.org/mailman/listinfo/amd-gfx</a><o:p></o:p></p>
            </div>
          </blockquote>
        </div>
      </div>
    </blockquote>
    <br>
  </body>
</html>
Andrey Grodzovsky April 26, 2019, 2:08 p.m. UTC | #13
Ping (mostly David and Monk).

Andrey

On 4/24/19 3:09 AM, Christian König wrote:
Am 24.04.19 um 05:02 schrieb Zhou, David(ChunMing):
>> -            drm_sched_stop(&ring->sched, &job->base);
>> -
>>               /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>>               amdgpu_fence_driver_force_completion(ring);
>>       }

HW fence are already forced completion, then we can just disable irq fence process and ignore hw fence signal when we are trying to do GPU reset, I think. Otherwise which will make the logic much more complex.
If this situation happens because of long time execution, we can increase timeout of reset detection.

You are not thinking widely enough, forcing the hw fence to complete can trigger other to start other activity in the system.

We first need to stop everything and make sure that we don't do any processing any more and then start with our reset procedure including forcing all hw fences to complete.

Christian.


-David

From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org><mailto:amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Grodzovsky, Andrey
Sent: Wednesday, April 24, 2019 12:00 AM
To: Zhou, David(ChunMing) <David1.Zhou@amd.com><mailto:David1.Zhou@amd.com>; dri-devel@lists.freedesktop.org<mailto:dri-devel@lists.freedesktop.org>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>; eric@anholt.net<mailto:eric@anholt.net>; etnaviv@lists.freedesktop.org<mailto:etnaviv@lists.freedesktop.org>; ckoenig.leichtzumerken@gmail.com<mailto:ckoenig.leichtzumerken@gmail.com>
Cc: Kazlauskas, Nicholas <Nicholas.Kazlauskas@amd.com><mailto:Nicholas.Kazlauskas@amd.com>; Liu, Monk <Monk.Liu@amd.com><mailto:Monk.Liu@amd.com>
Subject: Re: [PATCH v5 6/6] drm/amdgpu: Avoid HW reset if guilty job already signaled.


No, i mean the actual HW fence which signals when the job finished execution on the HW.

Andrey
On 4/23/19 11:19 AM, Zhou, David(ChunMing) wrote:
do you mean fence timer? why not stop it as well when stopping sched for the reason of hw reset?

-------- Original Message --------
Subject: Re: [PATCH v5 6/6] drm/amdgpu: Avoid HW reset if guilty job already signaled.
From: "Grodzovsky, Andrey"
To: "Zhou, David(ChunMing)" ,dri-devel@lists.freedesktop.org,amd-gfx@lists.freedesktop.org,eric@anholt.net,etnaviv@lists.freedesktop.org,ckoenig.leichtzumerken@gmail.com<mailto:dri-devel@lists.freedesktop.org,amd-gfx@lists.freedesktop.org,eric@anholt.net,etnaviv@lists.freedesktop.org,ckoenig.leichtzumerken@gmail.com>
CC: "Kazlauskas, Nicholas" ,"Liu, Monk"

On 4/22/19 9:09 AM, Zhou, David(ChunMing) wrote:
> +Monk.
>
> GPU reset is used widely in SRIOV, so need virtulizatino guy take a look.
>
> But out of curious, why guilty job can signal more if the job is already
> set to guilty? set it wrongly?
>
>
> -David


It's possible that the job does completes at a later time then it's
timeout handler started processing so in this patch we try to protect
against this by rechecking the HW fence after stopping all SW
schedulers. We do it BEFORE marking guilty on the job's sched_entity so
at the point we check the guilty flag is not set yet.

Andrey


>
> 在 2019/4/18 23:00, Andrey Grodzovsky 写道:
>> Also reject TDRs if another one already running.
>>
>> v2:
>> Stop all schedulers across device and entire XGMI hive before
>> force signaling HW fences.
>> Avoid passing job_signaled to helper fnctions to keep all the decision
>> making about skipping HW reset in one place.
>>
>> v3:
>> Fix SW sched. hang after non HW reset. sched.hw_rq_count has to be balanced
>> against it's decrement in drm_sched_stop in non HW reset case.
>> v4: rebase
>> v5: Revert v3 as we do it now in sceduler code.
>>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com><mailto:andrey.grodzovsky@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 143 +++++++++++++++++++----------
>>    1 file changed, 95 insertions(+), 48 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index a0e165c..85f8792 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3334,8 +3334,6 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>               if (!ring || !ring->sched.thread)
>>                       continue;
>>
>> -            drm_sched_stop(&ring->sched, &job->base);
>> -
>>               /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>>               amdgpu_fence_driver_force_completion(ring);
>>       }
>> @@ -3343,6 +3341,7 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>       if(job)
>>               drm_sched_increase_karma(&job->base);
>>
>> +    /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
>>       if (!amdgpu_sriov_vf(adev)) {
>>
>>               if (!need_full_reset)
>> @@ -3480,37 +3479,21 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>       return r;
>>    }
>>
>> -static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
>> +static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
>>    {
>> -    int i;
>> -
>> -    for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> -            struct amdgpu_ring *ring = adev->rings[i];
>> -
>> -            if (!ring || !ring->sched.thread)
>> -                    continue;
>> -
>> -            if (!adev->asic_reset_res)
>> -                    drm_sched_resubmit_jobs(&ring->sched);
>> +    if (trylock) {
>> +            if (!mutex_trylock(&adev->lock_reset))
>> +                    return false;
>> +    } else
>> +            mutex_lock(&adev->lock_reset);
>>
>> -            drm_sched_start(&ring->sched, !adev->asic_reset_res);
>> -    }
>> -
>> -    if (!amdgpu_device_has_dc_support(adev)) {
>> -            drm_helper_resume_force_mode(adev->ddev);
>> -    }
>> -
>> -    adev->asic_reset_res = 0;
>> -}
>> -
>> -static void amdgpu_device_lock_adev(struct amdgpu_device *adev)
>> -{
>> -    mutex_lock(&adev->lock_reset);
>>       atomic_inc(&adev->gpu_reset_counter);
>>       adev->in_gpu_reset = 1;
>>       /* Block kfd: SRIOV would do it separately */
>>       if (!amdgpu_sriov_vf(adev))
>>                    amdgpu_amdkfd_pre_reset(adev);
>> +
>> +    return true;
>>    }
>>
>>    static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>> @@ -3538,40 +3521,42 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>>    int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>                             struct amdgpu_job *job)
>>    {
>> -    int r;
>> +    struct list_head device_list, *device_list_handle =  NULL;
>> +    bool need_full_reset, job_signaled;
>>       struct amdgpu_hive_info *hive = NULL;
>> -    bool need_full_reset = false;
>>       struct amdgpu_device *tmp_adev = NULL;
>> -    struct list_head device_list, *device_list_handle =  NULL;
>> +    int i, r = 0;
>>
>> +    need_full_reset = job_signaled = false;
>>       INIT_LIST_HEAD(&device_list);
>>
>>       dev_info(adev->dev, "GPU reset begin!\n");
>>
>> +    hive = amdgpu_get_xgmi_hive(adev, false);
>> +
>>       /*
>> -     * In case of XGMI hive disallow concurrent resets to be triggered
>> -     * by different nodes. No point also since the one node already executing
>> -     * reset will also reset all the other nodes in the hive.
>> +     * Here we trylock to avoid chain of resets executing from
>> +     * either trigger by jobs on different adevs in XGMI hive or jobs on
>> +     * different schedulers for same device while this TO handler is running.
>> +     * We always reset all schedulers for device and all devices for XGMI
>> +     * hive so that should take care of them too.
>>        */
>> -    hive = amdgpu_get_xgmi_hive(adev, 0);
>> -    if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
>> -        !mutex_trylock(&hive->reset_lock))
>> +
>> +    if (hive && !mutex_trylock(&hive->reset_lock)) {
>> +            DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
>> +                     job->base.id, hive->hive_id);
>>               return 0;
>> +    }
>>
>>       /* Start with adev pre asic reset first for soft reset check.*/
>> -    amdgpu_device_lock_adev(adev);
>> -    r = amdgpu_device_pre_asic_reset(adev,
>> -                                     job,
>> -                                     &need_full_reset);
>> -    if (r) {
>> -            /*TODO Should we stop ?*/
>> -            DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
>> -                      r, adev->ddev->unique);
>> -            adev->asic_reset_res = r;
>> +    if (!amdgpu_device_lock_adev(adev, !hive)) {
>> +            DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
>> +                                     job->base.id);
>> +            return 0;
>>       }
>>
>>       /* Build list of devices to reset */
>> -    if  (need_full_reset && adev->gmc.xgmi.num_physical_nodes > 1) {
>> +    if  (adev->gmc.xgmi.num_physical_nodes > 1) {
>>               if (!hive) {
>>                       amdgpu_device_unlock_adev(adev);
>>                       return -ENODEV;
>> @@ -3588,13 +3573,56 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>               device_list_handle = &device_list;
>>       }
>>
>> +    /* block all schedulers and reset given job's ring */
>> +    list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>> +            for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> +                    struct amdgpu_ring *ring = tmp_adev->rings[i];
>> +
>> +                    if (!ring || !ring->sched.thread)
>> +                            continue;
>> +
>> +                    drm_sched_stop(&ring->sched, &job->base);
>> +            }
>> +    }
>> +
>> +
>> +    /*
>> +     * Must check guilty signal here since after this point all old
>> +     * HW fences are force signaled.
>> +     *
>> +     * job->base holds a reference to parent fence
>> +     */
>> +    if (job && job->base.s_fence->parent &&
>> +        dma_fence_is_signaled(job->base.s_fence->parent))
>> +            job_signaled = true;
>> +
>> +    if (!amdgpu_device_ip_need_full_reset(adev))
>> +            device_list_handle = &device_list;
>> +
>> +    if (job_signaled) {
>> +            dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
>> +            goto skip_hw_reset;
>> +    }
>> +
>> +
>> +    /* Guilty job will be freed after this*/
>> +    r = amdgpu_device_pre_asic_reset(adev,
>> +                                     job,
>> +                                     &need_full_reset);
>> +    if (r) {
>> +            /*TODO Should we stop ?*/
>> +            DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
>> +                      r, adev->ddev->unique);
>> +            adev->asic_reset_res = r;
>> +    }
>> +
>>    retry:    /* Rest of adevs pre asic reset from XGMI hive. */
>>       list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>
>>               if (tmp_adev == adev)
>>                       continue;
>>
>> -            amdgpu_device_lock_adev(tmp_adev);
>> +            amdgpu_device_lock_adev(tmp_adev, false);
>>               r = amdgpu_device_pre_asic_reset(tmp_adev,
>>                                                NULL,
>>                                                &need_full_reset);
>> @@ -3618,9 +3646,28 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>                       goto retry;
>>       }
>>
>> +skip_hw_reset:
>> +
>>       /* Post ASIC reset for all devs .*/
>>       list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>> -            amdgpu_device_post_asic_reset(tmp_adev);
>> +            for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> +                    struct amdgpu_ring *ring = tmp_adev->rings[i];
>> +
>> +                    if (!ring || !ring->sched.thread)
>> +                            continue;
>> +
>> +                    /* No point to resubmit jobs if we didn't HW reset*/
>> +                    if (!tmp_adev->asic_reset_res && !job_signaled)
>> +                            drm_sched_resubmit_jobs(&ring->sched);
>> +
>> +                    drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>> +            }
>> +
>> +            if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
>> +                    drm_helper_resume_force_mode(tmp_adev->ddev);
>> +            }
>> +
>> +            tmp_adev->asic_reset_res = 0;
>>
>>               if (r) {
>>                       /* bad news, how to tell it to userspace ? */
>> @@ -3633,7 +3680,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>               amdgpu_device_unlock_adev(tmp_adev);
>>       }
>>
>> -    if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
>> +    if (hive)
>>               mutex_unlock(&hive->reset_lock);
>>
>>       if (r)
Chunming Zhou April 28, 2019, 2:56 a.m. UTC | #14
Sorry, I only can put my Acked-by: Chunming Zhou <david1.zhou@amd.com> on patch#3.

I cannot fully judge patch #4, #5, #6.

-David

From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Grodzovsky, Andrey
Sent: Friday, April 26, 2019 10:09 PM
To: Koenig, Christian <Christian.Koenig@amd.com>; Zhou, David(ChunMing) <David1.Zhou@amd.com>; dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org; eric@anholt.net; etnaviv@lists.freedesktop.org
Cc: Kazlauskas, Nicholas <Nicholas.Kazlauskas@amd.com>; Liu, Monk <Monk.Liu@amd.com>
Subject: Re: [PATCH v5 6/6] drm/amdgpu: Avoid HW reset if guilty job already signaled.


Ping (mostly David and Monk).

Andrey
On 4/24/19 3:09 AM, Christian König wrote:
Am 24.04.19 um 05:02 schrieb Zhou, David(ChunMing):
>> -            drm_sched_stop(&ring->sched, &job->base);
>> -
>>               /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>>               amdgpu_fence_driver_force_completion(ring);
>>       }

HW fence are already forced completion, then we can just disable irq fence process and ignore hw fence signal when we are trying to do GPU reset, I think. Otherwise which will make the logic much more complex.
If this situation happens because of long time execution, we can increase timeout of reset detection.

You are not thinking widely enough, forcing the hw fence to complete can trigger other to start other activity in the system.

We first need to stop everything and make sure that we don't do any processing any more and then start with our reset procedure including forcing all hw fences to complete.

Christian.



-David

From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org><mailto:amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Grodzovsky, Andrey
Sent: Wednesday, April 24, 2019 12:00 AM
To: Zhou, David(ChunMing) <David1.Zhou@amd.com><mailto:David1.Zhou@amd.com>; dri-devel@lists.freedesktop.org<mailto:dri-devel@lists.freedesktop.org>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>; eric@anholt.net<mailto:eric@anholt.net>; etnaviv@lists.freedesktop.org<mailto:etnaviv@lists.freedesktop.org>; ckoenig.leichtzumerken@gmail.com<mailto:ckoenig.leichtzumerken@gmail.com>
Cc: Kazlauskas, Nicholas <Nicholas.Kazlauskas@amd.com><mailto:Nicholas.Kazlauskas@amd.com>; Liu, Monk <Monk.Liu@amd.com><mailto:Monk.Liu@amd.com>
Subject: Re: [PATCH v5 6/6] drm/amdgpu: Avoid HW reset if guilty job already signaled.


No, i mean the actual HW fence which signals when the job finished execution on the HW.

Andrey
On 4/23/19 11:19 AM, Zhou, David(ChunMing) wrote:
do you mean fence timer? why not stop it as well when stopping sched for the reason of hw reset?

-------- Original Message --------
Subject: Re: [PATCH v5 6/6] drm/amdgpu: Avoid HW reset if guilty job already signaled.
From: "Grodzovsky, Andrey"
To: "Zhou, David(ChunMing)" ,dri-devel@lists.freedesktop.org,amd-gfx@lists.freedesktop.org,eric@anholt.net,etnaviv@lists.freedesktop.org,ckoenig.leichtzumerken@gmail.com<mailto:dri-devel@lists.freedesktop.org,amd-gfx@lists.freedesktop.org,eric@anholt.net,etnaviv@lists.freedesktop.org,ckoenig.leichtzumerken@gmail.com>
CC: "Kazlauskas, Nicholas" ,"Liu, Monk"

On 4/22/19 9:09 AM, Zhou, David(ChunMing) wrote:
> +Monk.
>
> GPU reset is used widely in SRIOV, so need virtulizatino guy take a look.
>
> But out of curious, why guilty job can signal more if the job is already
> set to guilty? set it wrongly?
>
>
> -David


It's possible that the job does completes at a later time then it's
timeout handler started processing so in this patch we try to protect
against this by rechecking the HW fence after stopping all SW
schedulers. We do it BEFORE marking guilty on the job's sched_entity so
at the point we check the guilty flag is not set yet.

Andrey


>
> 在 2019/4/18 23:00, Andrey Grodzovsky 写道:
>> Also reject TDRs if another one already running.
>>
>> v2:
>> Stop all schedulers across device and entire XGMI hive before
>> force signaling HW fences.
>> Avoid passing job_signaled to helper fnctions to keep all the decision
>> making about skipping HW reset in one place.
>>
>> v3:
>> Fix SW sched. hang after non HW reset. sched.hw_rq_count has to be balanced
>> against it's decrement in drm_sched_stop in non HW reset case.
>> v4: rebase
>> v5: Revert v3 as we do it now in sceduler code.
>>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com><mailto:andrey.grodzovsky@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 143 +++++++++++++++++++----------
>>    1 file changed, 95 insertions(+), 48 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index a0e165c..85f8792 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3334,8 +3334,6 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>               if (!ring || !ring->sched.thread)
>>                       continue;
>>
>> -            drm_sched_stop(&ring->sched, &job->base);
>> -
>>               /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>>               amdgpu_fence_driver_force_completion(ring);
>>       }
>> @@ -3343,6 +3341,7 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>       if(job)
>>               drm_sched_increase_karma(&job->base);
>>
>> +    /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
>>       if (!amdgpu_sriov_vf(adev)) {
>>
>>               if (!need_full_reset)
>> @@ -3480,37 +3479,21 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>       return r;
>>    }
>>
>> -static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
>> +static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
>>    {
>> -    int i;
>> -
>> -    for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> -            struct amdgpu_ring *ring = adev->rings[i];
>> -
>> -            if (!ring || !ring->sched.thread)
>> -                    continue;
>> -
>> -            if (!adev->asic_reset_res)
>> -                    drm_sched_resubmit_jobs(&ring->sched);
>> +    if (trylock) {
>> +            if (!mutex_trylock(&adev->lock_reset))
>> +                    return false;
>> +    } else
>> +            mutex_lock(&adev->lock_reset);
>>
>> -            drm_sched_start(&ring->sched, !adev->asic_reset_res);
>> -    }
>> -
>> -    if (!amdgpu_device_has_dc_support(adev)) {
>> -            drm_helper_resume_force_mode(adev->ddev);
>> -    }
>> -
>> -    adev->asic_reset_res = 0;
>> -}
>> -
>> -static void amdgpu_device_lock_adev(struct amdgpu_device *adev)
>> -{
>> -    mutex_lock(&adev->lock_reset);
>>       atomic_inc(&adev->gpu_reset_counter);
>>       adev->in_gpu_reset = 1;
>>       /* Block kfd: SRIOV would do it separately */
>>       if (!amdgpu_sriov_vf(adev))
>>                    amdgpu_amdkfd_pre_reset(adev);
>> +
>> +    return true;
>>    }
>>
>>    static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>> @@ -3538,40 +3521,42 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>>    int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>                             struct amdgpu_job *job)
>>    {
>> -    int r;
>> +    struct list_head device_list, *device_list_handle =  NULL;
>> +    bool need_full_reset, job_signaled;
>>       struct amdgpu_hive_info *hive = NULL;
>> -    bool need_full_reset = false;
>>       struct amdgpu_device *tmp_adev = NULL;
>> -    struct list_head device_list, *device_list_handle =  NULL;
>> +    int i, r = 0;
>>
>> +    need_full_reset = job_signaled = false;
>>       INIT_LIST_HEAD(&device_list);
>>
>>       dev_info(adev->dev, "GPU reset begin!\n");
>>
>> +    hive = amdgpu_get_xgmi_hive(adev, false);
>> +
>>       /*
>> -     * In case of XGMI hive disallow concurrent resets to be triggered
>> -     * by different nodes. No point also since the one node already executing
>> -     * reset will also reset all the other nodes in the hive.
>> +     * Here we trylock to avoid chain of resets executing from
>> +     * either trigger by jobs on different adevs in XGMI hive or jobs on
>> +     * different schedulers for same device while this TO handler is running.
>> +     * We always reset all schedulers for device and all devices for XGMI
>> +     * hive so that should take care of them too.
>>        */
>> -    hive = amdgpu_get_xgmi_hive(adev, 0);
>> -    if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
>> -        !mutex_trylock(&hive->reset_lock))
>> +
>> +    if (hive && !mutex_trylock(&hive->reset_lock)) {
>> +            DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
>> +                     job->base.id, hive->hive_id);
>>               return 0;
>> +    }
>>
>>       /* Start with adev pre asic reset first for soft reset check.*/
>> -    amdgpu_device_lock_adev(adev);
>> -    r = amdgpu_device_pre_asic_reset(adev,
>> -                                     job,
>> -                                     &need_full_reset);
>> -    if (r) {
>> -            /*TODO Should we stop ?*/
>> -            DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
>> -                      r, adev->ddev->unique);
>> -            adev->asic_reset_res = r;
>> +    if (!amdgpu_device_lock_adev(adev, !hive)) {
>> +            DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
>> +                                     job->base.id);
>> +            return 0;
>>       }
>>
>>       /* Build list of devices to reset */
>> -    if  (need_full_reset && adev->gmc.xgmi.num_physical_nodes > 1) {
>> +    if  (adev->gmc.xgmi.num_physical_nodes > 1) {
>>               if (!hive) {
>>                       amdgpu_device_unlock_adev(adev);
>>                       return -ENODEV;
>> @@ -3588,13 +3573,56 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>               device_list_handle = &device_list;
>>       }
>>
>> +    /* block all schedulers and reset given job's ring */
>> +    list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>> +            for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> +                    struct amdgpu_ring *ring = tmp_adev->rings[i];
>> +
>> +                    if (!ring || !ring->sched.thread)
>> +                            continue;
>> +
>> +                    drm_sched_stop(&ring->sched, &job->base);
>> +            }
>> +    }
>> +
>> +
>> +    /*
>> +     * Must check guilty signal here since after this point all old
>> +     * HW fences are force signaled.
>> +     *
>> +     * job->base holds a reference to parent fence
>> +     */
>> +    if (job && job->base.s_fence->parent &&
>> +        dma_fence_is_signaled(job->base.s_fence->parent))
>> +            job_signaled = true;
>> +
>> +    if (!amdgpu_device_ip_need_full_reset(adev))
>> +            device_list_handle = &device_list;
>> +
>> +    if (job_signaled) {
>> +            dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
>> +            goto skip_hw_reset;
>> +    }
>> +
>> +
>> +    /* Guilty job will be freed after this*/
>> +    r = amdgpu_device_pre_asic_reset(adev,
>> +                                     job,
>> +                                     &need_full_reset);
>> +    if (r) {
>> +            /*TODO Should we stop ?*/
>> +            DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
>> +                      r, adev->ddev->unique);
>> +            adev->asic_reset_res = r;
>> +    }
>> +
>>    retry:    /* Rest of adevs pre asic reset from XGMI hive. */
>>       list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>
>>               if (tmp_adev == adev)
>>                       continue;
>>
>> -            amdgpu_device_lock_adev(tmp_adev);
>> +            amdgpu_device_lock_adev(tmp_adev, false);
>>               r = amdgpu_device_pre_asic_reset(tmp_adev,
>>                                                NULL,
>>                                                &need_full_reset);
>> @@ -3618,9 +3646,28 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>                       goto retry;
>>       }
>>
>> +skip_hw_reset:
>> +
>>       /* Post ASIC reset for all devs .*/
>>       list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>> -            amdgpu_device_post_asic_reset(tmp_adev);
>> +            for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> +                    struct amdgpu_ring *ring = tmp_adev->rings[i];
>> +
>> +                    if (!ring || !ring->sched.thread)
>> +                            continue;
>> +
>> +                    /* No point to resubmit jobs if we didn't HW reset*/
>> +                    if (!tmp_adev->asic_reset_res && !job_signaled)
>> +                            drm_sched_resubmit_jobs(&ring->sched);
>> +
>> +                    drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>> +            }
>> +
>> +            if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
>> +                    drm_helper_resume_force_mode(tmp_adev->ddev);
>> +            }
>> +
>> +            tmp_adev->asic_reset_res = 0;
>>
>>               if (r) {
>>                       /* bad news, how to tell it to userspace ? */
>> @@ -3633,7 +3680,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>               amdgpu_device_unlock_adev(tmp_adev);
>>       }
>>
>> -    if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
>> +    if (hive)
>>               mutex_unlock(&hive->reset_lock);
>>
>>       if (r)
Andrey Grodzovsky April 29, 2019, 2:14 p.m. UTC | #15
Thanks David, with that only patches 5 and 6 are left for the series to be reviewed.

Christian, any more comments on those patches ?

Andrey

On 4/27/19 10:56 PM, Zhou, David(ChunMing) wrote:
Sorry, I only can put my Acked-by: Chunming Zhou <david1.zhou@amd.com><mailto:david1.zhou@amd.com> on patch#3.

I cannot fully judge patch #4, #5, #6.

-David

From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org><mailto:amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Grodzovsky, Andrey
Sent: Friday, April 26, 2019 10:09 PM
To: Koenig, Christian <Christian.Koenig@amd.com><mailto:Christian.Koenig@amd.com>; Zhou, David(ChunMing) <David1.Zhou@amd.com><mailto:David1.Zhou@amd.com>; dri-devel@lists.freedesktop.org<mailto:dri-devel@lists.freedesktop.org>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>; eric@anholt.net<mailto:eric@anholt.net>; etnaviv@lists.freedesktop.org<mailto:etnaviv@lists.freedesktop.org>
Cc: Kazlauskas, Nicholas <Nicholas.Kazlauskas@amd.com><mailto:Nicholas.Kazlauskas@amd.com>; Liu, Monk <Monk.Liu@amd.com><mailto:Monk.Liu@amd.com>
Subject: Re: [PATCH v5 6/6] drm/amdgpu: Avoid HW reset if guilty job already signaled.


Ping (mostly David and Monk).

Andrey
On 4/24/19 3:09 AM, Christian König wrote:
Am 24.04.19 um 05:02 schrieb Zhou, David(ChunMing):
>> -            drm_sched_stop(&ring->sched, &job->base);
>> -
>>               /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>>               amdgpu_fence_driver_force_completion(ring);
>>       }

HW fences are already force-completed, so we can just disable IRQ fence processing and ignore the HW fence signal while we are trying to do a GPU reset, I think. Otherwise this will make the logic much more complex.
If this situation happens because of long execution time, we can increase the timeout of reset detection.

You are not thinking widely enough, forcing the hw fence to complete can trigger other to start other activity in the system.

We first need to stop everything and make sure that we don't do any processing any more and then start with our reset procedure including forcing all hw fences to complete.

Christian.



-David

From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org><mailto:amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Grodzovsky, Andrey
Sent: Wednesday, April 24, 2019 12:00 AM
To: Zhou, David(ChunMing) <David1.Zhou@amd.com><mailto:David1.Zhou@amd.com>; dri-devel@lists.freedesktop.org<mailto:dri-devel@lists.freedesktop.org>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>; eric@anholt.net<mailto:eric@anholt.net>; etnaviv@lists.freedesktop.org<mailto:etnaviv@lists.freedesktop.org>; ckoenig.leichtzumerken@gmail.com<mailto:ckoenig.leichtzumerken@gmail.com>
Cc: Kazlauskas, Nicholas <Nicholas.Kazlauskas@amd.com><mailto:Nicholas.Kazlauskas@amd.com>; Liu, Monk <Monk.Liu@amd.com><mailto:Monk.Liu@amd.com>
Subject: Re: [PATCH v5 6/6] drm/amdgpu: Avoid HW reset if guilty job already signaled.


No, i mean the actual HW fence which signals when the job finished execution on the HW.

Andrey
On 4/23/19 11:19 AM, Zhou, David(ChunMing) wrote:
do you mean fence timer? why not stop it as well when stopping sched for the reason of hw reset?

-------- Original Message --------
Subject: Re: [PATCH v5 6/6] drm/amdgpu: Avoid HW reset if guilty job already signaled.
From: "Grodzovsky, Andrey"
To: "Zhou, David(ChunMing)" ,dri-devel@lists.freedesktop.org,amd-gfx@lists.freedesktop.org,eric@anholt.net,etnaviv@lists.freedesktop.org,ckoenig.leichtzumerken@gmail.com<mailto:dri-devel@lists.freedesktop.org,amd-gfx@lists.freedesktop.org,eric@anholt.net,etnaviv@lists.freedesktop.org,ckoenig.leichtzumerken@gmail.com>
CC: "Kazlauskas, Nicholas" ,"Liu, Monk"

On 4/22/19 9:09 AM, Zhou, David(ChunMing) wrote:
> +Monk.
>
> GPU reset is used widely in SRIOV, so a virtualization guy needs to take a look.
>
> But out of curious, why guilty job can signal more if the job is already
> set to guilty? set it wrongly?
>
>
> -David


It's possible that the job does complete at a later time than when its
timeout handler started processing, so in this patch we try to protect
against this by rechecking the HW fence after stopping all SW
schedulers. We do it BEFORE marking guilty on the job's sched_entity, so
at the point we check, the guilty flag is not set yet.

Andrey


>
> 在 2019/4/18 23:00, Andrey Grodzovsky 写道:
>> Also reject TDRs if another one already running.
>>
>> v2:
>> Stop all schedulers across device and entire XGMI hive before
>> force signaling HW fences.
>> Avoid passing job_signaled to helper functions to keep all the decision
>> making about skipping HW reset in one place.
>>
>> v3:
>> Fix SW sched. hang after non HW reset. sched.hw_rq_count has to be balanced
>> against it's decrement in drm_sched_stop in non HW reset case.
>> v4: rebase
>> v5: Revert v3 as we do it now in scheduler code.
>>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com><mailto:andrey.grodzovsky@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 143 +++++++++++++++++++----------
>>    1 file changed, 95 insertions(+), 48 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index a0e165c..85f8792 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3334,8 +3334,6 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>               if (!ring || !ring->sched.thread)
>>                       continue;
>>
>> -            drm_sched_stop(&ring->sched, &job->base);
>> -
>>               /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>>               amdgpu_fence_driver_force_completion(ring);
>>       }
>> @@ -3343,6 +3341,7 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>       if(job)
>>               drm_sched_increase_karma(&job->base);
>>
>> +    /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
>>       if (!amdgpu_sriov_vf(adev)) {
>>
>>               if (!need_full_reset)
>> @@ -3480,37 +3479,21 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>       return r;
>>    }
>>
>> -static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
>> +static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
>>    {
>> -    int i;
>> -
>> -    for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> -            struct amdgpu_ring *ring = adev->rings[i];
>> -
>> -            if (!ring || !ring->sched.thread)
>> -                    continue;
>> -
>> -            if (!adev->asic_reset_res)
>> -                    drm_sched_resubmit_jobs(&ring->sched);
>> +    if (trylock) {
>> +            if (!mutex_trylock(&adev->lock_reset))
>> +                    return false;
>> +    } else
>> +            mutex_lock(&adev->lock_reset);
>>
>> -            drm_sched_start(&ring->sched, !adev->asic_reset_res);
>> -    }
>> -
>> -    if (!amdgpu_device_has_dc_support(adev)) {
>> -            drm_helper_resume_force_mode(adev->ddev);
>> -    }
>> -
>> -    adev->asic_reset_res = 0;
>> -}
>> -
>> -static void amdgpu_device_lock_adev(struct amdgpu_device *adev)
>> -{
>> -    mutex_lock(&adev->lock_reset);
>>       atomic_inc(&adev->gpu_reset_counter);
>>       adev->in_gpu_reset = 1;
>>       /* Block kfd: SRIOV would do it separately */
>>       if (!amdgpu_sriov_vf(adev))
>>                    amdgpu_amdkfd_pre_reset(adev);
>> +
>> +    return true;
>>    }
>>
>>    static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>> @@ -3538,40 +3521,42 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>>    int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>                             struct amdgpu_job *job)
>>    {
>> -    int r;
>> +    struct list_head device_list, *device_list_handle =  NULL;
>> +    bool need_full_reset, job_signaled;
>>       struct amdgpu_hive_info *hive = NULL;
>> -    bool need_full_reset = false;
>>       struct amdgpu_device *tmp_adev = NULL;
>> -    struct list_head device_list, *device_list_handle =  NULL;
>> +    int i, r = 0;
>>
>> +    need_full_reset = job_signaled = false;
>>       INIT_LIST_HEAD(&device_list);
>>
>>       dev_info(adev->dev, "GPU reset begin!\n");
>>
>> +    hive = amdgpu_get_xgmi_hive(adev, false);
>> +
>>       /*
>> -     * In case of XGMI hive disallow concurrent resets to be triggered
>> -     * by different nodes. No point also since the one node already executing
>> -     * reset will also reset all the other nodes in the hive.
>> +     * Here we trylock to avoid chain of resets executing from
>> +     * either trigger by jobs on different adevs in XGMI hive or jobs on
>> +     * different schedulers for same device while this TO handler is running.
>> +     * We always reset all schedulers for device and all devices for XGMI
>> +     * hive so that should take care of them too.
>>        */
>> -    hive = amdgpu_get_xgmi_hive(adev, 0);
>> -    if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
>> -        !mutex_trylock(&hive->reset_lock))
>> +
>> +    if (hive && !mutex_trylock(&hive->reset_lock)) {
>> +            DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
>> +                     job->base.id, hive->hive_id);
>>               return 0;
>> +    }
>>
>>       /* Start with adev pre asic reset first for soft reset check.*/
>> -    amdgpu_device_lock_adev(adev);
>> -    r = amdgpu_device_pre_asic_reset(adev,
>> -                                     job,
>> -                                     &need_full_reset);
>> -    if (r) {
>> -            /*TODO Should we stop ?*/
>> -            DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
>> -                      r, adev->ddev->unique);
>> -            adev->asic_reset_res = r;
>> +    if (!amdgpu_device_lock_adev(adev, !hive)) {
>> +            DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
>> +                                     job->base.id);
>> +            return 0;
>>       }
>>
>>       /* Build list of devices to reset */
>> -    if  (need_full_reset && adev->gmc.xgmi.num_physical_nodes > 1) {
>> +    if  (adev->gmc.xgmi.num_physical_nodes > 1) {
>>               if (!hive) {
>>                       amdgpu_device_unlock_adev(adev);
>>                       return -ENODEV;
>> @@ -3588,13 +3573,56 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>               device_list_handle = &device_list;
>>       }
>>
>> +    /* block all schedulers and reset given job's ring */
>> +    list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>> +            for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> +                    struct amdgpu_ring *ring = tmp_adev->rings[i];
>> +
>> +                    if (!ring || !ring->sched.thread)
>> +                            continue;
>> +
>> +                    drm_sched_stop(&ring->sched, &job->base);
>> +            }
>> +    }
>> +
>> +
>> +    /*
>> +     * Must check guilty signal here since after this point all old
>> +     * HW fences are force signaled.
>> +     *
>> +     * job->base holds a reference to parent fence
>> +     */
>> +    if (job && job->base.s_fence->parent &&
>> +        dma_fence_is_signaled(job->base.s_fence->parent))
>> +            job_signaled = true;
>> +
>> +    if (!amdgpu_device_ip_need_full_reset(adev))
>> +            device_list_handle = &device_list;
>> +
>> +    if (job_signaled) {
>> +            dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
>> +            goto skip_hw_reset;
>> +    }
>> +
>> +
>> +    /* Guilty job will be freed after this*/
>> +    r = amdgpu_device_pre_asic_reset(adev,
>> +                                     job,
>> +                                     &need_full_reset);
>> +    if (r) {
>> +            /*TODO Should we stop ?*/
>> +            DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
>> +                      r, adev->ddev->unique);
>> +            adev->asic_reset_res = r;
>> +    }
>> +
>>    retry:    /* Rest of adevs pre asic reset from XGMI hive. */
>>       list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>
>>               if (tmp_adev == adev)
>>                       continue;
>>
>> -            amdgpu_device_lock_adev(tmp_adev);
>> +            amdgpu_device_lock_adev(tmp_adev, false);
>>               r = amdgpu_device_pre_asic_reset(tmp_adev,
>>                                                NULL,
>>                                                &need_full_reset);
>> @@ -3618,9 +3646,28 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>                       goto retry;
>>       }
>>
>> +skip_hw_reset:
>> +
>>       /* Post ASIC reset for all devs .*/
>>       list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>> -            amdgpu_device_post_asic_reset(tmp_adev);
>> +            for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> +                    struct amdgpu_ring *ring = tmp_adev->rings[i];
>> +
>> +                    if (!ring || !ring->sched.thread)
>> +                            continue;
>> +
>> +                    /* No point to resubmit jobs if we didn't HW reset*/
>> +                    if (!tmp_adev->asic_reset_res && !job_signaled)
>> +                            drm_sched_resubmit_jobs(&ring->sched);
>> +
>> +                    drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>> +            }
>> +
>> +            if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
>> +                    drm_helper_resume_force_mode(tmp_adev->ddev);
>> +            }
>> +
>> +            tmp_adev->asic_reset_res = 0;
>>
>>               if (r) {
>>                       /* bad news, how to tell it to userspace ? */
>> @@ -3633,7 +3680,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>               amdgpu_device_unlock_adev(tmp_adev);
>>       }
>>
>> -    if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
>> +    if (hive)
>>               mutex_unlock(&hive->reset_lock);
>>
>>       if (r)
Christian König April 29, 2019, 7:03 p.m. UTC | #16
I would clean them up further, but that's only moving code around so 
feel free to add my rb to those.

Christian.

Am 29.04.19 um 16:14 schrieb Grodzovsky, Andrey:
>
> Thanks David, with that only patches 5 and 6 are left for the series 
> to be reviewed.
>
> Christian, any more comments on those patches ?
>
> Andrey
>
> On 4/27/19 10:56 PM, Zhou, David(ChunMing) wrote:
>>
>> Sorry, I only can put my Acked-by: Chunming Zhou 
>> <david1.zhou@amd.com> on patch#3.
>>
>> I cannot fully judge patch #4, #5, #6.
>>
>> -David
>>
>> *From:*amd-gfx <amd-gfx-bounces@lists.freedesktop.org> *On Behalf Of 
>> *Grodzovsky, Andrey
>> *Sent:* Friday, April 26, 2019 10:09 PM
>> *To:* Koenig, Christian <Christian.Koenig@amd.com>; Zhou, 
>> David(ChunMing) <David1.Zhou@amd.com>; 
>> dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org; 
>> eric@anholt.net; etnaviv@lists.freedesktop.org
>> *Cc:* Kazlauskas, Nicholas <Nicholas.Kazlauskas@amd.com>; Liu, Monk 
>> <Monk.Liu@amd.com>
>> *Subject:* Re: [PATCH v5 6/6] drm/amdgpu: Avoid HW reset if guilty 
>> job already signaled.
>>
>> Ping (mostly David and Monk).
>>
>> Andrey
>>
>> On 4/24/19 3:09 AM, Christian König wrote:
>>
>>     Am 24.04.19 um 05:02 schrieb Zhou, David(ChunMing):
>>
>>         >> - drm_sched_stop(&ring->sched, &job->base);
>>         >> -
>>         >>               /* after all hw jobs are reset, hw fence is
>>         meaningless, so force_completion */
>>         >> amdgpu_fence_driver_force_completion(ring);
>>         >>       }
>>
>>         HW fence are already forced completion, then we can just
>>         disable irq fence process and ignore hw fence signal when we
>>         are trying to do GPU reset, I think. Otherwise which will
>>         make the logic much more complex.
>>
>>         If this situation happens because of long time execution, we
>>         can increase timeout of reset detection.
>>
>>
>>     You are not thinking widely enough, forcing the hw fence to
>>     complete can trigger other to start other activity in the system.
>>
>>     We first need to stop everything and make sure that we don't do
>>     any processing any more and then start with our reset procedure
>>     including forcing all hw fences to complete.
>>
>>     Christian.
>>
>>
>>         -David
>>
>>         *From:*amd-gfx <amd-gfx-bounces@lists.freedesktop.org>
>>         <mailto:amd-gfx-bounces@lists.freedesktop.org> *On Behalf Of
>>         *Grodzovsky, Andrey
>>         *Sent:* Wednesday, April 24, 2019 12:00 AM
>>         *To:* Zhou, David(ChunMing) <David1.Zhou@amd.com>
>>         <mailto:David1.Zhou@amd.com>; dri-devel@lists.freedesktop.org
>>         <mailto:dri-devel@lists.freedesktop.org>;
>>         amd-gfx@lists.freedesktop.org
>>         <mailto:amd-gfx@lists.freedesktop.org>; eric@anholt.net
>>         <mailto:eric@anholt.net>; etnaviv@lists.freedesktop.org
>>         <mailto:etnaviv@lists.freedesktop.org>;
>>         ckoenig.leichtzumerken@gmail.com
>>         <mailto:ckoenig.leichtzumerken@gmail.com>
>>         *Cc:* Kazlauskas, Nicholas <Nicholas.Kazlauskas@amd.com>
>>         <mailto:Nicholas.Kazlauskas@amd.com>; Liu, Monk
>>         <Monk.Liu@amd.com> <mailto:Monk.Liu@amd.com>
>>         *Subject:* Re: [PATCH v5 6/6] drm/amdgpu: Avoid HW reset if
>>         guilty job already signaled.
>>
>>         No, i mean the actual HW fence which signals when the job
>>         finished execution on the HW.
>>
>>         Andrey
>>
>>         On 4/23/19 11:19 AM, Zhou, David(ChunMing) wrote:
>>
>>             do you mean fence timer? why not stop it as well when
>>             stopping sched for the reason of hw reset?
>>
>>             -------- Original Message --------
>>             Subject: Re: [PATCH v5 6/6] drm/amdgpu: Avoid HW reset if
>>             guilty job already signaled.
>>             From: "Grodzovsky, Andrey"
>>             To: "Zhou, David(ChunMing)"
>>             ,dri-devel@lists.freedesktop.org,amd-gfx@lists.freedesktop.org,eric@anholt.net,etnaviv@lists.freedesktop.org,ckoenig.leichtzumerken@gmail.com
>>             <mailto:dri-devel@lists.freedesktop.org,amd-gfx@lists.freedesktop.org,eric@anholt.net,etnaviv@lists.freedesktop.org,ckoenig.leichtzumerken@gmail.com>
>>             CC: "Kazlauskas, Nicholas" ,"Liu, Monk"
>>
>>
>>             On 4/22/19 9:09 AM, Zhou, David(ChunMing) wrote:
>>             > +Monk.
>>             >
>>             > GPU reset is used widely in SRIOV, so a
>>             virtualization guy needs to take a look.
>>             >
>>             > But out of curious, why guilty job can signal more if
>>             the job is already
>>             > set to guilty? set it wrongly?
>>             >
>>             >
>>             > -David
>>
>>
>>             It's possible that the job does complete at a later time
>>             than when its
>>             timeout handler started processing, so in this patch we
>>             try to protect
>>             against this by rechecking the HW fence after stopping
>>             all SW
>>             schedulers. We do it BEFORE marking guilty on the job's
>>             sched_entity, so
>>             at the point we check, the guilty flag is not set yet.
>>
>>             Andrey
>>
>>
>>             >
>>             > 在 2019/4/18 23:00, Andrey Grodzovsky 写道:
>>             >> Also reject TDRs if another one already running.
>>             >>
>>             >> v2:
>>             >> Stop all schedulers across device and entire XGMI hive
>>             before
>>             >> force signaling HW fences.
>>             >> Avoid passing job_signaled to helper functions to keep
>>             all the decision
>>             >> making about skipping HW reset in one place.
>>             >>
>>             >> v3:
>>             >> Fix SW sched. hang after non HW reset.
>>             sched.hw_rq_count has to be balanced
>>             >> against it's decrement in drm_sched_stop in non HW
>>             reset case.
>>             >> v4: rebase
>>             >> v5: Revert v3 as we do it now in scheduler code.
>>             >>
>>             >> Signed-off-by: Andrey Grodzovsky
>>             <andrey.grodzovsky@amd.com>
>>             <mailto:andrey.grodzovsky@amd.com>
>>             >> ---
>>             >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 143
>>             +++++++++++++++++++----------
>>             >>    1 file changed, 95 insertions(+), 48 deletions(-)
>>             >>
>>             >> diff --git
>>             a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>             b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>             >> index a0e165c..85f8792 100644
>>             >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>             >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>             >> @@ -3334,8 +3334,6 @@ static int
>>             amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>             >>               if (!ring || !ring->sched.thread)
>>             >>                       continue;
>>             >>
>>             >> - drm_sched_stop(&ring->sched, &job->base);
>>             >> -
>>             >>               /* after all hw jobs are reset, hw fence
>>             is meaningless, so force_completion */
>>             >> amdgpu_fence_driver_force_completion(ring);
>>             >>       }
>>             >> @@ -3343,6 +3341,7 @@ static int
>>             amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>             >>       if(job)
>>             >> drm_sched_increase_karma(&job->base);
>>             >>
>>             >> +    /* Don't suspend on bare metal if we are not
>>             going to HW reset the ASIC */
>>             >>       if (!amdgpu_sriov_vf(adev)) {
>>             >>
>>             >>               if (!need_full_reset)
>>             >> @@ -3480,37 +3479,21 @@ static int
>>             amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>             >>       return r;
>>             >>    }
>>             >>
>>             >> -static void amdgpu_device_post_asic_reset(struct
>>             amdgpu_device *adev)
>>             >> +static bool amdgpu_device_lock_adev(struct
>>             amdgpu_device *adev, bool trylock)
>>             >>    {
>>             >> -    int i;
>>             >> -
>>             >> -    for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>             >> -            struct amdgpu_ring *ring = adev->rings[i];
>>             >> -
>>             >> -            if (!ring || !ring->sched.thread)
>>             >> -                    continue;
>>             >> -
>>             >> -            if (!adev->asic_reset_res)
>>             >> - drm_sched_resubmit_jobs(&ring->sched);
>>             >> +    if (trylock) {
>>             >> +            if (!mutex_trylock(&adev->lock_reset))
>>             >> +                    return false;
>>             >> +    } else
>>             >> + mutex_lock(&adev->lock_reset);
>>             >>
>>             >> - drm_sched_start(&ring->sched, !adev->asic_reset_res);
>>             >> -    }
>>             >> -
>>             >> -    if (!amdgpu_device_has_dc_support(adev)) {
>>             >> - drm_helper_resume_force_mode(adev->ddev);
>>             >> -    }
>>             >> -
>>             >> -    adev->asic_reset_res = 0;
>>             >> -}
>>             >> -
>>             >> -static void amdgpu_device_lock_adev(struct
>>             amdgpu_device *adev)
>>             >> -{
>>             >> - mutex_lock(&adev->lock_reset);
>>             >> atomic_inc(&adev->gpu_reset_counter);
>>             >>       adev->in_gpu_reset = 1;
>>             >>       /* Block kfd: SRIOV would do it separately */
>>             >>       if (!amdgpu_sriov_vf(adev))
>>             >> amdgpu_amdkfd_pre_reset(adev);
>>             >> +
>>             >> +    return true;
>>             >>    }
>>             >>
>>             >>    static void amdgpu_device_unlock_adev(struct
>>             amdgpu_device *adev)
>>             >> @@ -3538,40 +3521,42 @@ static void
>>             amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>>             >>    int amdgpu_device_gpu_recover(struct amdgpu_device
>>             *adev,
>>             >>                             struct amdgpu_job *job)
>>             >>    {
>>             >> -    int r;
>>             >> +    struct list_head device_list, *device_list_handle
>>             =  NULL;
>>             >> +    bool need_full_reset, job_signaled;
>>             >>       struct amdgpu_hive_info *hive = NULL;
>>             >> -    bool need_full_reset = false;
>>             >>       struct amdgpu_device *tmp_adev = NULL;
>>             >> -    struct list_head device_list, *device_list_handle
>>             =  NULL;
>>             >> +    int i, r = 0;
>>             >>
>>             >> +    need_full_reset = job_signaled = false;
>>             >>       INIT_LIST_HEAD(&device_list);
>>             >>
>>             >>       dev_info(adev->dev, "GPU reset begin!\n");
>>             >>
>>             >> +    hive = amdgpu_get_xgmi_hive(adev, false);
>>             >> +
>>             >>       /*
>>             >> -     * In case of XGMI hive disallow concurrent
>>             resets to be triggered
>>             >> -     * by different nodes. No point also since the
>>             one node already executing
>>             >> -     * reset will also reset all the other nodes in
>>             the hive.
>>             >> +     * Here we trylock to avoid chain of resets
>>             executing from
>>             >> +     * either trigger by jobs on different adevs in
>>             XGMI hive or jobs on
>>             >> +     * different schedulers for same device while
>>             this TO handler is running.
>>             >> +     * We always reset all schedulers for device and
>>             all devices for XGMI
>>             >> +     * hive so that should take care of them too.
>>             >>        */
>>             >> -    hive = amdgpu_get_xgmi_hive(adev, 0);
>>             >> -    if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
>>             >> - !mutex_trylock(&hive->reset_lock))
>>             >> +
>>             >> +    if (hive && !mutex_trylock(&hive->reset_lock)) {
>>             >> +            DRM_INFO("Bailing on TDR for s_job:%llx,
>>             hive: %llx as another already in progress",
>>             >> +                     job->base.id, hive->hive_id);
>>             >>               return 0;
>>             >> +    }
>>             >>
>>             >>       /* Start with adev pre asic reset first for soft
>>             reset check.*/
>>             >> -    amdgpu_device_lock_adev(adev);
>>             >> -    r = amdgpu_device_pre_asic_reset(adev,
>>             >> - job,
>>             >> - &need_full_reset);
>>             >> -    if (r) {
>>             >> -            /*TODO Should we stop ?*/
>>             >> -            DRM_ERROR("GPU pre asic reset failed with
>>             err, %d for drm dev, %s ",
>>             >> -                      r, adev->ddev->unique);
>>             >> -            adev->asic_reset_res = r;
>>             >> +    if (!amdgpu_device_lock_adev(adev, !hive)) {
>>             >> +            DRM_INFO("Bailing on TDR for s_job:%llx,
>>             as another already in progress",
>>             >> + job->base.id);
>>             >> +            return 0;
>>             >>       }
>>             >>
>>             >>       /* Build list of devices to reset */
>>             >> -    if  (need_full_reset &&
>>             adev->gmc.xgmi.num_physical_nodes > 1) {
>>             >> +    if (adev->gmc.xgmi.num_physical_nodes > 1) {
>>             >>               if (!hive) {
>>             >> amdgpu_device_unlock_adev(adev);
>>             >>                       return -ENODEV;
>>             >> @@ -3588,13 +3573,56 @@ int
>>             amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>             >>               device_list_handle = &device_list;
>>             >>       }
>>             >>
>>             >> +    /* block all schedulers and reset given job's ring */
>>             >> +    list_for_each_entry(tmp_adev, device_list_handle,
>>             gmc.xgmi.head) {
>>             >> +            for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>             >> +                    struct amdgpu_ring *ring =
>>             tmp_adev->rings[i];
>>             >> +
>>             >> +                    if (!ring || !ring->sched.thread)
>>             >> +                            continue;
>>             >> +
>>             >> + drm_sched_stop(&ring->sched, &job->base);
>>             >> +            }
>>             >> +    }
>>             >> +
>>             >> +
>>             >> +    /*
>>             >> +     * Must check guilty signal here since after this
>>             point all old
>>             >> +     * HW fences are force signaled.
>>             >> +     *
>>             >> +     * job->base holds a reference to parent fence
>>             >> +     */
>>             >> +    if (job && job->base.s_fence->parent &&
>>             >> + dma_fence_is_signaled(job->base.s_fence->parent))
>>             >> +            job_signaled = true;
>>             >> +
>>             >> +    if (!amdgpu_device_ip_need_full_reset(adev))
>>             >> +            device_list_handle = &device_list;
>>             >> +
>>             >> +    if (job_signaled) {
>>             >> +            dev_info(adev->dev, "Guilty job already
>>             signaled, skipping HW reset");
>>             >> +            goto skip_hw_reset;
>>             >> +    }
>>             >> +
>>             >> +
>>             >> +    /* Guilty job will be freed after this*/
>>             >> +    r = amdgpu_device_pre_asic_reset(adev,
>>             >> + job,
>>             >> + &need_full_reset);
>>             >> +    if (r) {
>>             >> +            /*TODO Should we stop ?*/
>>             >> +            DRM_ERROR("GPU pre asic reset failed with
>>             err, %d for drm dev, %s ",
>>             >> +                      r, adev->ddev->unique);
>>             >> +            adev->asic_reset_res = r;
>>             >> +    }
>>             >> +
>>             >>    retry:    /* Rest of adevs pre asic reset from XGMI
>>             hive. */
>>             >>       list_for_each_entry(tmp_adev,
>>             device_list_handle, gmc.xgmi.head) {
>>             >>
>>             >>               if (tmp_adev == adev)
>>             >>                       continue;
>>             >>
>>             >> - amdgpu_device_lock_adev(tmp_adev);
>>             >> + amdgpu_device_lock_adev(tmp_adev, false);
>>             >>               r = amdgpu_device_pre_asic_reset(tmp_adev,
>>             >>                                                NULL,
>>             >> &need_full_reset);
>>             >> @@ -3618,9 +3646,28 @@ int
>>             amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>             >>                       goto retry;
>>             >>       }
>>             >>
>>             >> +skip_hw_reset:
>>             >> +
>>             >>       /* Post ASIC reset for all devs .*/
>>             >>       list_for_each_entry(tmp_adev,
>>             device_list_handle, gmc.xgmi.head) {
>>             >> - amdgpu_device_post_asic_reset(tmp_adev);
>>             >> +            for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>             >> +                    struct amdgpu_ring *ring =
>>             tmp_adev->rings[i];
>>             >> +
>>             >> +                    if (!ring || !ring->sched.thread)
>>             >> +                            continue;
>>             >> +
>>             >> +                    /* No point to resubmit jobs if
>>             we didn't HW reset*/
>>             >> +                    if (!tmp_adev->asic_reset_res &&
>>             !job_signaled)
>>             >> + drm_sched_resubmit_jobs(&ring->sched);
>>             >> +
>>             >> + drm_sched_start(&ring->sched,
>>             !tmp_adev->asic_reset_res);
>>             >> +            }
>>             >> +
>>             >> +            if
>>             (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
>>             >> + drm_helper_resume_force_mode(tmp_adev->ddev);
>>             >> +            }
>>             >> +
>>             >> +            tmp_adev->asic_reset_res = 0;
>>             >>
>>             >>               if (r) {
>>             >>                       /* bad news, how to tell it to
>>             userspace ? */
>>             >> @@ -3633,7 +3680,7 @@ int
>>             amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>             >> amdgpu_device_unlock_adev(tmp_adev);
>>             >>       }
>>             >>
>>             >> -    if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
>>             >> +    if (hive)
>>             >> mutex_unlock(&hive->reset_lock);
>>             >>
>>             >>       if (r)
>>             _______________________________________________
>>             amd-gfx mailing list
>>             amd-gfx@lists.freedesktop.org
>>             <mailto:amd-gfx@lists.freedesktop.org>
>>             https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>
>
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel
<html>
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
  </head>
  <body text="#000000" bgcolor="#FFFFFF">
    <div class="moz-cite-prefix">I would clean them up further, but
      that's only moving code around so feel free to add my rb to those.<br>
      <br>
      Christian.<br>
      <br>
      Am 29.04.19 um 16:14 schrieb Grodzovsky, Andrey:<br>
    </div>
    <blockquote type="cite"
      cite="mid:effe0c0d-ef8d-898b-6be9-e3728b14b0af@amd.com">
      <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
      <p>Thanks David, with that only patches 5 and 6 are left for the
        series to be reviewed.
        <br>
      </p>
      <p>Christian, any more comments on those patches ?</p>
      <p>Andrey<br>
      </p>
      <div class="moz-cite-prefix">On 4/27/19 10:56 PM, Zhou,
        David(ChunMing) wrote:<br>
      </div>
      <blockquote type="cite"
cite="mid:MN2PR12MB2910903F08BF48360A6D5EEFB4380@MN2PR12MB2910.namprd12.prod.outlook.com">
        <meta name="Generator" content="Microsoft Word 15 (filtered
          medium)">
        <style><!--
/* Font Definitions */
@font-face
	{font-family:"Cambria Math";
	panose-1:2 4 5 3 5 4 6 3 2 4;}
@font-face
	{font-family:DengXian;
	panose-1:2 1 6 0 3 1 1 1 1 1;}
@font-face
	{font-family:Calibri;
	panose-1:2 15 5 2 2 2 4 3 2 4;}
@font-face
	{font-family:"Microsoft YaHei";
	panose-1:2 11 5 3 2 2 4 2 2 4;}
@font-face
	{font-family:"\@Microsoft YaHei";}
@font-face
	{font-family:"\@DengXian";
	panose-1:2 1 6 0 3 1 1 1 1 1;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
	{margin:0in;
	margin-bottom:.0001pt;
	font-size:11.0pt;
	font-family:"Calibri",sans-serif;
	color:black;}
a:link, span.MsoHyperlink
	{mso-style-priority:99;
	color:blue;
	text-decoration:underline;}
a:visited, span.MsoHyperlinkFollowed
	{mso-style-priority:99;
	color:purple;
	text-decoration:underline;}
p.msonormal0, li.msonormal0, div.msonormal0
	{mso-style-name:msonormal;
	mso-margin-top-alt:auto;
	margin-right:0in;
	mso-margin-bottom-alt:auto;
	margin-left:0in;
	font-size:11.0pt;
	font-family:"Calibri",sans-serif;
	color:black;}
p.emailquote, li.emailquote, div.emailquote
	{mso-style-name:emailquote;
	mso-margin-top-alt:auto;
	margin-right:0in;
	mso-margin-bottom-alt:auto;
	margin-left:1.0pt;
	font-size:11.0pt;
	font-family:"Calibri",sans-serif;
	color:black;}
span.EmailStyle20
	{mso-style-type:personal;
	font-family:"Calibri",sans-serif;
	color:windowtext;}
span.EmailStyle21
	{mso-style-type:personal-reply;
	font-family:"Calibri",sans-serif;
	color:windowtext;}
.MsoChpDefault
	{mso-style-type:export-only;
	font-size:10.0pt;}
@page WordSection1
	{size:8.5in 11.0in;
	margin:1.0in 1.25in 1.0in 1.25in;}
div.WordSection1
	{page:WordSection1;}
--></style><!--[if gte mso 9]><xml>
<o:shapedefaults v:ext="edit" spidmax="1026" />
</xml><![endif]--><!--[if gte mso 9]><xml>
<o:shapelayout v:ext="edit">
<o:idmap v:ext="edit" data="1" />
</o:shapelayout></xml><![endif]-->
        <div class="WordSection1">
          <p class="MsoNormal"><span style="color:windowtext">Sorry, I
              only can put my Acked-by: Chunming Zhou
              <a class="moz-txt-link-rfc2396E"
                href="mailto:david1.zhou@amd.com" moz-do-not-send="true">&lt;david1.zhou@amd.com&gt;</a>
              on patch#3.<o:p></o:p></span></p>
          <p class="MsoNormal"><span style="color:windowtext"><o:p> </o:p></span></p>
          <p class="MsoNormal"><span style="color:windowtext">I cannot
              fully judge patch #4, #5, #6.<o:p></o:p></span></p>
          <p class="MsoNormal"><span style="color:windowtext"><o:p> </o:p></span></p>
          <p class="MsoNormal"><span style="color:windowtext">-David<o:p></o:p></span></p>
          <p class="MsoNormal"><span style="color:windowtext"><o:p> </o:p></span></p>
          <div>
            <div style="border:none;border-top:solid #E1E1E1
              1.0pt;padding:3.0pt 0in 0in 0in">
              <p class="MsoNormal"><b><span style="color:windowtext">From:</span></b><span
                  style="color:windowtext"> amd-gfx
                  <a class="moz-txt-link-rfc2396E"
                    href="mailto:amd-gfx-bounces@lists.freedesktop.org"
                    moz-do-not-send="true">
                    &lt;amd-gfx-bounces@lists.freedesktop.org&gt;</a> <b>On
                    Behalf Of </b>Grodzovsky, Andrey<br>
                  <b>Sent:</b> Friday, April 26, 2019 10:09 PM<br>
                  <b>To:</b> Koenig, Christian <a
                    class="moz-txt-link-rfc2396E"
                    href="mailto:Christian.Koenig@amd.com"
                    moz-do-not-send="true">
                    &lt;Christian.Koenig@amd.com&gt;</a>; Zhou,
                  David(ChunMing) <a class="moz-txt-link-rfc2396E"
                    href="mailto:David1.Zhou@amd.com"
                    moz-do-not-send="true">
                    &lt;David1.Zhou@amd.com&gt;</a>; <a
                    class="moz-txt-link-abbreviated"
                    href="mailto:dri-devel@lists.freedesktop.org"
                    moz-do-not-send="true">
                    dri-devel@lists.freedesktop.org</a>; <a
                    class="moz-txt-link-abbreviated"
                    href="mailto:amd-gfx@lists.freedesktop.org"
                    moz-do-not-send="true">
                    amd-gfx@lists.freedesktop.org</a>; <a
                    class="moz-txt-link-abbreviated"
                    href="mailto:eric@anholt.net" moz-do-not-send="true">
                    eric@anholt.net</a>; <a
                    class="moz-txt-link-abbreviated"
                    href="mailto:etnaviv@lists.freedesktop.org"
                    moz-do-not-send="true">
                    etnaviv@lists.freedesktop.org</a><br>
                  <b>Cc:</b> Kazlauskas, Nicholas <a
                    class="moz-txt-link-rfc2396E"
                    href="mailto:Nicholas.Kazlauskas@amd.com"
                    moz-do-not-send="true">
                    &lt;Nicholas.Kazlauskas@amd.com&gt;</a>; Liu, Monk <a
                    class="moz-txt-link-rfc2396E"
                    href="mailto:Monk.Liu@amd.com"
                    moz-do-not-send="true">
                    &lt;Monk.Liu@amd.com&gt;</a><br>
                  <b>Subject:</b> Re: [PATCH v5 6/6] drm/amdgpu: Avoid
                  HW reset if guilty job already signaled.<o:p></o:p></span></p>
            </div>
          </div>
          <p class="MsoNormal"><o:p> </o:p></p>
          <p>Ping (mostly David and Monk).<o:p></o:p></p>
          <p>Andrey<o:p></o:p></p>
          <div>
            <p class="MsoNormal">On 4/24/19 3:09 AM, Christian König
              wrote:<o:p></o:p></p>
          </div>
          <blockquote style="margin-top:5.0pt;margin-bottom:5.0pt">
            <div>
              <p class="MsoNormal">Am 24.04.19 um 05:02 schrieb Zhou,
                David(ChunMing):<o:p></o:p></p>
            </div>
            <blockquote style="margin-top:5.0pt;margin-bottom:5.0pt">
              <p class="MsoNormal">&gt;&gt; -           
                drm_sched_stop(&amp;ring-&gt;sched, &amp;job-&gt;base);<br>
                &gt;&gt; -<br>
                &gt;&gt;               /* after all hw jobs are reset,
                hw fence is meaningless, so force_completion */<br>
                &gt;&gt;              
                amdgpu_fence_driver_force_completion(ring);<br>
                &gt;&gt;       }<o:p></o:p></p>
              <p class="MsoNormal"> <o:p></o:p></p>
              <p class="MsoNormal">HW fence are already forced
                completion, then we can just disable irq fence process
                and ignore hw fence signal when we are trying to do GPU
                reset, I think. Otherwise which will make the logic much
                more complex.<o:p></o:p></p>
              <p class="MsoNormal"><span style="color:windowtext">If
                  this situation happens because of long time execution,
                  we can increase timeout of reset detection.</span><o:p></o:p></p>
            </blockquote>
            <p class="MsoNormal"><br>
              You are not thinking widely enough, forcing the hw fence
              to complete can trigger other to start other activity in
              the system.<br>
              <br>
              We first need to stop everything and make sure that we
              don't do any processing any more and then start with our
              reset procedure including forcing all hw fences to
              complete.<br>
              <br>
              Christian.<br>
              <br>
              <br>
              <o:p></o:p></p>
            <blockquote style="margin-top:5.0pt;margin-bottom:5.0pt">
              <p class="MsoNormal"><span style="color:windowtext"> </span><o:p></o:p></p>
              <p class="MsoNormal"><span style="color:windowtext">-David</span><o:p></o:p></p>
              <p class="MsoNormal"><span style="color:windowtext"> </span><o:p></o:p></p>
              <div style="border:none;border-left:solid blue
                1.5pt;padding:0in 0in 0in 4.0pt">
                <div>
                  <div style="border:none;border-top:solid #E1E1E1
                    1.0pt;padding:3.0pt 0in 0in 0in">
                    <p class="MsoNormal"><b><span
                          style="color:windowtext">From:</span></b><span
                        style="color:windowtext"> amd-gfx
                        <a
                          href="mailto:amd-gfx-bounces@lists.freedesktop.org"
                          moz-do-not-send="true">&lt;amd-gfx-bounces@lists.freedesktop.org&gt;</a>
                        <b>On Behalf Of </b>Grodzovsky, Andrey<br>
                        <b>Sent:</b> Wednesday, April 24, 2019 12:00 AM<br>
                        <b>To:</b> Zhou, David(ChunMing) <a
                          href="mailto:David1.Zhou@amd.com"
                          moz-do-not-send="true">
                          &lt;David1.Zhou@amd.com&gt;</a>; <a
                          href="mailto:dri-devel@lists.freedesktop.org"
                          moz-do-not-send="true">
                          dri-devel@lists.freedesktop.org</a>; <a
                          href="mailto:amd-gfx@lists.freedesktop.org"
                          moz-do-not-send="true">
                          amd-gfx@lists.freedesktop.org</a>; <a
                          href="mailto:eric@anholt.net"
                          moz-do-not-send="true">
                          eric@anholt.net</a>; <a
                          href="mailto:etnaviv@lists.freedesktop.org"
                          moz-do-not-send="true">
                          etnaviv@lists.freedesktop.org</a>; <a
                          href="mailto:ckoenig.leichtzumerken@gmail.com"
                          moz-do-not-send="true">
                          ckoenig.leichtzumerken@gmail.com</a><br>
                        <b>Cc:</b> Kazlauskas, Nicholas <a
                          href="mailto:Nicholas.Kazlauskas@amd.com"
                          moz-do-not-send="true">
                          &lt;Nicholas.Kazlauskas@amd.com&gt;</a>; Liu,
                        Monk <a href="mailto:Monk.Liu@amd.com"
                          moz-do-not-send="true">
                          &lt;Monk.Liu@amd.com&gt;</a><br>
                        <b>Subject:</b> Re: [PATCH v5 6/6] drm/amdgpu:
                        Avoid HW reset if guilty job already signaled.</span><o:p></o:p></p>
                  </div>
                </div>
                <p class="MsoNormal"> <o:p></o:p></p>
                <p>No, i mean the actual HW fence which signals when the
                  job finished execution on the HW.<o:p></o:p></p>
                <p>Andrey<o:p></o:p></p>
                <div>
                  <p class="MsoNormal">On 4/23/19 11:19 AM, Zhou,
                    David(ChunMing) wrote:<o:p></o:p></p>
                </div>
                <blockquote style="margin-top:5.0pt;margin-bottom:5.0pt">
                  <div>
                    <p class="MsoNormal" style="margin-bottom:12.0pt">do
                      you mean fence timer? why not stop it as well when
                      stopping sched for the reason of hw reset?<br>
                      <br>
                      -------- Original Message --------<br>
                      Subject: Re: [PATCH v5 6/6] drm/amdgpu: Avoid HW
                      reset if guilty job already signaled.<br>
                      From: "Grodzovsky, Andrey" <br>
                      To: "Zhou, David(ChunMing)" ,<a
href="mailto:dri-devel@lists.freedesktop.org,amd-gfx@lists.freedesktop.org,eric@anholt.net,etnaviv@lists.freedesktop.org,ckoenig.leichtzumerken@gmail.com"
                        moz-do-not-send="true">dri-devel@lists.freedesktop.org,amd-gfx@lists.freedesktop.org,eric@anholt.net,etnaviv@lists.freedesktop.org,ckoenig.leichtzumerken@gmail.com</a><br>
                      CC: "Kazlauskas, Nicholas" ,"Liu, Monk" <o:p></o:p></p>
                  </div>
                  <div>
                    <p class="MsoNormal"><br>
                      On 4/22/19 9:09 AM, Zhou, David(ChunMing) wrote:<br>
                      &gt; +Monk.<br>
                      &gt;<br>
                      &gt; GPU reset is used widely in SRIOV, so need
                      virtualization guy take a look.<br>
                      &gt;<br>
                      &gt; But out of curious, why guilty job can signal
                      more if the job is already<br>
                      &gt; set to guilty? set it wrongly?<br>
                      &gt;<br>
                      &gt;<br>
                      &gt; -David<br>
                      <br>
                      <br>
                      It's possible that the job completes at a
                      later time than its <br>
                      timeout handler started processing so in this
                      patch we try to protect <br>
                      against this by rechecking the HW fence after
                      stopping all SW <br>
                      schedulers. We do it BEFORE marking guilty on the
                      job's sched_entity so <br>
                      at the point we check the guilty flag is not set
                      yet.<br>
                      <br>
                      Andrey<br>
                      <br>
                      <br>
                      &gt;<br>
                      &gt; <span style="font-family:&quot;Microsoft
                        YaHei&quot;,sans-serif" lang="ZH-CN">
                        在</span> 2019/4/18 23:00, Andrey Grodzovsky <span
                        style="font-family:&quot;Microsoft
                        YaHei&quot;,sans-serif" lang="ZH-CN">
                        写道</span>:<br>
                      &gt;&gt; Also reject TDRs if another one already
                      running.<br>
                      &gt;&gt;<br>
                      &gt;&gt; v2:<br>
                      &gt;&gt; Stop all schedulers across device and
                      entire XGMI hive before<br>
                      &gt;&gt; force signaling HW fences.<br>
                      &gt;&gt; Avoid passing job_signaled to helper
                      functions to keep all the decision<br>
                      &gt;&gt; making about skipping HW reset in one
                      place.<br>
                      &gt;&gt;<br>
                      &gt;&gt; v3:<br>
                      &gt;&gt; Fix SW sched. hang after non HW reset.
                      sched.hw_rq_count has to be balanced<br>
                      &gt;&gt; against it's decrement in drm_sched_stop
                      in non HW reset case.<br>
                      &gt;&gt; v4: rebase<br>
                      &gt;&gt; v5: Revert v3 as we do it now in scheduler<br>
                      code.<br>
                      &gt;&gt;<br>
                      &gt;&gt; Signed-off-by: Andrey Grodzovsky <a
                        href="mailto:andrey.grodzovsky@amd.com"
                        moz-do-not-send="true">
                        &lt;andrey.grodzovsky@amd.com&gt;</a><br>
                      &gt;&gt; ---<br>
                      &gt;&gt;   
                      drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 143
                      +++++++++++++++++++----------<br>
                      &gt;&gt;    1 file changed, 95 insertions(+), 48
                      deletions(-)<br>
                      &gt;&gt;<br>
                      &gt;&gt; diff --git
                      a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
                      b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
                      &gt;&gt; index a0e165c..85f8792 100644<br>
                      &gt;&gt; ---
                      a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
                      &gt;&gt; +++
                      b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
                      &gt;&gt; @@ -3334,8 +3334,6 @@ static int
                      amdgpu_device_pre_asic_reset(struct amdgpu_device
                      *adev,<br>
                      &gt;&gt;               if (!ring ||
                      !ring-&gt;sched.thread)<br>
                      &gt;&gt;                       continue;<br>
                      &gt;&gt;    <br>
                      &gt;&gt; -           
                      drm_sched_stop(&amp;ring-&gt;sched,
                      &amp;job-&gt;base);<br>
                      &gt;&gt; -<br>
                      &gt;&gt;               /* after all hw jobs are
                      reset, hw fence is meaningless, so
                      force_completion */<br>
                      &gt;&gt;              
                      amdgpu_fence_driver_force_completion(ring);<br>
                      &gt;&gt;       }<br>
                      &gt;&gt; @@ -3343,6 +3341,7 @@ static int
                      amdgpu_device_pre_asic_reset(struct amdgpu_device
                      *adev,<br>
                      &gt;&gt;       if(job)<br>
                      &gt;&gt;              
                      drm_sched_increase_karma(&amp;job-&gt;base);<br>
                      &gt;&gt;    <br>
                      &gt;&gt; +    /* Don't suspend on bare metal if we
                      are not going to HW reset the ASIC */<br>
                      &gt;&gt;       if (!amdgpu_sriov_vf(adev)) {<br>
                      &gt;&gt;    <br>
                      &gt;&gt;               if (!need_full_reset)<br>
                      &gt;&gt; @@ -3480,37 +3479,21 @@ static int
                      amdgpu_do_asic_reset(struct amdgpu_hive_info
                      *hive,<br>
                      &gt;&gt;       return r;<br>
                      &gt;&gt;    }<br>
                      &gt;&gt;    <br>
                      &gt;&gt; -static void
                      amdgpu_device_post_asic_reset(struct amdgpu_device
                      *adev)<br>
                      &gt;&gt; +static bool
                      amdgpu_device_lock_adev(struct amdgpu_device
                      *adev, bool trylock)<br>
                      &gt;&gt;    {<br>
                      &gt;&gt; -    int i;<br>
                      &gt;&gt; -<br>
                      &gt;&gt; -    for (i = 0; i &lt; AMDGPU_MAX_RINGS;
                      ++i) {<br>
                      &gt;&gt; -            struct amdgpu_ring *ring =
                      adev-&gt;rings[i];<br>
                      &gt;&gt; -<br>
                      &gt;&gt; -            if (!ring ||
                      !ring-&gt;sched.thread)<br>
                      &gt;&gt; -                    continue;<br>
                      &gt;&gt; -<br>
                      &gt;&gt; -            if
                      (!adev-&gt;asic_reset_res)<br>
                      &gt;&gt; -                   
                      drm_sched_resubmit_jobs(&amp;ring-&gt;sched);<br>
                      &gt;&gt; +    if (trylock) {<br>
                      &gt;&gt; +            if
                      (!mutex_trylock(&amp;adev-&gt;lock_reset))<br>
                      &gt;&gt; +                    return false;<br>
                      &gt;&gt; +    } else<br>
                      &gt;&gt; +           
                      mutex_lock(&amp;adev-&gt;lock_reset);<br>
                      &gt;&gt;    <br>
                      &gt;&gt; -           
                      drm_sched_start(&amp;ring-&gt;sched,
                      !adev-&gt;asic_reset_res);<br>
                      &gt;&gt; -    }<br>
                      &gt;&gt; -<br>
                      &gt;&gt; -    if
                      (!amdgpu_device_has_dc_support(adev)) {<br>
                      &gt;&gt; -           
                      drm_helper_resume_force_mode(adev-&gt;ddev);<br>
                      &gt;&gt; -    }<br>
                      &gt;&gt; -<br>
                      &gt;&gt; -    adev-&gt;asic_reset_res = 0;<br>
                      &gt;&gt; -}<br>
                      &gt;&gt; -<br>
                      &gt;&gt; -static void
                      amdgpu_device_lock_adev(struct amdgpu_device
                      *adev)<br>
                      &gt;&gt; -{<br>
                      &gt;&gt; -   
                      mutex_lock(&amp;adev-&gt;lock_reset);<br>
                      &gt;&gt;      
                      atomic_inc(&amp;adev-&gt;gpu_reset_counter);<br>
                      &gt;&gt;       adev-&gt;in_gpu_reset = 1;<br>
                      &gt;&gt;       /* Block kfd: SRIOV would do it
                      separately */<br>
                      &gt;&gt;       if (!amdgpu_sriov_vf(adev))<br>
                      &gt;&gt;                   
                      amdgpu_amdkfd_pre_reset(adev);<br>
                      &gt;&gt; +<br>
                      &gt;&gt; +    return true;<br>
                      &gt;&gt;    }<br>
                      &gt;&gt;    <br>
                      &gt;&gt;    static void
                      amdgpu_device_unlock_adev(struct amdgpu_device
                      *adev)<br>
                      &gt;&gt; @@ -3538,40 +3521,42 @@ static void
                      amdgpu_device_unlock_adev(struct amdgpu_device
                      *adev)<br>
                      &gt;&gt;    int amdgpu_device_gpu_recover(struct
                      amdgpu_device *adev,<br>
                      &gt;&gt;                             struct
                      amdgpu_job *job)<br>
                      &gt;&gt;    {<br>
                      &gt;&gt; -    int r;<br>
                      &gt;&gt; +    struct list_head device_list,
                      *device_list_handle =  NULL;<br>
                      &gt;&gt; +    bool need_full_reset, job_signaled;<br>
                      &gt;&gt;       struct amdgpu_hive_info *hive =
                      NULL;<br>
                      &gt;&gt; -    bool need_full_reset = false;<br>
                      &gt;&gt;       struct amdgpu_device *tmp_adev =
                      NULL;<br>
                      &gt;&gt; -    struct list_head device_list,
                      *device_list_handle =  NULL;<br>
                      &gt;&gt; +    int i, r = 0;<br>
                      &gt;&gt;    <br>
                      &gt;&gt; +    need_full_reset = job_signaled =
                      false;<br>
                      &gt;&gt;       INIT_LIST_HEAD(&amp;device_list);<br>
                      &gt;&gt;    <br>
                      &gt;&gt;       dev_info(adev-&gt;dev, "GPU reset
                      begin!\n");<br>
                      &gt;&gt;    <br>
                      &gt;&gt; +    hive = amdgpu_get_xgmi_hive(adev,
                      false);<br>
                      &gt;&gt; +<br>
                      &gt;&gt;       /*<br>
                      &gt;&gt; -     * In case of XGMI hive disallow
                      concurrent resets to be triggered<br>
                      &gt;&gt; -     * by different nodes. No point also
                      since the one node already executing<br>
                      &gt;&gt; -     * reset will also reset all the
                      other nodes in the hive.<br>
                      &gt;&gt; +     * Here we trylock to avoid chain of
                      resets executing from<br>
                      &gt;&gt; +     * either trigger by jobs on
                      different adevs in XGMI hive or jobs on<br>
                      &gt;&gt; +     * different schedulers for same
                      device while this TO handler is running.<br>
                      &gt;&gt; +     * We always reset all schedulers
                      for device and all devices for XGMI<br>
                      &gt;&gt; +     * hive so that should take care of
                      them too.<br>
                      &gt;&gt;        */<br>
                      &gt;&gt; -    hive = amdgpu_get_xgmi_hive(adev,
                      0);<br>
                      &gt;&gt; -    if (hive &amp;&amp;
                      adev-&gt;gmc.xgmi.num_physical_nodes &gt; 1
                      &amp;&amp;<br>
                      &gt;&gt; -       
                      !mutex_trylock(&amp;hive-&gt;reset_lock))<br>
                      &gt;&gt; +<br>
                      &gt;&gt; +    if (hive &amp;&amp;
                      !mutex_trylock(&amp;hive-&gt;reset_lock)) {<br>
                      &gt;&gt; +            DRM_INFO("Bailing on TDR for
                      s_job:%llx, hive: %llx as another already in
                      progress",<br>
                      &gt;&gt; +                     job-&gt;base.id,
                      hive-&gt;hive_id);<br>
                      &gt;&gt;               return 0;<br>
                      &gt;&gt; +    }<br>
                      &gt;&gt;    <br>
                      &gt;&gt;       /* Start with adev pre asic reset
                      first for soft reset check.*/<br>
                      &gt;&gt; -    amdgpu_device_lock_adev(adev);<br>
                      &gt;&gt; -    r =
                      amdgpu_device_pre_asic_reset(adev,<br>
                      &gt;&gt; -                                    
                      job,<br>
                      &gt;&gt; -                                    
                      &amp;need_full_reset);<br>
                      &gt;&gt; -    if (r) {<br>
                      &gt;&gt; -            /*TODO Should we stop ?*/<br>
                      &gt;&gt; -            DRM_ERROR("GPU pre asic
                      reset failed with err, %d for drm dev, %s ",<br>
                      &gt;&gt; -                      r,
                      adev-&gt;ddev-&gt;unique);<br>
                      &gt;&gt; -            adev-&gt;asic_reset_res = r;<br>
                      &gt;&gt; +    if (!amdgpu_device_lock_adev(adev,
                      !hive)) {<br>
                      &gt;&gt; +            DRM_INFO("Bailing on TDR for
                      s_job:%llx, as another already in progress",<br>
                      &gt;&gt; +                                    
                      job-&gt;base.id);<br>
                      &gt;&gt; +            return 0;<br>
                      &gt;&gt;       }<br>
                      &gt;&gt;    <br>
                      &gt;&gt;       /* Build list of devices to reset
                      */<br>
                      &gt;&gt; -    if  (need_full_reset &amp;&amp;
                      adev-&gt;gmc.xgmi.num_physical_nodes &gt; 1) {<br>
                      &gt;&gt; +    if 
                      (adev-&gt;gmc.xgmi.num_physical_nodes &gt; 1) {<br>
                      &gt;&gt;               if (!hive) {<br>
                      &gt;&gt;                      
                      amdgpu_device_unlock_adev(adev);<br>
                      &gt;&gt;                       return -ENODEV;<br>
                      &gt;&gt; @@ -3588,13 +3573,56 @@ int
                      amdgpu_device_gpu_recover(struct amdgpu_device
                      *adev,<br>
                      &gt;&gt;               device_list_handle =
                      &amp;device_list;<br>
                      &gt;&gt;       }<br>
                      &gt;&gt;    <br>
                      &gt;&gt; +    /* block all schedulers and reset
                      given job's ring */<br>
                      &gt;&gt; +    list_for_each_entry(tmp_adev,
                      device_list_handle, gmc.xgmi.head) {<br>
                      &gt;&gt; +            for (i = 0; i &lt;
                      AMDGPU_MAX_RINGS; ++i) {<br>
                      &gt;&gt; +                    struct amdgpu_ring
                      *ring = tmp_adev-&gt;rings[i];<br>
                      &gt;&gt; +<br>
                      &gt;&gt; +                    if (!ring ||
                      !ring-&gt;sched.thread)<br>
                      &gt;&gt; +                            continue;<br>
                      &gt;&gt; +<br>
                      &gt;&gt; +                   
                      drm_sched_stop(&amp;ring-&gt;sched,
                      &amp;job-&gt;base);<br>
                      &gt;&gt; +            }<br>
                      &gt;&gt; +    }<br>
                      &gt;&gt; +<br>
                      &gt;&gt; +<br>
                      &gt;&gt; +    /*<br>
                      &gt;&gt; +     * Must check guilty signal here
                      since after this point all old<br>
                      &gt;&gt; +     * HW fences are force signaled.<br>
                      &gt;&gt; +     *<br>
                      &gt;&gt; +     * job-&gt;base holds a reference to
                      parent fence<br>
                      &gt;&gt; +     */<br>
                      &gt;&gt; +    if (job &amp;&amp;
                      job-&gt;base.s_fence-&gt;parent &amp;&amp;<br>
                      &gt;&gt; +       
                      dma_fence_is_signaled(job-&gt;base.s_fence-&gt;parent))<br>
                      &gt;&gt; +            job_signaled = true;<br>
                      &gt;&gt; +<br>
                      &gt;&gt; +    if
                      (!amdgpu_device_ip_need_full_reset(adev))<br>
                      &gt;&gt; +            device_list_handle =
                      &amp;device_list;<br>
                      &gt;&gt; +<br>
                      &gt;&gt; +    if (job_signaled) {<br>
                      &gt;&gt; +            dev_info(adev-&gt;dev,
                      "Guilty job already signaled, skipping HW reset");<br>
                      &gt;&gt; +            goto skip_hw_reset;<br>
                      &gt;&gt; +    }<br>
                      &gt;&gt; +<br>
                      &gt;&gt; +<br>
                      &gt;&gt; +    /* Guilty job will be freed after
                      this*/<br>
                      &gt;&gt; +    r =
                      amdgpu_device_pre_asic_reset(adev,<br>
                      &gt;&gt; +                                    
                      job,<br>
                      &gt;&gt; +                                    
                      &amp;need_full_reset);<br>
                      &gt;&gt; +    if (r) {<br>
                      &gt;&gt; +            /*TODO Should we stop ?*/<br>
                      &gt;&gt; +            DRM_ERROR("GPU pre asic
                      reset failed with err, %d for drm dev, %s ",<br>
                      &gt;&gt; +                      r,
                      adev-&gt;ddev-&gt;unique);<br>
                      &gt;&gt; +            adev-&gt;asic_reset_res = r;<br>
                      &gt;&gt; +    }<br>
                      &gt;&gt; +<br>
                      &gt;&gt;    retry:    /* Rest of adevs pre asic
                      reset from XGMI hive. */<br>
                      &gt;&gt;       list_for_each_entry(tmp_adev,
                      device_list_handle, gmc.xgmi.head) {<br>
                      &gt;&gt;    <br>
                      &gt;&gt;               if (tmp_adev == adev)<br>
                      &gt;&gt;                       continue;<br>
                      &gt;&gt;    <br>
                      &gt;&gt; -           
                      amdgpu_device_lock_adev(tmp_adev);<br>
                      &gt;&gt; +           
                      amdgpu_device_lock_adev(tmp_adev, false);<br>
                      &gt;&gt;               r =
                      amdgpu_device_pre_asic_reset(tmp_adev,<br>
&gt;&gt;                                                NULL,<br>
&gt;&gt;                                               
                      &amp;need_full_reset);<br>
                      &gt;&gt; @@ -3618,9 +3646,28 @@ int
                      amdgpu_device_gpu_recover(struct amdgpu_device
                      *adev,<br>
                      &gt;&gt;                       goto retry;<br>
                      &gt;&gt;       }<br>
                      &gt;&gt;    <br>
                      &gt;&gt; +skip_hw_reset:<br>
                      &gt;&gt; +<br>
                      &gt;&gt;       /* Post ASIC reset for all devs .*/<br>
                      &gt;&gt;       list_for_each_entry(tmp_adev,
                      device_list_handle, gmc.xgmi.head) {<br>
                      &gt;&gt; -           
                      amdgpu_device_post_asic_reset(tmp_adev);<br>
                      &gt;&gt; +            for (i = 0; i &lt;
                      AMDGPU_MAX_RINGS; ++i) {<br>
                      &gt;&gt; +                    struct amdgpu_ring
                      *ring = tmp_adev-&gt;rings[i];<br>
                      &gt;&gt; +<br>
                      &gt;&gt; +                    if (!ring ||
                      !ring-&gt;sched.thread)<br>
                      &gt;&gt; +                            continue;<br>
                      &gt;&gt; +<br>
                      &gt;&gt; +                    /* No point to
                      resubmit jobs if we didn't HW reset*/<br>
                      &gt;&gt; +                    if
                      (!tmp_adev-&gt;asic_reset_res &amp;&amp;
                      !job_signaled)<br>
                      &gt;&gt; +                           
                      drm_sched_resubmit_jobs(&amp;ring-&gt;sched);<br>
                      &gt;&gt; +<br>
                      &gt;&gt; +                   
                      drm_sched_start(&amp;ring-&gt;sched,
                      !tmp_adev-&gt;asic_reset_res);<br>
                      &gt;&gt; +            }<br>
                      &gt;&gt; +<br>
                      &gt;&gt; +            if
                      (!amdgpu_device_has_dc_support(tmp_adev)
                      &amp;&amp; !job_signaled) {<br>
                      &gt;&gt; +                   
                      drm_helper_resume_force_mode(tmp_adev-&gt;ddev);<br>
                      &gt;&gt; +            }<br>
                      &gt;&gt; +<br>
                      &gt;&gt; +            tmp_adev-&gt;asic_reset_res
                      = 0;<br>
                      &gt;&gt;    <br>
                      &gt;&gt;               if (r) {<br>
                      &gt;&gt;                       /* bad news, how to
                      tell it to userspace ? */<br>
                      &gt;&gt; @@ -3633,7 +3680,7 @@ int
                      amdgpu_device_gpu_recover(struct amdgpu_device
                      *adev,<br>
                      &gt;&gt;              
                      amdgpu_device_unlock_adev(tmp_adev);<br>
                      &gt;&gt;       }<br>
                      &gt;&gt;    <br>
                      &gt;&gt; -    if (hive &amp;&amp;
                      adev-&gt;gmc.xgmi.num_physical_nodes &gt; 1)<br>
                      &gt;&gt; +    if (hive)<br>
                      &gt;&gt;              
                      mutex_unlock(&amp;hive-&gt;reset_lock);<br>
                      &gt;&gt;    <br>
                      &gt;&gt;       if (r)<br>
                      _______________________________________________<br>
                      amd-gfx mailing list<br>
                      <a href="mailto:amd-gfx@lists.freedesktop.org"
                        moz-do-not-send="true">amd-gfx@lists.freedesktop.org</a><br>
                      <a
                        href="https://lists.freedesktop.org/mailman/listinfo/amd-gfx"
                        moz-do-not-send="true">https://lists.freedesktop.org/mailman/listinfo/amd-gfx</a><o:p></o:p></p>
                  </div>
                </blockquote>
              </div>
            </blockquote>
            <p class="MsoNormal"><o:p> </o:p></p>
          </blockquote>
        </div>
      </blockquote>
      <br>
      <fieldset class="mimeAttachmentHeader"></fieldset>
      <pre class="moz-quote-pre" wrap="">_______________________________________________
dri-devel mailing list
<a class="moz-txt-link-abbreviated" href="mailto:dri-devel@lists.freedesktop.org">dri-devel@lists.freedesktop.org</a>
<a class="moz-txt-link-freetext" href="https://lists.freedesktop.org/mailman/listinfo/dri-devel">https://lists.freedesktop.org/mailman/listinfo/dri-devel</a></pre>
    </blockquote>
    <br>
  </body>
</html>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index a0e165c..85f8792 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3334,8 +3334,6 @@  static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 		if (!ring || !ring->sched.thread)
 			continue;
 
-		drm_sched_stop(&ring->sched, &job->base);
-
 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
 		amdgpu_fence_driver_force_completion(ring);
 	}
@@ -3343,6 +3341,7 @@  static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 	if(job)
 		drm_sched_increase_karma(&job->base);
 
+	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
 	if (!amdgpu_sriov_vf(adev)) {
 
 		if (!need_full_reset)
@@ -3480,37 +3479,21 @@  static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 	return r;
 }
 
-static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
+static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
 {
-	int i;
-
-	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
-		struct amdgpu_ring *ring = adev->rings[i];
-
-		if (!ring || !ring->sched.thread)
-			continue;
-
-		if (!adev->asic_reset_res)
-			drm_sched_resubmit_jobs(&ring->sched);
+	if (trylock) {
+		if (!mutex_trylock(&adev->lock_reset))
+			return false;
+	} else
+		mutex_lock(&adev->lock_reset);
 
-		drm_sched_start(&ring->sched, !adev->asic_reset_res);
-	}
-
-	if (!amdgpu_device_has_dc_support(adev)) {
-		drm_helper_resume_force_mode(adev->ddev);
-	}
-
-	adev->asic_reset_res = 0;
-}
-
-static void amdgpu_device_lock_adev(struct amdgpu_device *adev)
-{
-	mutex_lock(&adev->lock_reset);
 	atomic_inc(&adev->gpu_reset_counter);
 	adev->in_gpu_reset = 1;
 	/* Block kfd: SRIOV would do it separately */
 	if (!amdgpu_sriov_vf(adev))
                 amdgpu_amdkfd_pre_reset(adev);
+
+	return true;
 }
 
 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
@@ -3538,40 +3521,42 @@  static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 			      struct amdgpu_job *job)
 {
-	int r;
+	struct list_head device_list, *device_list_handle =  NULL;
+	bool need_full_reset, job_signaled;
 	struct amdgpu_hive_info *hive = NULL;
-	bool need_full_reset = false;
 	struct amdgpu_device *tmp_adev = NULL;
-	struct list_head device_list, *device_list_handle =  NULL;
+	int i, r = 0;
 
+	need_full_reset = job_signaled = false;
 	INIT_LIST_HEAD(&device_list);
 
 	dev_info(adev->dev, "GPU reset begin!\n");
 
+	hive = amdgpu_get_xgmi_hive(adev, false);
+
 	/*
-	 * In case of XGMI hive disallow concurrent resets to be triggered
-	 * by different nodes. No point also since the one node already executing
-	 * reset will also reset all the other nodes in the hive.
+	 * Here we trylock to avoid a chain of resets executing, triggered
+	 * either by jobs on different adevs in an XGMI hive or by jobs on
+	 * different schedulers of the same device, while this TO handler runs.
+	 * We always reset all schedulers of a device and all devices in an
+	 * XGMI hive, so that should take care of them too.
 	 */
-	hive = amdgpu_get_xgmi_hive(adev, 0);
-	if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
-	    !mutex_trylock(&hive->reset_lock))
+
+	if (hive && !mutex_trylock(&hive->reset_lock)) {
+		DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
+			 job->base.id, hive->hive_id);
 		return 0;
+	}
 
 	/* Start with adev pre asic reset first for soft reset check.*/
-	amdgpu_device_lock_adev(adev);
-	r = amdgpu_device_pre_asic_reset(adev,
-					 job,
-					 &need_full_reset);
-	if (r) {
-		/*TODO Should we stop ?*/
-		DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
-			  r, adev->ddev->unique);
-		adev->asic_reset_res = r;
+	if (!amdgpu_device_lock_adev(adev, !hive)) {
+		DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
+					 job->base.id);
+		return 0;
 	}
 
 	/* Build list of devices to reset */
-	if  (need_full_reset && adev->gmc.xgmi.num_physical_nodes > 1) {
+	if  (adev->gmc.xgmi.num_physical_nodes > 1) {
 		if (!hive) {
 			amdgpu_device_unlock_adev(adev);
 			return -ENODEV;
@@ -3588,13 +3573,56 @@  int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		device_list_handle = &device_list;
 	}
 
+	/* block all schedulers and reset given job's ring */
+	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+			struct amdgpu_ring *ring = tmp_adev->rings[i];
+
+			if (!ring || !ring->sched.thread)
+				continue;
+
+			drm_sched_stop(&ring->sched, &job->base);
+		}
+	}
+
+
+	/*
+	 * Must check whether the guilty job's fence has signaled here, since
+	 * after this point all old HW fences are force signaled.
+	 *
+	 * job->base holds a reference to the parent fence.
+	 */
+	if (job && job->base.s_fence->parent &&
+	    dma_fence_is_signaled(job->base.s_fence->parent))
+		job_signaled = true;
+
+	if (!amdgpu_device_ip_need_full_reset(adev))
+		device_list_handle = &device_list;
+
+	if (job_signaled) {
+		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
+		goto skip_hw_reset;
+	}
+
+
+	/* Guilty job will be freed after this */
+	r = amdgpu_device_pre_asic_reset(adev,
+					 job,
+					 &need_full_reset);
+	if (r) {
+		/* TODO: should we stop here? */
+		DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
+			  r, adev->ddev->unique);
+		adev->asic_reset_res = r;
+	}
+
 retry:	/* Rest of adevs pre asic reset from XGMI hive. */
 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
 
 		if (tmp_adev == adev)
 			continue;
 
-		amdgpu_device_lock_adev(tmp_adev);
+		amdgpu_device_lock_adev(tmp_adev, false);
 		r = amdgpu_device_pre_asic_reset(tmp_adev,
 						 NULL,
 						 &need_full_reset);
@@ -3618,9 +3646,28 @@  int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 			goto retry;
 	}
 
+skip_hw_reset:
+
 	/* Post ASIC reset for all devs .*/
 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-		amdgpu_device_post_asic_reset(tmp_adev);
+		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+			struct amdgpu_ring *ring = tmp_adev->rings[i];
+
+			if (!ring || !ring->sched.thread)
+				continue;
+
+			/* No point in resubmitting jobs if we didn't do a HW reset */
+			if (!tmp_adev->asic_reset_res && !job_signaled)
+				drm_sched_resubmit_jobs(&ring->sched);
+
+			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
+		}
+
+		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
+			drm_helper_resume_force_mode(tmp_adev->ddev);
+		}
+
+		tmp_adev->asic_reset_res = 0;
 
 		if (r) {
 			/* bad news, how to tell it to userspace ? */
@@ -3633,7 +3680,7 @@  int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		amdgpu_device_unlock_adev(tmp_adev);
 	}
 
-	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
+	if (hive)
 		mutex_unlock(&hive->reset_lock);
 
 	if (r)