diff mbox series

[2/2] drm/amdgpu/display: buffer INTERRUPT_LOW_IRQ_CONTEXT interrupt work

Message ID 1609740098-32603-2-git-send-email-xiaogang.chen@amd.com (mailing list archive)
State New, archived
Headers show
Series [1/2] drm: distinguish return value of drm_dp_check_and_send_link_address. | expand

Commit Message

Chen, Xiaogang Jan. 4, 2021, 6:01 a.m. UTC
From: Xiaogang Chen <xiaogang.chen@amd.com>

amdgpu DM handles INTERRUPT_LOW_IRQ_CONTEXT interrupt(hpd, hpd_rx) by
using work queue and uses single work_struct. If previous interrupt
has not been handled new interrupts(same type) will be discarded and
driver just sends "amdgpu_dm_irq_schedule_work FAILED" message out.
If some important hpd, hpd_rx related interrupts are missed by driver
the hot (un)plug devices may cause system hang or unstable, such as
system resumes from S3 sleep with mst device connected.

This patch dynamically allocates new amdgpu_dm_irq_handler_data for
new interrupts if previous INTERRUPT_LOW_IRQ_CONTEXT interrupt work
has not been handled. So the new interrupt works can be queued to the
same workqueue_struct, instead discard the new interrupts.
All allocated amdgpu_dm_irq_handler_data are put into a single linked
list and will be reused after.

Signed-off-by: Xiaogang Chen <xiaogang.chen@amd.com>
---
 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h  |  14 +--
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c  | 114 ++++++++++++++-------
 2 files changed, 80 insertions(+), 48 deletions(-)

Comments

Chen, Xiaogang Jan. 12, 2021, 6:37 a.m. UTC | #1
[AMD Official Use Only - Internal Distribution Only]

Any comment?

-----Original Message-----
From: Xiaogang.Chen <xiaogang.chen@amd.com> 
Sent: Monday, January 4, 2021 12:02 AM
To: amd-gfx@lists.freedesktop.org; Wentland, Harry <Harry.Wentland@amd.com>; dri-devel@lists.freedesktop.org; airlied@linux.ie
Cc: Chen, Xiaogang <Xiaogang.Chen@amd.com>
Subject: [PATCH 2/2] drm/amdgpu/display: buffer INTERRUPT_LOW_IRQ_CONTEXT interrupt work

From: Xiaogang Chen <xiaogang.chen@amd.com>

amdgpu DM handles INTERRUPT_LOW_IRQ_CONTEXT interrupt(hpd, hpd_rx) by using work queue and uses single work_struct. If previous interrupt has not been handled new interrupts(same type) will be discarded and driver just sends "amdgpu_dm_irq_schedule_work FAILED" message out.
If some important hpd, hpd_rx related interrupts are missed by driver the hot (un)plug devices may cause system hang or unstable, such as system resumes from S3 sleep with mst device connected.

This patch dynamically allocates new amdgpu_dm_irq_handler_data for new interrupts if previous INTERRUPT_LOW_IRQ_CONTEXT interrupt work has not been handled. So the new interrupt works can be queued to the same workqueue_struct, instead discard the new interrupts.
All allocated amdgpu_dm_irq_handler_data are put into a single linked list and will be reused after.

Signed-off-by: Xiaogang Chen <xiaogang.chen@amd.com>
---
 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h  |  14 +--  .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c  | 114 ++++++++++++++-------
 2 files changed, 80 insertions(+), 48 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
index c9d82b9..730e540 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
@@ -69,18 +69,6 @@ struct common_irq_params {  };
 
 /**
- * struct irq_list_head - Linked-list for low context IRQ handlers.
- *
- * @head: The list_head within &struct handler_data
- * @work: A work_struct containing the deferred handler work
- */
-struct irq_list_head {
-	struct list_head head;
-	/* In case this interrupt needs post-processing, 'work' will be queued*/
-	struct work_struct work;
-};
-
-/**
  * struct dm_compressor_info - Buffer info used by frame buffer compression
  * @cpu_addr: MMIO cpu addr
  * @bo_ptr: Pointer to the buffer object @@ -270,7 +258,7 @@ struct amdgpu_display_manager {
 	 * Note that handlers are called in the same order as they were
 	 * registered (FIFO).
 	 */
-	struct irq_list_head irq_handler_list_low_tab[DAL_IRQ_SOURCES_NUMBER];
+	struct list_head irq_handler_list_low_tab[DAL_IRQ_SOURCES_NUMBER];
 
 	/**
 	 * @irq_handler_list_high_tab:
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
index 3577785..ada344a 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
@@ -82,6 +82,7 @@ struct amdgpu_dm_irq_handler_data {
 	struct amdgpu_display_manager *dm;
 	/* DAL irq source which registered for this interrupt. */
 	enum dc_irq_source irq_source;
+	struct work_struct work;
 };
 
 #define DM_IRQ_TABLE_LOCK(adev, flags) \ @@ -111,20 +112,10 @@ static void init_handler_common_data(struct amdgpu_dm_irq_handler_data *hcd,
  */
 static void dm_irq_work_func(struct work_struct *work)  {
-	struct irq_list_head *irq_list_head =
-		container_of(work, struct irq_list_head, work);
-	struct list_head *handler_list = &irq_list_head->head;
-	struct amdgpu_dm_irq_handler_data *handler_data;
-
-	list_for_each_entry(handler_data, handler_list, list) {
-		DRM_DEBUG_KMS("DM_IRQ: work_func: for dal_src=%d\n",
-				handler_data->irq_source);
+	struct amdgpu_dm_irq_handler_data *handler_data =
+	 container_of(work, struct amdgpu_dm_irq_handler_data, work);
 
-		DRM_DEBUG_KMS("DM_IRQ: schedule_work: for dal_src=%d\n",
-			handler_data->irq_source);
-
-		handler_data->handler(handler_data->handler_arg);
-	}
+	handler_data->handler(handler_data->handler_arg);
 
 	/* Call a DAL subcomponent which registered for interrupt notification
 	 * at INTERRUPT_LOW_IRQ_CONTEXT.
@@ -156,7 +147,7 @@ static struct list_head *remove_irq_handler(struct amdgpu_device *adev,
 		break;
 	case INTERRUPT_LOW_IRQ_CONTEXT:
 	default:
-		hnd_list = &adev->dm.irq_handler_list_low_tab[irq_source].head;
+		hnd_list = &adev->dm.irq_handler_list_low_tab[irq_source];
 		break;
 	}
 
@@ -287,7 +278,8 @@ void *amdgpu_dm_irq_register_interrupt(struct amdgpu_device *adev,
 		break;
 	case INTERRUPT_LOW_IRQ_CONTEXT:
 	default:
-		hnd_list = &adev->dm.irq_handler_list_low_tab[irq_source].head;
+		hnd_list = &adev->dm.irq_handler_list_low_tab[irq_source];
+		INIT_WORK(&handler_data->work, dm_irq_work_func);
 		break;
 	}
 
@@ -369,7 +361,7 @@ void amdgpu_dm_irq_unregister_interrupt(struct amdgpu_device *adev,  int amdgpu_dm_irq_init(struct amdgpu_device *adev)  {
 	int src;
-	struct irq_list_head *lh;
+	struct list_head *lh;
 
 	DRM_DEBUG_KMS("DM_IRQ\n");
 
@@ -378,9 +370,7 @@ int amdgpu_dm_irq_init(struct amdgpu_device *adev)
 	for (src = 0; src < DAL_IRQ_SOURCES_NUMBER; src++) {
 		/* low context handler list init */
 		lh = &adev->dm.irq_handler_list_low_tab[src];
-		INIT_LIST_HEAD(&lh->head);
-		INIT_WORK(&lh->work, dm_irq_work_func);
-
+		INIT_LIST_HEAD(lh);
 		/* high context handler init */
 		INIT_LIST_HEAD(&adev->dm.irq_handler_list_high_tab[src]);
 	}
@@ -397,8 +387,11 @@ int amdgpu_dm_irq_init(struct amdgpu_device *adev)  void amdgpu_dm_irq_fini(struct amdgpu_device *adev)  {
 	int src;
-	struct irq_list_head *lh;
+	struct list_head *lh;
+	struct list_head *entry, *tmp;
+	struct amdgpu_dm_irq_handler_data *handler;
 	unsigned long irq_table_flags;
+
 	DRM_DEBUG_KMS("DM_IRQ: releasing resources.\n");
 	for (src = 0; src < DAL_IRQ_SOURCES_NUMBER; src++) {
 		DM_IRQ_TABLE_LOCK(adev, irq_table_flags); @@ -407,7 +400,15 @@ void amdgpu_dm_irq_fini(struct amdgpu_device *adev)
 		 * (because no code can schedule a new one). */
 		lh = &adev->dm.irq_handler_list_low_tab[src];
 		DM_IRQ_TABLE_UNLOCK(adev, irq_table_flags);
-		flush_work(&lh->work);
+
+		if (!list_empty(lh)) {
+			list_for_each_safe(entry, tmp, lh) {
+
+				handler = list_entry(entry, struct amdgpu_dm_irq_handler_data,
+									 list);
+				flush_work(&handler->work);
+			}
+		}
 	}
 }
 
@@ -417,6 +418,8 @@ int amdgpu_dm_irq_suspend(struct amdgpu_device *adev)
 	struct list_head *hnd_list_h;
 	struct list_head *hnd_list_l;
 	unsigned long irq_table_flags;
+	struct list_head *entry, *tmp;
+	struct amdgpu_dm_irq_handler_data *handler;
 
 	DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
 
@@ -427,14 +430,22 @@ int amdgpu_dm_irq_suspend(struct amdgpu_device *adev)
 	 * will be disabled from manage_dm_interrupts on disable CRTC.
 	 */
 	for (src = DC_IRQ_SOURCE_HPD1; src <= DC_IRQ_SOURCE_HPD6RX; src++) {
-		hnd_list_l = &adev->dm.irq_handler_list_low_tab[src].head;
+		hnd_list_l = &adev->dm.irq_handler_list_low_tab[src];
 		hnd_list_h = &adev->dm.irq_handler_list_high_tab[src];
 		if (!list_empty(hnd_list_l) || !list_empty(hnd_list_h))
 			dc_interrupt_set(adev->dm.dc, src, false);
 
 		DM_IRQ_TABLE_UNLOCK(adev, irq_table_flags);
-		flush_work(&adev->dm.irq_handler_list_low_tab[src].work);
 
+		if (!list_empty(hnd_list_l)) {
+
+			list_for_each_safe(entry, tmp, hnd_list_l) {
+
+				handler = list_entry(entry, struct amdgpu_dm_irq_handler_data,
+									 list);
+				flush_work(&handler->work);
+			}
+		}
 		DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
 	}
 
@@ -454,7 +465,7 @@ int amdgpu_dm_irq_resume_early(struct amdgpu_device *adev)
 
 	/* re-enable short pulse interrupts HW interrupt */
 	for (src = DC_IRQ_SOURCE_HPD1RX; src <= DC_IRQ_SOURCE_HPD6RX; src++) {
-		hnd_list_l = &adev->dm.irq_handler_list_low_tab[src].head;
+		hnd_list_l = &adev->dm.irq_handler_list_low_tab[src];
 		hnd_list_h = &adev->dm.irq_handler_list_high_tab[src];
 		if (!list_empty(hnd_list_l) || !list_empty(hnd_list_h))
 			dc_interrupt_set(adev->dm.dc, src, true); @@ -480,7 +491,7 @@ int amdgpu_dm_irq_resume_late(struct amdgpu_device *adev)
 	 * will be enabled from manage_dm_interrupts on enable CRTC.
 	 */
 	for (src = DC_IRQ_SOURCE_HPD1; src <= DC_IRQ_SOURCE_HPD6; src++) {
-		hnd_list_l = &adev->dm.irq_handler_list_low_tab[src].head;
+		hnd_list_l = &adev->dm.irq_handler_list_low_tab[src];
 		hnd_list_h = &adev->dm.irq_handler_list_high_tab[src];
 		if (!list_empty(hnd_list_l) || !list_empty(hnd_list_h))
 			dc_interrupt_set(adev->dm.dc, src, true); @@ -497,20 +508,53 @@ int amdgpu_dm_irq_resume_late(struct amdgpu_device *adev)  static void amdgpu_dm_irq_schedule_work(struct amdgpu_device *adev,
 					enum dc_irq_source irq_source)
 {
-	unsigned long irq_table_flags;
-	struct work_struct *work = NULL;
 
-	DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
+	struct  list_head *handler_list = &adev->dm.irq_handler_list_low_tab[irq_source];
+	struct  amdgpu_dm_irq_handler_data *handler_data;
+	bool    work_queued = false;
 
-	if (!list_empty(&adev->dm.irq_handler_list_low_tab[irq_source].head))
-		work = &adev->dm.irq_handler_list_low_tab[irq_source].work;
+	if (list_empty(handler_list))
+		return;
 
-	DM_IRQ_TABLE_UNLOCK(adev, irq_table_flags);
+	list_for_each_entry(handler_data, handler_list, list) {
+
+		if (!queue_work(system_highpri_wq, &handler_data->work)) {
+			continue;
+		} else {
+			work_queued = true;
+			break;
+		}
+	}
+
+	if (!work_queued) {
+
+		struct  amdgpu_dm_irq_handler_data *handler_data_add;
+		/*get the amdgpu_dm_irq_handler_data of first item pointed by handler_list*/
+		handler_data = container_of(handler_list->next, struct 
+amdgpu_dm_irq_handler_data, list);
+
+		/*allocate a new amdgpu_dm_irq_handler_data*/
+		handler_data_add = kzalloc(sizeof(*handler_data), GFP_KERNEL);
+		if (!handler_data_add) {
+			DRM_ERROR("DM_IRQ: failed to allocate irq handler!\n");
+			return;
+		}
+
+		/*copy new amdgpu_dm_irq_handler_data members from handler_data*/
+		handler_data_add->handler       = handler_data->handler;
+		handler_data_add->handler_arg   = handler_data->handler_arg;
+		handler_data_add->dm            = handler_data->dm;
+		handler_data_add->irq_source    = irq_source;
+
+		list_add_tail(&handler_data_add->list, handler_list);
+
+		INIT_WORK(&handler_data_add->work, dm_irq_work_func);
 
-	if (work) {
-		if (!schedule_work(work))
-			DRM_INFO("amdgpu_dm_irq_schedule_work FAILED src %d\n",
-						irq_source);
+		if (queue_work(system_highpri_wq, &handler_data_add->work))
+			DRM_DEBUG("__func__: a work_struct is allocated and queued, "
+					 "src %d\n", irq_source);
+		else
+			DRM_ERROR("__func__: a new work_struct cannot be queued, "
+					  "something is wrong, src %d\n", irq_source);
 	}
 
 }
--
2.7.4
Andrey Grodzovsky Jan. 13, 2021, 4:54 a.m. UTC | #2
On 1/4/21 1:01 AM, Xiaogang.Chen wrote:
> From: Xiaogang Chen <xiaogang.chen@amd.com>
>
> amdgpu DM handles INTERRUPT_LOW_IRQ_CONTEXT interrupt(hpd, hpd_rx) by
> using work queue and uses single work_struct. If previous interrupt
> has not been handled new interrupts(same type) will be discarded and
> driver just sends "amdgpu_dm_irq_schedule_work FAILED" message out.
> If some important hpd, hpd_rx related interrupts are missed by driver
> the hot (un)plug devices may cause system hang or unstable, such as
> system resumes from S3 sleep with mst device connected.
>
> This patch dynamically allocates new amdgpu_dm_irq_handler_data for
> new interrupts if previous INTERRUPT_LOW_IRQ_CONTEXT interrupt work
> has not been handled. So the new interrupt works can be queued to the
> same workqueue_struct, instead discard the new interrupts.
> All allocated amdgpu_dm_irq_handler_data are put into a single linked
> list and will be reused after.


I believe this creates a possible concurrency between already executing work item
and the new incoming one for which you allocate a new work item on the fly. While
handle_hpd_irq is serialized with aconnector->hpd_lock I am seeing that for 
handle_hpd_rx_irq
it's not locked for MST use case (which is the most frequently used with this 
interrupt).  Did you
verified that handle_hpd_rx_irq is reentrant ?


>
> Signed-off-by: Xiaogang Chen <xiaogang.chen@amd.com>
> ---
>   drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h  |  14 +--
>   .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c  | 114 ++++++++++++++-------
>   2 files changed, 80 insertions(+), 48 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
> index c9d82b9..730e540 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
> @@ -69,18 +69,6 @@ struct common_irq_params {
>   };
>   
>   /**
> - * struct irq_list_head - Linked-list for low context IRQ handlers.
> - *
> - * @head: The list_head within &struct handler_data
> - * @work: A work_struct containing the deferred handler work
> - */
> -struct irq_list_head {
> -	struct list_head head;
> -	/* In case this interrupt needs post-processing, 'work' will be queued*/
> -	struct work_struct work;
> -};
> -
> -/**
>    * struct dm_compressor_info - Buffer info used by frame buffer compression
>    * @cpu_addr: MMIO cpu addr
>    * @bo_ptr: Pointer to the buffer object
> @@ -270,7 +258,7 @@ struct amdgpu_display_manager {
>   	 * Note that handlers are called in the same order as they were
>   	 * registered (FIFO).
>   	 */
> -	struct irq_list_head irq_handler_list_low_tab[DAL_IRQ_SOURCES_NUMBER];
> +	struct list_head irq_handler_list_low_tab[DAL_IRQ_SOURCES_NUMBER];
>   
>   	/**
>   	 * @irq_handler_list_high_tab:
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
> index 3577785..ada344a 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
> @@ -82,6 +82,7 @@ struct amdgpu_dm_irq_handler_data {
>   	struct amdgpu_display_manager *dm;
>   	/* DAL irq source which registered for this interrupt. */
>   	enum dc_irq_source irq_source;
> +	struct work_struct work;
>   };
>   
>   #define DM_IRQ_TABLE_LOCK(adev, flags) \
> @@ -111,20 +112,10 @@ static void init_handler_common_data(struct amdgpu_dm_irq_handler_data *hcd,
>    */
>   static void dm_irq_work_func(struct work_struct *work)
>   {
> -	struct irq_list_head *irq_list_head =
> -		container_of(work, struct irq_list_head, work);
> -	struct list_head *handler_list = &irq_list_head->head;
> -	struct amdgpu_dm_irq_handler_data *handler_data;
> -
> -	list_for_each_entry(handler_data, handler_list, list) {
> -		DRM_DEBUG_KMS("DM_IRQ: work_func: for dal_src=%d\n",
> -				handler_data->irq_source);
> +	struct amdgpu_dm_irq_handler_data *handler_data =
> +	 container_of(work, struct amdgpu_dm_irq_handler_data, work);
>   
> -		DRM_DEBUG_KMS("DM_IRQ: schedule_work: for dal_src=%d\n",
> -			handler_data->irq_source);
> -
> -		handler_data->handler(handler_data->handler_arg);
> -	}
> +	handler_data->handler(handler_data->handler_arg);
>   
>   	/* Call a DAL subcomponent which registered for interrupt notification
>   	 * at INTERRUPT_LOW_IRQ_CONTEXT.
> @@ -156,7 +147,7 @@ static struct list_head *remove_irq_handler(struct amdgpu_device *adev,
>   		break;
>   	case INTERRUPT_LOW_IRQ_CONTEXT:
>   	default:
> -		hnd_list = &adev->dm.irq_handler_list_low_tab[irq_source].head;
> +		hnd_list = &adev->dm.irq_handler_list_low_tab[irq_source];
>   		break;
>   	}
>   
> @@ -287,7 +278,8 @@ void *amdgpu_dm_irq_register_interrupt(struct amdgpu_device *adev,
>   		break;
>   	case INTERRUPT_LOW_IRQ_CONTEXT:
>   	default:
> -		hnd_list = &adev->dm.irq_handler_list_low_tab[irq_source].head;
> +		hnd_list = &adev->dm.irq_handler_list_low_tab[irq_source];
> +		INIT_WORK(&handler_data->work, dm_irq_work_func);
>   		break;
>   	}
>   
> @@ -369,7 +361,7 @@ void amdgpu_dm_irq_unregister_interrupt(struct amdgpu_device *adev,
>   int amdgpu_dm_irq_init(struct amdgpu_device *adev)
>   {
>   	int src;
> -	struct irq_list_head *lh;
> +	struct list_head *lh;
>   
>   	DRM_DEBUG_KMS("DM_IRQ\n");
>   
> @@ -378,9 +370,7 @@ int amdgpu_dm_irq_init(struct amdgpu_device *adev)
>   	for (src = 0; src < DAL_IRQ_SOURCES_NUMBER; src++) {
>   		/* low context handler list init */
>   		lh = &adev->dm.irq_handler_list_low_tab[src];
> -		INIT_LIST_HEAD(&lh->head);
> -		INIT_WORK(&lh->work, dm_irq_work_func);
> -
> +		INIT_LIST_HEAD(lh);
>   		/* high context handler init */
>   		INIT_LIST_HEAD(&adev->dm.irq_handler_list_high_tab[src]);
>   	}
> @@ -397,8 +387,11 @@ int amdgpu_dm_irq_init(struct amdgpu_device *adev)
>   void amdgpu_dm_irq_fini(struct amdgpu_device *adev)
>   {
>   	int src;
> -	struct irq_list_head *lh;
> +	struct list_head *lh;
> +	struct list_head *entry, *tmp;
> +	struct amdgpu_dm_irq_handler_data *handler;
>   	unsigned long irq_table_flags;
> +
>   	DRM_DEBUG_KMS("DM_IRQ: releasing resources.\n");
>   	for (src = 0; src < DAL_IRQ_SOURCES_NUMBER; src++) {
>   		DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
> @@ -407,7 +400,15 @@ void amdgpu_dm_irq_fini(struct amdgpu_device *adev)
>   		 * (because no code can schedule a new one). */
>   		lh = &adev->dm.irq_handler_list_low_tab[src];
>   		DM_IRQ_TABLE_UNLOCK(adev, irq_table_flags);
> -		flush_work(&lh->work);
> +
> +		if (!list_empty(lh)) {
> +			list_for_each_safe(entry, tmp, lh) {
> +
> +				handler = list_entry(entry, struct amdgpu_dm_irq_handler_data,
> +									 list);
> +				flush_work(&handler->work);
> +			}
> +		}
>   	}
>   }
>   
> @@ -417,6 +418,8 @@ int amdgpu_dm_irq_suspend(struct amdgpu_device *adev)
>   	struct list_head *hnd_list_h;
>   	struct list_head *hnd_list_l;
>   	unsigned long irq_table_flags;
> +	struct list_head *entry, *tmp;
> +	struct amdgpu_dm_irq_handler_data *handler;
>   
>   	DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
>   
> @@ -427,14 +430,22 @@ int amdgpu_dm_irq_suspend(struct amdgpu_device *adev)
>   	 * will be disabled from manage_dm_interrupts on disable CRTC.
>   	 */
>   	for (src = DC_IRQ_SOURCE_HPD1; src <= DC_IRQ_SOURCE_HPD6RX; src++) {
> -		hnd_list_l = &adev->dm.irq_handler_list_low_tab[src].head;
> +		hnd_list_l = &adev->dm.irq_handler_list_low_tab[src];
>   		hnd_list_h = &adev->dm.irq_handler_list_high_tab[src];
>   		if (!list_empty(hnd_list_l) || !list_empty(hnd_list_h))
>   			dc_interrupt_set(adev->dm.dc, src, false);
>   
>   		DM_IRQ_TABLE_UNLOCK(adev, irq_table_flags);
> -		flush_work(&adev->dm.irq_handler_list_low_tab[src].work);
>   
> +		if (!list_empty(hnd_list_l)) {
> +
> +			list_for_each_safe(entry, tmp, hnd_list_l) {
> +
> +				handler = list_entry(entry, struct amdgpu_dm_irq_handler_data,
> +									 list);
> +				flush_work(&handler->work);
> +			}
> +		}
>   		DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
>   	}
>   
> @@ -454,7 +465,7 @@ int amdgpu_dm_irq_resume_early(struct amdgpu_device *adev)
>   
>   	/* re-enable short pulse interrupts HW interrupt */
>   	for (src = DC_IRQ_SOURCE_HPD1RX; src <= DC_IRQ_SOURCE_HPD6RX; src++) {
> -		hnd_list_l = &adev->dm.irq_handler_list_low_tab[src].head;
> +		hnd_list_l = &adev->dm.irq_handler_list_low_tab[src];
>   		hnd_list_h = &adev->dm.irq_handler_list_high_tab[src];
>   		if (!list_empty(hnd_list_l) || !list_empty(hnd_list_h))
>   			dc_interrupt_set(adev->dm.dc, src, true);
> @@ -480,7 +491,7 @@ int amdgpu_dm_irq_resume_late(struct amdgpu_device *adev)
>   	 * will be enabled from manage_dm_interrupts on enable CRTC.
>   	 */
>   	for (src = DC_IRQ_SOURCE_HPD1; src <= DC_IRQ_SOURCE_HPD6; src++) {
> -		hnd_list_l = &adev->dm.irq_handler_list_low_tab[src].head;
> +		hnd_list_l = &adev->dm.irq_handler_list_low_tab[src];
>   		hnd_list_h = &adev->dm.irq_handler_list_high_tab[src];
>   		if (!list_empty(hnd_list_l) || !list_empty(hnd_list_h))
>   			dc_interrupt_set(adev->dm.dc, src, true);
> @@ -497,20 +508,53 @@ int amdgpu_dm_irq_resume_late(struct amdgpu_device *adev)
>   static void amdgpu_dm_irq_schedule_work(struct amdgpu_device *adev,
>   					enum dc_irq_source irq_source)
>   {
> -	unsigned long irq_table_flags;
> -	struct work_struct *work = NULL;
>   
> -	DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
> +	struct  list_head *handler_list = &adev->dm.irq_handler_list_low_tab[irq_source];
> +	struct  amdgpu_dm_irq_handler_data *handler_data;
> +	bool    work_queued = false;
>   
> -	if (!list_empty(&adev->dm.irq_handler_list_low_tab[irq_source].head))
> -		work = &adev->dm.irq_handler_list_low_tab[irq_source].work;
> +	if (list_empty(handler_list))
> +		return;
>   
> -	DM_IRQ_TABLE_UNLOCK(adev, irq_table_flags);
> +	list_for_each_entry(handler_data, handler_list, list) {
> +
> +		if (!queue_work(system_highpri_wq, &handler_data->work)) {
> +			continue;
> +		} else {
> +			work_queued = true;
> +			break;
> +		}
> +	}
> +
> +	if (!work_queued) {
> +
> +		struct  amdgpu_dm_irq_handler_data *handler_data_add;
> +		/*get the amdgpu_dm_irq_handler_data of first item pointed by handler_list*/
> +		handler_data = container_of(handler_list->next, struct amdgpu_dm_irq_handler_data, list);
> +
> +		/*allocate a new amdgpu_dm_irq_handler_data*/
> +		handler_data_add = kzalloc(sizeof(*handler_data), GFP_KERNEL);
> +		if (!handler_data_add) {
> +			DRM_ERROR("DM_IRQ: failed to allocate irq handler!\n");
> +			return;
> +		}
> +
> +		/*copy new amdgpu_dm_irq_handler_data members from handler_data*/
> +		handler_data_add->handler       = handler_data->handler;
> +		handler_data_add->handler_arg   = handler_data->handler_arg;
> +		handler_data_add->dm            = handler_data->dm;
> +		handler_data_add->irq_source    = irq_source;
> +
> +		list_add_tail(&handler_data_add->list, handler_list);


At what place are you deleting completed work items from the handler_list ?

Andrey


> +
> +		INIT_WORK(&handler_data_add->work, dm_irq_work_func);
>   
> -	if (work) {
> -		if (!schedule_work(work))
> -			DRM_INFO("amdgpu_dm_irq_schedule_work FAILED src %d\n",
> -						irq_source);
> +		if (queue_work(system_highpri_wq, &handler_data_add->work))
> +			DRM_DEBUG("__func__: a work_struct is allocated and queued, "
> +					 "src %d\n", irq_source);
> +		else
> +			DRM_ERROR("__func__: a new work_struct cannot be queued, "
> +					  "something is wrong, src %d\n", irq_source);
>   	}
>   
>   }
Chen, Xiaogang Jan. 14, 2021, 5:11 a.m. UTC | #3
On 1/12/2021 10:54 PM, Grodzovsky, Andrey wrote:

On 1/4/21 1:01 AM, Xiaogang.Chen wrote:
From: Xiaogang Chen <xiaogang.chen@amd.com><mailto:xiaogang.chen@amd.com>

amdgpu DM handles INTERRUPT_LOW_IRQ_CONTEXT interrupt(hpd, hpd_rx) by
using work queue and uses single work_struct. If previous interrupt
has not been handled new interrupts(same type) will be discarded and
driver just sends "amdgpu_dm_irq_schedule_work FAILED" message out.
If some important hpd, hpd_rx related interrupts are missed by driver
the hot (un)plug devices may cause system hang or unstable, such as
system resumes from S3 sleep with mst device connected.

This patch dynamically allocates new amdgpu_dm_irq_handler_data for
new interrupts if previous INTERRUPT_LOW_IRQ_CONTEXT interrupt work
has not been handled. So the new interrupt works can be queued to the
same workqueue_struct, instead discard the new interrupts.
All allocated amdgpu_dm_irq_handler_data are put into a single linked
list and will be reused after.


I believe this creates a possible concurrency between already executing work item
and the new incoming one for which you allocate a new work item on the fly. While
handle_hpd_irq is serialized with aconnector->hpd_lock I am seeing that for handle_hpd_rx_irq
it's not locked for MST use case (which is the most frequently used with this interrupt).  Did you
verified that handle_hpd_rx_irq is reentrant ?


handle_hpd_rx_irq is put at a work queue. Its execution is serialized by the work queue. So there is no reentrant.

amdgpu_dm_irq_schedule_work does queuing of work(put handle_hpd_rx_irq into work queue). The first call is dm_irq_work_func, then call handle_hpd_rx_irq.


Signed-off-by: Xiaogang Chen <xiaogang.chen@amd.com><mailto:xiaogang.chen@amd.com>
---
  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h  |  14 +--
  .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c  | 114 ++++++++++++++-------
  2 files changed, 80 insertions(+), 48 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
index c9d82b9..730e540 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
@@ -69,18 +69,6 @@ struct common_irq_params {
  };
    /**
- * struct irq_list_head - Linked-list for low context IRQ handlers.
- *
- * @head: The list_head within &struct handler_data
- * @work: A work_struct containing the deferred handler work
- */
-struct irq_list_head {
-    struct list_head head;
-    /* In case this interrupt needs post-processing, 'work' will be queued*/
-    struct work_struct work;
-};
-
-/**
   * struct dm_compressor_info - Buffer info used by frame buffer compression
   * @cpu_addr: MMIO cpu addr
   * @bo_ptr: Pointer to the buffer object
@@ -270,7 +258,7 @@ struct amdgpu_display_manager {
       * Note that handlers are called in the same order as they were
       * registered (FIFO).
       */
-    struct irq_list_head irq_handler_list_low_tab[DAL_IRQ_SOURCES_NUMBER];
+    struct list_head irq_handler_list_low_tab[DAL_IRQ_SOURCES_NUMBER];
        /**
       * @irq_handler_list_high_tab:
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
index 3577785..ada344a 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
@@ -82,6 +82,7 @@ struct amdgpu_dm_irq_handler_data {
      struct amdgpu_display_manager *dm;
      /* DAL irq source which registered for this interrupt. */
      enum dc_irq_source irq_source;
+    struct work_struct work;
  };
    #define DM_IRQ_TABLE_LOCK(adev, flags) \
@@ -111,20 +112,10 @@ static void init_handler_common_data(struct amdgpu_dm_irq_handler_data *hcd,
   */
  static void dm_irq_work_func(struct work_struct *work)
  {
-    struct irq_list_head *irq_list_head =
-        container_of(work, struct irq_list_head, work);
-    struct list_head *handler_list = &irq_list_head->head;
-    struct amdgpu_dm_irq_handler_data *handler_data;
-
-    list_for_each_entry(handler_data, handler_list, list) {
-        DRM_DEBUG_KMS("DM_IRQ: work_func: for dal_src=%d\n",
-                handler_data->irq_source);
+    struct amdgpu_dm_irq_handler_data *handler_data =
+     container_of(work, struct amdgpu_dm_irq_handler_data, work);
  -        DRM_DEBUG_KMS("DM_IRQ: schedule_work: for dal_src=%d\n",
-            handler_data->irq_source);
-
-        handler_data->handler(handler_data->handler_arg);
-    }
+    handler_data->handler(handler_data->handler_arg);
        /* Call a DAL subcomponent which registered for interrupt notification
       * at INTERRUPT_LOW_IRQ_CONTEXT.
@@ -156,7 +147,7 @@ static struct list_head *remove_irq_handler(struct amdgpu_device *adev,
          break;
      case INTERRUPT_LOW_IRQ_CONTEXT:
      default:
-        hnd_list = &adev->dm.irq_handler_list_low_tab[irq_source].head;
+        hnd_list = &adev->dm.irq_handler_list_low_tab[irq_source];
          break;
      }
  @@ -287,7 +278,8 @@ void *amdgpu_dm_irq_register_interrupt(struct amdgpu_device *adev,
          break;
      case INTERRUPT_LOW_IRQ_CONTEXT:
      default:
-        hnd_list = &adev->dm.irq_handler_list_low_tab[irq_source].head;
+        hnd_list = &adev->dm.irq_handler_list_low_tab[irq_source];
+        INIT_WORK(&handler_data->work, dm_irq_work_func);
          break;
      }
  @@ -369,7 +361,7 @@ void amdgpu_dm_irq_unregister_interrupt(struct amdgpu_device *adev,
  int amdgpu_dm_irq_init(struct amdgpu_device *adev)
  {
      int src;
-    struct irq_list_head *lh;
+    struct list_head *lh;
        DRM_DEBUG_KMS("DM_IRQ\n");
  @@ -378,9 +370,7 @@ int amdgpu_dm_irq_init(struct amdgpu_device *adev)
      for (src = 0; src < DAL_IRQ_SOURCES_NUMBER; src++) {
          /* low context handler list init */
          lh = &adev->dm.irq_handler_list_low_tab[src];
-        INIT_LIST_HEAD(&lh->head);
-        INIT_WORK(&lh->work, dm_irq_work_func);
-
+        INIT_LIST_HEAD(lh);
          /* high context handler init */
          INIT_LIST_HEAD(&adev->dm.irq_handler_list_high_tab[src]);
      }
@@ -397,8 +387,11 @@ int amdgpu_dm_irq_init(struct amdgpu_device *adev)
  void amdgpu_dm_irq_fini(struct amdgpu_device *adev)
  {
      int src;
-    struct irq_list_head *lh;
+    struct list_head *lh;
+    struct list_head *entry, *tmp;
+    struct amdgpu_dm_irq_handler_data *handler;
      unsigned long irq_table_flags;
+
      DRM_DEBUG_KMS("DM_IRQ: releasing resources.\n");
      for (src = 0; src < DAL_IRQ_SOURCES_NUMBER; src++) {
          DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
@@ -407,7 +400,15 @@ void amdgpu_dm_irq_fini(struct amdgpu_device *adev)
           * (because no code can schedule a new one). */
          lh = &adev->dm.irq_handler_list_low_tab[src];
          DM_IRQ_TABLE_UNLOCK(adev, irq_table_flags);
-        flush_work(&lh->work);
+
+        if (!list_empty(lh)) {
+            list_for_each_safe(entry, tmp, lh) {
+
+                handler = list_entry(entry, struct amdgpu_dm_irq_handler_data,
+                                     list);
+                flush_work(&handler->work);
+            }
+        }
      }
  }
  @@ -417,6 +418,8 @@ int amdgpu_dm_irq_suspend(struct amdgpu_device *adev)
      struct list_head *hnd_list_h;
      struct list_head *hnd_list_l;
      unsigned long irq_table_flags;
+    struct list_head *entry, *tmp;
+    struct amdgpu_dm_irq_handler_data *handler;
        DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
  @@ -427,14 +430,22 @@ int amdgpu_dm_irq_suspend(struct amdgpu_device *adev)
       * will be disabled from manage_dm_interrupts on disable CRTC.
       */
      for (src = DC_IRQ_SOURCE_HPD1; src <= DC_IRQ_SOURCE_HPD6RX; src++) {
-        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src].head;
+        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src];
          hnd_list_h = &adev->dm.irq_handler_list_high_tab[src];
          if (!list_empty(hnd_list_l) || !list_empty(hnd_list_h))
              dc_interrupt_set(adev->dm.dc, src, false);
            DM_IRQ_TABLE_UNLOCK(adev, irq_table_flags);
-        flush_work(&adev->dm.irq_handler_list_low_tab[src].work);
  +        if (!list_empty(hnd_list_l)) {
+
+            list_for_each_safe(entry, tmp, hnd_list_l) {
+
+                handler = list_entry(entry, struct amdgpu_dm_irq_handler_data,
+                                     list);
+                flush_work(&handler->work);
+            }
+        }
          DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
      }
  @@ -454,7 +465,7 @@ int amdgpu_dm_irq_resume_early(struct amdgpu_device *adev)
        /* re-enable short pulse interrupts HW interrupt */
      for (src = DC_IRQ_SOURCE_HPD1RX; src <= DC_IRQ_SOURCE_HPD6RX; src++) {
-        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src].head;
+        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src];
          hnd_list_h = &adev->dm.irq_handler_list_high_tab[src];
          if (!list_empty(hnd_list_l) || !list_empty(hnd_list_h))
              dc_interrupt_set(adev->dm.dc, src, true);
@@ -480,7 +491,7 @@ int amdgpu_dm_irq_resume_late(struct amdgpu_device *adev)
       * will be enabled from manage_dm_interrupts on enable CRTC.
       */
      for (src = DC_IRQ_SOURCE_HPD1; src <= DC_IRQ_SOURCE_HPD6; src++) {
-        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src].head;
+        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src];
          hnd_list_h = &adev->dm.irq_handler_list_high_tab[src];
          if (!list_empty(hnd_list_l) || !list_empty(hnd_list_h))
              dc_interrupt_set(adev->dm.dc, src, true);
@@ -497,20 +508,53 @@ int amdgpu_dm_irq_resume_late(struct amdgpu_device *adev)
  static void amdgpu_dm_irq_schedule_work(struct amdgpu_device *adev,
                      enum dc_irq_source irq_source)
  {
-    unsigned long irq_table_flags;
-    struct work_struct *work = NULL;
  -    DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
+    struct  list_head *handler_list = &adev->dm.irq_handler_list_low_tab[irq_source];
+    struct  amdgpu_dm_irq_handler_data *handler_data;
+    bool    work_queued = false;
  -    if (!list_empty(&adev->dm.irq_handler_list_low_tab[irq_source].head))
-        work = &adev->dm.irq_handler_list_low_tab[irq_source].work;
+    if (list_empty(handler_list))
+        return;
  -    DM_IRQ_TABLE_UNLOCK(adev, irq_table_flags);
+    list_for_each_entry(handler_data, handler_list, list) {
+
+        if (!queue_work(system_highpri_wq, &handler_data->work)) {
+            continue;
+        } else {
+            work_queued = true;
+            break;
+        }
+    }
+
+    if (!work_queued) {
+
+        struct  amdgpu_dm_irq_handler_data *handler_data_add;
+        /*get the amdgpu_dm_irq_handler_data of first item pointed by handler_list*/
+        handler_data = container_of(handler_list->next, struct amdgpu_dm_irq_handler_data, list);
+
+        /*allocate a new amdgpu_dm_irq_handler_data*/
+        handler_data_add = kzalloc(sizeof(*handler_data), GFP_KERNEL);
+        if (!handler_data_add) {
+            DRM_ERROR("DM_IRQ: failed to allocate irq handler!\n");
+            return;
+        }
+
+        /*copy new amdgpu_dm_irq_handler_data members from handler_data*/
+        handler_data_add->handler       = handler_data->handler;
+        handler_data_add->handler_arg   = handler_data->handler_arg;
+        handler_data_add->dm            = handler_data->dm;
+        handler_data_add->irq_source    = irq_source;
+
+        list_add_tail(&handler_data_add->list, handler_list);


At what place are you deleting completed work items from the handler_list ?

Andrey


The new allocated work item handler_data_add is put at end of single list handler_list. It is not deleted, but put into this list. In the future for same interrupt source handling the previous allocated work items can be reused.  So we do not have to reallocate new work items if previous interrupts have not been handled by cpu. On other side system will keep all the allocated work items at run time. Note, the new work item allocation only happens when cpu has not handled previous interrupts yet, and new same type interrupts have came.

Thanks

Xiaogang

+
+        INIT_WORK(&handler_data_add->work, dm_irq_work_func);
  -    if (work) {
-        if (!schedule_work(work))
-            DRM_INFO("amdgpu_dm_irq_schedule_work FAILED src %d\n",
-                        irq_source);
+        if (queue_work(system_highpri_wq, &handler_data_add->work))
+            DRM_DEBUG("__func__: a work_struct is allocated and queued, "
+                     "src %d\n", irq_source);
+        else
+            DRM_ERROR("__func__: a new work_struct cannot be queued, "
+                      "something is wrong, src %d\n", irq_source);
      }
    }
Andrey Grodzovsky Jan. 14, 2021, 7:24 a.m. UTC | #4
On 1/14/21 12:11 AM, Chen, Xiaogang wrote:
> On 1/12/2021 10:54 PM, Grodzovsky, Andrey wrote:
>>
>> On 1/4/21 1:01 AM, Xiaogang.Chen wrote:
>>> From: Xiaogang Chen <xiaogang.chen@amd.com>
>>>
>>> amdgpu DM handles INTERRUPT_LOW_IRQ_CONTEXT interrupt(hpd, hpd_rx) by
>>> using work queue and uses single work_struct. If previous interrupt
>>> has not been handled new interrupts(same type) will be discarded and
>>> driver just sends "amdgpu_dm_irq_schedule_work FAILED" message out.
>>> If some important hpd, hpd_rx related interrupts are missed by driver
>>> the hot (un)plug devices may cause system hang or unstable, such as
>>> system resumes from S3 sleep with mst device connected.
>>>
>>> This patch dynamically allocates new amdgpu_dm_irq_handler_data for
>>> new interrupts if previous INTERRUPT_LOW_IRQ_CONTEXT interrupt work
>>> has not been handled. So the new interrupt works can be queued to the
>>> same workqueue_struct, instead discard the new interrupts.
>>> All allocated amdgpu_dm_irq_handler_data are put into a single linked
>>> list and will be reused after.
>>
>>
>> I believe this creates a possible concurrency between already executing work 
>> item
>> and the new incoming one for which you allocate a new work item on the fly. 
>> While
>> handle_hpd_irq is serialized with aconnector->hpd_lock I am seeing that for 
>> handle_hpd_rx_irq
>> it's not locked for MST use case (which is the most frequently used with this 
>> interrupt).  Did you
>> verified that handle_hpd_rx_irq is reentrant ?
>>
> handle_hpd_rx_irq is put at a work queue. Its execution is serialized by the 
> work queue. So there is no reentrant.
>
You are using system_highpri_wq which has the property that it has multiple 
workers thread pool spread across all the
active CPUs, see all work queue definitions here 
https://elixir.bootlin.com/linux/v5.11-rc3/source/include/linux/workqueue.h#L358
I beleieve that what you saying about no chance of reentrnacy would be correct 
if it would be same work item dequeued for execution
while previous instance is still running, see the explanation here - 
https://elixir.bootlin.com/linux/v5.11-rc3/source/kernel/workqueue.c#L1435.
Non reentrancy is guaranteed only for the same work item. If you want non 
reentrancy (full serializtion) for different work items you should create
you own single threaded work-queue using create_singlethread_workqueue


> amdgpu_dm_irq_schedule_work does queuing of work(put handle_hpd_rx_irq into 
> work queue). The first call is dm_irq_work_func, then call handle_hpd_rx_irq.
>>
>>>
>>> Signed-off-by: Xiaogang Chen <xiaogang.chen@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h  |  14 +--
>>>   .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c  | 114 
>>> ++++++++++++++-------
>>>   2 files changed, 80 insertions(+), 48 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h 
>>> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
>>> index c9d82b9..730e540 100644
>>> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
>>> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
>>> @@ -69,18 +69,6 @@ struct common_irq_params {
>>>   };
>>>     /**
>>> - * struct irq_list_head - Linked-list for low context IRQ handlers.
>>> - *
>>> - * @head: The list_head within &struct handler_data
>>> - * @work: A work_struct containing the deferred handler work
>>> - */
>>> -struct irq_list_head {
>>> -    struct list_head head;
>>> -    /* In case this interrupt needs post-processing, 'work' will be queued*/
>>> -    struct work_struct work;
>>> -};
>>> -
>>> -/**
>>>    * struct dm_compressor_info - Buffer info used by frame buffer compression
>>>    * @cpu_addr: MMIO cpu addr
>>>    * @bo_ptr: Pointer to the buffer object
>>> @@ -270,7 +258,7 @@ struct amdgpu_display_manager {
>>>        * Note that handlers are called in the same order as they were
>>>        * registered (FIFO).
>>>        */
>>> -    struct irq_list_head irq_handler_list_low_tab[DAL_IRQ_SOURCES_NUMBER];
>>> +    struct list_head irq_handler_list_low_tab[DAL_IRQ_SOURCES_NUMBER];
>>>         /**
>>>        * @irq_handler_list_high_tab:
>>> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c 
>>> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
>>> index 3577785..ada344a 100644
>>> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
>>> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
>>> @@ -82,6 +82,7 @@ struct amdgpu_dm_irq_handler_data {
>>>       struct amdgpu_display_manager *dm;
>>>       /* DAL irq source which registered for this interrupt. */
>>>       enum dc_irq_source irq_source;
>>> +    struct work_struct work;
>>>   };
>>>     #define DM_IRQ_TABLE_LOCK(adev, flags) \
>>> @@ -111,20 +112,10 @@ static void init_handler_common_data(struct 
>>> amdgpu_dm_irq_handler_data *hcd,
>>>    */
>>>   static void dm_irq_work_func(struct work_struct *work)
>>>   {
>>> -    struct irq_list_head *irq_list_head =
>>> -        container_of(work, struct irq_list_head, work);
>>> -    struct list_head *handler_list = &irq_list_head->head;
>>> -    struct amdgpu_dm_irq_handler_data *handler_data;
>>> -
>>> -    list_for_each_entry(handler_data, handler_list, list) {
>>> -        DRM_DEBUG_KMS("DM_IRQ: work_func: for dal_src=%d\n",
>>> -                handler_data->irq_source);
>>> +    struct amdgpu_dm_irq_handler_data *handler_data =
>>> +     container_of(work, struct amdgpu_dm_irq_handler_data, work);
>>>   -        DRM_DEBUG_KMS("DM_IRQ: schedule_work: for dal_src=%d\n",
>>> -            handler_data->irq_source);
>>> -
>>> - handler_data->handler(handler_data->handler_arg);
>>> -    }
>>> +    handler_data->handler(handler_data->handler_arg);
>>>         /* Call a DAL subcomponent which registered for interrupt notification
>>>        * at INTERRUPT_LOW_IRQ_CONTEXT.
>>> @@ -156,7 +147,7 @@ static struct list_head *remove_irq_handler(struct 
>>> amdgpu_device *adev,
>>>           break;
>>>       case INTERRUPT_LOW_IRQ_CONTEXT:
>>>       default:
>>> -        hnd_list = &adev->dm.irq_handler_list_low_tab[irq_source].head;
>>> +        hnd_list = &adev->dm.irq_handler_list_low_tab[irq_source];
>>>           break;
>>>       }
>>>   @@ -287,7 +278,8 @@ void *amdgpu_dm_irq_register_interrupt(struct 
>>> amdgpu_device *adev,
>>>           break;
>>>       case INTERRUPT_LOW_IRQ_CONTEXT:
>>>       default:
>>> -        hnd_list = &adev->dm.irq_handler_list_low_tab[irq_source].head;
>>> +        hnd_list = &adev->dm.irq_handler_list_low_tab[irq_source];
>>> +        INIT_WORK(&handler_data->work, dm_irq_work_func);
>>>           break;
>>>       }
>>>   @@ -369,7 +361,7 @@ void amdgpu_dm_irq_unregister_interrupt(struct 
>>> amdgpu_device *adev,
>>>   int amdgpu_dm_irq_init(struct amdgpu_device *adev)
>>>   {
>>>       int src;
>>> -    struct irq_list_head *lh;
>>> +    struct list_head *lh;
>>>         DRM_DEBUG_KMS("DM_IRQ\n");
>>>   @@ -378,9 +370,7 @@ int amdgpu_dm_irq_init(struct amdgpu_device *adev)
>>>       for (src = 0; src < DAL_IRQ_SOURCES_NUMBER; src++) {
>>>           /* low context handler list init */
>>>           lh = &adev->dm.irq_handler_list_low_tab[src];
>>> -        INIT_LIST_HEAD(&lh->head);
>>> -        INIT_WORK(&lh->work, dm_irq_work_func);
>>> -
>>> +        INIT_LIST_HEAD(lh);
>>>           /* high context handler init */
>>> INIT_LIST_HEAD(&adev->dm.irq_handler_list_high_tab[src]);
>>>       }
>>> @@ -397,8 +387,11 @@ int amdgpu_dm_irq_init(struct amdgpu_device *adev)
>>>   void amdgpu_dm_irq_fini(struct amdgpu_device *adev)
>>>   {
>>>       int src;
>>> -    struct irq_list_head *lh;
>>> +    struct list_head *lh;
>>> +    struct list_head *entry, *tmp;
>>> +    struct amdgpu_dm_irq_handler_data *handler;
>>>       unsigned long irq_table_flags;
>>> +
>>>       DRM_DEBUG_KMS("DM_IRQ: releasing resources.\n");
>>>       for (src = 0; src < DAL_IRQ_SOURCES_NUMBER; src++) {
>>>           DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
>>> @@ -407,7 +400,15 @@ void amdgpu_dm_irq_fini(struct amdgpu_device *adev)
>>>            * (because no code can schedule a new one). */
>>>           lh = &adev->dm.irq_handler_list_low_tab[src];
>>>           DM_IRQ_TABLE_UNLOCK(adev, irq_table_flags);
>>> -        flush_work(&lh->work);
>>> +
>>> +        if (!list_empty(lh)) {
>>> +            list_for_each_safe(entry, tmp, lh) {
>>> +
>>> +                handler = list_entry(entry, struct amdgpu_dm_irq_handler_data,
>>> +                                     list);
>>> +                flush_work(&handler->work);
>>> +            }
>>> +        }
>>>       }
>>>   }
>>>   @@ -417,6 +418,8 @@ int amdgpu_dm_irq_suspend(struct amdgpu_device *adev)
>>>       struct list_head *hnd_list_h;
>>>       struct list_head *hnd_list_l;
>>>       unsigned long irq_table_flags;
>>> +    struct list_head *entry, *tmp;
>>> +    struct amdgpu_dm_irq_handler_data *handler;
>>>         DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
>>>   @@ -427,14 +430,22 @@ int amdgpu_dm_irq_suspend(struct amdgpu_device *adev)
>>>        * will be disabled from manage_dm_interrupts on disable CRTC.
>>>        */
>>>       for (src = DC_IRQ_SOURCE_HPD1; src <= DC_IRQ_SOURCE_HPD6RX; src++) {
>>> -        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src].head;
>>> +        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src];
>>>           hnd_list_h = &adev->dm.irq_handler_list_high_tab[src];
>>>           if (!list_empty(hnd_list_l) || !list_empty(hnd_list_h))
>>>               dc_interrupt_set(adev->dm.dc, src, false);
>>>             DM_IRQ_TABLE_UNLOCK(adev, irq_table_flags);
>>> - flush_work(&adev->dm.irq_handler_list_low_tab[src].work);
>>>   +        if (!list_empty(hnd_list_l)) {
>>> +
>>> +            list_for_each_safe(entry, tmp, hnd_list_l) {
>>> +
>>> +                handler = list_entry(entry, struct amdgpu_dm_irq_handler_data,
>>> +                                     list);
>>> +                flush_work(&handler->work);
>>> +            }
>>> +        }
>>>           DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
>>>       }
>>>   @@ -454,7 +465,7 @@ int amdgpu_dm_irq_resume_early(struct amdgpu_device 
>>> *adev)
>>>         /* re-enable short pulse interrupts HW interrupt */
>>>       for (src = DC_IRQ_SOURCE_HPD1RX; src <= DC_IRQ_SOURCE_HPD6RX; src++) {
>>> -        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src].head;
>>> +        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src];
>>>           hnd_list_h = &adev->dm.irq_handler_list_high_tab[src];
>>>           if (!list_empty(hnd_list_l) || !list_empty(hnd_list_h))
>>>               dc_interrupt_set(adev->dm.dc, src, true);
>>> @@ -480,7 +491,7 @@ int amdgpu_dm_irq_resume_late(struct amdgpu_device *adev)
>>>        * will be enabled from manage_dm_interrupts on enable CRTC.
>>>        */
>>>       for (src = DC_IRQ_SOURCE_HPD1; src <= DC_IRQ_SOURCE_HPD6; src++) {
>>> -        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src].head;
>>> +        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src];
>>>           hnd_list_h = &adev->dm.irq_handler_list_high_tab[src];
>>>           if (!list_empty(hnd_list_l) || !list_empty(hnd_list_h))
>>>               dc_interrupt_set(adev->dm.dc, src, true);
>>> @@ -497,20 +508,53 @@ int amdgpu_dm_irq_resume_late(struct amdgpu_device *adev)
>>>   static void amdgpu_dm_irq_schedule_work(struct amdgpu_device *adev,
>>>                       enum dc_irq_source irq_source)
>>>   {
>>> -    unsigned long irq_table_flags;
>>> -    struct work_struct *work = NULL;
>>>   -    DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
>>> +    struct  list_head *handler_list = 
>>> &adev->dm.irq_handler_list_low_tab[irq_source];
>>> +    struct  amdgpu_dm_irq_handler_data *handler_data;
>>> +    bool    work_queued = false;
>>>   -    if (!list_empty(&adev->dm.irq_handler_list_low_tab[irq_source].head))
>>> -        work = &adev->dm.irq_handler_list_low_tab[irq_source].work;
>>> +    if (list_empty(handler_list))
>>> +        return;
>>>   -    DM_IRQ_TABLE_UNLOCK(adev, irq_table_flags);
>>> +    list_for_each_entry(handler_data, handler_list, list) {
>>> +
>>> +        if (!queue_work(system_highpri_wq, &handler_data->work)) {
>>> +            continue;
>>> +        } else {
>>> +            work_queued = true;
>>> +            break;
>>> +        }
>>> +    }
>>> +
>>> +    if (!work_queued) {
>>> +
>>> +        struct  amdgpu_dm_irq_handler_data *handler_data_add;
>>> +        /*get the amdgpu_dm_irq_handler_data of first item pointed by 
>>> handler_list*/
>>> +        handler_data = container_of(handler_list->next, struct 
>>> amdgpu_dm_irq_handler_data, list);
>>> +
>>> +        /*allocate a new amdgpu_dm_irq_handler_data*/
>>> +        handler_data_add = kzalloc(sizeof(*handler_data), GFP_KERNEL);
>>> +        if (!handler_data_add) {
>>> +            DRM_ERROR("DM_IRQ: failed to allocate irq handler!\n");
>>> +            return;
>>> +        }
>>> +
>>> +        /*copy new amdgpu_dm_irq_handler_data members from handler_data*/
>>> +        handler_data_add->handler       = handler_data->handler;
>>> +        handler_data_add->handler_arg   = handler_data->handler_arg;
>>> +        handler_data_add->dm            = handler_data->dm;
>>> +        handler_data_add->irq_source    = irq_source;
>>> +
>>> +        list_add_tail(&handler_data_add->list, handler_list);
>>
>>
>> At what place are you deleting completed work items from the handler_list ?
>>
>> Andrey
>>
> The new allocated work item handler_data_add is put at end of single list 
> handler_list. It is not deleted, but put into this list. In the future for 
> same interrupt source handling the previous allocated work items can be 
> reused.  So we do not have to reallocate new work items if previous interrupts 
> have not been handled by cpu. On other side system will keep all the allocated 
> work items at run time. Note, the new work item allocation only happens when 
> cpu has not handled previous interrupts yet, and new same type interrupts have 
> came.
>
> Thanks
>
> Xiaogang
>

I am still confused, how you avoid executing a second time a handler you 
previously allocated because first queue_work failed,
you always start iteration from beginning of the list and you never remove 
already successfully executed handlers.

Andrey


>>
>>> +
>>> +        INIT_WORK(&handler_data_add->work, dm_irq_work_func);
>>>   -    if (work) {
>>> -        if (!schedule_work(work))
>>> -            DRM_INFO("amdgpu_dm_irq_schedule_work FAILED src %d\n",
>>> -                        irq_source);
>>> +        if (queue_work(system_highpri_wq, &handler_data_add->work))
>>> +            DRM_DEBUG("__func__: a work_struct is allocated and queued, "
>>> +                     "src %d\n", irq_source);
>>> +        else
>>> +            DRM_ERROR("__func__: a new work_struct cannot be queued, "
>>> +                      "something is wrong, src %d\n", irq_source);
>>>       }
>>>     } 
>
>
Chen, Xiaogang Jan. 15, 2021, 7:21 a.m. UTC | #5
On 1/14/2021 1:24 AM, Grodzovsky, Andrey wrote:
>
>
> On 1/14/21 12:11 AM, Chen, Xiaogang wrote:
>> On 1/12/2021 10:54 PM, Grodzovsky, Andrey wrote:
>>>
>>> On 1/4/21 1:01 AM, Xiaogang.Chen wrote:
>>>> From: Xiaogang Chen <xiaogang.chen@amd.com>
>>>>
>>>> amdgpu DM handles INTERRUPT_LOW_IRQ_CONTEXT interrupt(hpd, hpd_rx) by
>>>> using work queue and uses single work_struct. If previous interrupt
>>>> has not been handled new interrupts(same type) will be discarded and
>>>> driver just sends "amdgpu_dm_irq_schedule_work FAILED" message out.
>>>> If some important hpd, hpd_rx related interrupts are missed by driver
>>>> the hot (un)plug devices may cause system hang or unstable, such as
>>>> system resumes from S3 sleep with mst device connected.
>>>>
>>>> This patch dynamically allocates new amdgpu_dm_irq_handler_data for
>>>> new interrupts if previous INTERRUPT_LOW_IRQ_CONTEXT interrupt work
>>>> has not been handled. So the new interrupt works can be queued to the
>>>> same workqueue_struct, instead discard the new interrupts.
>>>> All allocated amdgpu_dm_irq_handler_data are put into a single linked
>>>> list and will be reused after.
>>>
>>>
>>> I believe this creates a possible concurrency between already 
>>> executing work item
>>> and the new incoming one for which you allocate a new work item on 
>>> the fly. While
>>> handle_hpd_irq is serialized with aconnector->hpd_lock I am seeing 
>>> that for handle_hpd_rx_irq
>>> it's not locked for MST use case (which is the most frequently used 
>>> with this interrupt).  Did you
>>> verified that handle_hpd_rx_irq is reentrant ?
>>>
>> handle_hpd_rx_irq is put at a work queue. Its execution is serialized 
>> by the work queue. So there is no reentrant.
>>
> You are using system_highpri_wq which has the property that it has 
> multiple workers thread pool spread across all the
> active CPUs, see all work queue definitions here 
> https://elixir.bootlin.com/linux/v5.11-rc3/source/include/linux/workqueue.h#L358
> I beleieve that what you saying about no chance of reentrnacy would be 
> correct if it would be same work item dequeued for execution
> while previous instance is still running, see the explanation here - 
> https://elixir.bootlin.com/linux/v5.11-rc3/source/kernel/workqueue.c#L1435.
> Non reentrancy is guaranteed only for the same work item. If you want 
> non reentrancy (full serializtion) for different work items you should 
> create
> you own single threaded work-queue using create_singlethread_workqueue
>
>
Thank you. I think the easiest way is using aconnector->hpd_lock at 
handle_hpd_rx_irq to lock for dc_link->type == dc_connection_mst_branch 
case, right? I will do that at next version if you think it is ok.

>> amdgpu_dm_irq_schedule_work does queuing of work(put 
>> handle_hpd_rx_irq into work queue). The first call is 
>> dm_irq_work_func, then call handle_hpd_rx_irq.
>>>
>>>>
>>>> Signed-off-by: Xiaogang Chen <xiaogang.chen@amd.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h  |  14 +--
>>>>   .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c  | 114 
>>>> ++++++++++++++-------
>>>>   2 files changed, 80 insertions(+), 48 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h 
>>>> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
>>>> index c9d82b9..730e540 100644
>>>> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
>>>> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
>>>> @@ -69,18 +69,6 @@ struct common_irq_params {
>>>>   };
>>>>     /**
>>>> - * struct irq_list_head - Linked-list for low context IRQ handlers.
>>>> - *
>>>> - * @head: The list_head within &struct handler_data
>>>> - * @work: A work_struct containing the deferred handler work
>>>> - */
>>>> -struct irq_list_head {
>>>> -    struct list_head head;
>>>> -    /* In case this interrupt needs post-processing, 'work' will 
>>>> be queued*/
>>>> -    struct work_struct work;
>>>> -};
>>>> -
>>>> -/**
>>>>    * struct dm_compressor_info - Buffer info used by frame buffer 
>>>> compression
>>>>    * @cpu_addr: MMIO cpu addr
>>>>    * @bo_ptr: Pointer to the buffer object
>>>> @@ -270,7 +258,7 @@ struct amdgpu_display_manager {
>>>>        * Note that handlers are called in the same order as they were
>>>>        * registered (FIFO).
>>>>        */
>>>> -    struct irq_list_head 
>>>> irq_handler_list_low_tab[DAL_IRQ_SOURCES_NUMBER];
>>>> +    struct list_head 
>>>> irq_handler_list_low_tab[DAL_IRQ_SOURCES_NUMBER];
>>>>         /**
>>>>        * @irq_handler_list_high_tab:
>>>> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c 
>>>> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
>>>> index 3577785..ada344a 100644
>>>> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
>>>> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
>>>> @@ -82,6 +82,7 @@ struct amdgpu_dm_irq_handler_data {
>>>>       struct amdgpu_display_manager *dm;
>>>>       /* DAL irq source which registered for this interrupt. */
>>>>       enum dc_irq_source irq_source;
>>>> +    struct work_struct work;
>>>>   };
>>>>     #define DM_IRQ_TABLE_LOCK(adev, flags) \
>>>> @@ -111,20 +112,10 @@ static void init_handler_common_data(struct 
>>>> amdgpu_dm_irq_handler_data *hcd,
>>>>    */
>>>>   static void dm_irq_work_func(struct work_struct *work)
>>>>   {
>>>> -    struct irq_list_head *irq_list_head =
>>>> -        container_of(work, struct irq_list_head, work);
>>>> -    struct list_head *handler_list = &irq_list_head->head;
>>>> -    struct amdgpu_dm_irq_handler_data *handler_data;
>>>> -
>>>> -    list_for_each_entry(handler_data, handler_list, list) {
>>>> -        DRM_DEBUG_KMS("DM_IRQ: work_func: for dal_src=%d\n",
>>>> -                handler_data->irq_source);
>>>> +    struct amdgpu_dm_irq_handler_data *handler_data =
>>>> +     container_of(work, struct amdgpu_dm_irq_handler_data, work);
>>>>   -        DRM_DEBUG_KMS("DM_IRQ: schedule_work: for dal_src=%d\n",
>>>> -            handler_data->irq_source);
>>>> -
>>>> - handler_data->handler(handler_data->handler_arg);
>>>> -    }
>>>> +    handler_data->handler(handler_data->handler_arg);
>>>>         /* Call a DAL subcomponent which registered for interrupt 
>>>> notification
>>>>        * at INTERRUPT_LOW_IRQ_CONTEXT.
>>>> @@ -156,7 +147,7 @@ static struct list_head 
>>>> *remove_irq_handler(struct amdgpu_device *adev,
>>>>           break;
>>>>       case INTERRUPT_LOW_IRQ_CONTEXT:
>>>>       default:
>>>> -        hnd_list = 
>>>> &adev->dm.irq_handler_list_low_tab[irq_source].head;
>>>> +        hnd_list = &adev->dm.irq_handler_list_low_tab[irq_source];
>>>>           break;
>>>>       }
>>>>   @@ -287,7 +278,8 @@ void *amdgpu_dm_irq_register_interrupt(struct 
>>>> amdgpu_device *adev,
>>>>           break;
>>>>       case INTERRUPT_LOW_IRQ_CONTEXT:
>>>>       default:
>>>> -        hnd_list = 
>>>> &adev->dm.irq_handler_list_low_tab[irq_source].head;
>>>> +        hnd_list = &adev->dm.irq_handler_list_low_tab[irq_source];
>>>> +        INIT_WORK(&handler_data->work, dm_irq_work_func);
>>>>           break;
>>>>       }
>>>>   @@ -369,7 +361,7 @@ void 
>>>> amdgpu_dm_irq_unregister_interrupt(struct amdgpu_device *adev,
>>>>   int amdgpu_dm_irq_init(struct amdgpu_device *adev)
>>>>   {
>>>>       int src;
>>>> -    struct irq_list_head *lh;
>>>> +    struct list_head *lh;
>>>>         DRM_DEBUG_KMS("DM_IRQ\n");
>>>>   @@ -378,9 +370,7 @@ int amdgpu_dm_irq_init(struct amdgpu_device 
>>>> *adev)
>>>>       for (src = 0; src < DAL_IRQ_SOURCES_NUMBER; src++) {
>>>>           /* low context handler list init */
>>>>           lh = &adev->dm.irq_handler_list_low_tab[src];
>>>> -        INIT_LIST_HEAD(&lh->head);
>>>> -        INIT_WORK(&lh->work, dm_irq_work_func);
>>>> -
>>>> +        INIT_LIST_HEAD(lh);
>>>>           /* high context handler init */
>>>> INIT_LIST_HEAD(&adev->dm.irq_handler_list_high_tab[src]);
>>>>       }
>>>> @@ -397,8 +387,11 @@ int amdgpu_dm_irq_init(struct amdgpu_device 
>>>> *adev)
>>>>   void amdgpu_dm_irq_fini(struct amdgpu_device *adev)
>>>>   {
>>>>       int src;
>>>> -    struct irq_list_head *lh;
>>>> +    struct list_head *lh;
>>>> +    struct list_head *entry, *tmp;
>>>> +    struct amdgpu_dm_irq_handler_data *handler;
>>>>       unsigned long irq_table_flags;
>>>> +
>>>>       DRM_DEBUG_KMS("DM_IRQ: releasing resources.\n");
>>>>       for (src = 0; src < DAL_IRQ_SOURCES_NUMBER; src++) {
>>>>           DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
>>>> @@ -407,7 +400,15 @@ void amdgpu_dm_irq_fini(struct amdgpu_device 
>>>> *adev)
>>>>            * (because no code can schedule a new one). */
>>>>           lh = &adev->dm.irq_handler_list_low_tab[src];
>>>>           DM_IRQ_TABLE_UNLOCK(adev, irq_table_flags);
>>>> -        flush_work(&lh->work);
>>>> +
>>>> +        if (!list_empty(lh)) {
>>>> +            list_for_each_safe(entry, tmp, lh) {
>>>> +
>>>> +                handler = list_entry(entry, struct 
>>>> amdgpu_dm_irq_handler_data,
>>>> +                                     list);
>>>> +                flush_work(&handler->work);
>>>> +            }
>>>> +        }
>>>>       }
>>>>   }
>>>>   @@ -417,6 +418,8 @@ int amdgpu_dm_irq_suspend(struct 
>>>> amdgpu_device *adev)
>>>>       struct list_head *hnd_list_h;
>>>>       struct list_head *hnd_list_l;
>>>>       unsigned long irq_table_flags;
>>>> +    struct list_head *entry, *tmp;
>>>> +    struct amdgpu_dm_irq_handler_data *handler;
>>>>         DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
>>>>   @@ -427,14 +430,22 @@ int amdgpu_dm_irq_suspend(struct 
>>>> amdgpu_device *adev)
>>>>        * will be disabled from manage_dm_interrupts on disable CRTC.
>>>>        */
>>>>       for (src = DC_IRQ_SOURCE_HPD1; src <= DC_IRQ_SOURCE_HPD6RX; 
>>>> src++) {
>>>> -        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src].head;
>>>> +        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src];
>>>>           hnd_list_h = &adev->dm.irq_handler_list_high_tab[src];
>>>>           if (!list_empty(hnd_list_l) || !list_empty(hnd_list_h))
>>>>               dc_interrupt_set(adev->dm.dc, src, false);
>>>>             DM_IRQ_TABLE_UNLOCK(adev, irq_table_flags);
>>>> - flush_work(&adev->dm.irq_handler_list_low_tab[src].work);
>>>>   +        if (!list_empty(hnd_list_l)) {
>>>> +
>>>> +            list_for_each_safe(entry, tmp, hnd_list_l) {
>>>> +
>>>> +                handler = list_entry(entry, struct 
>>>> amdgpu_dm_irq_handler_data,
>>>> +                                     list);
>>>> +                flush_work(&handler->work);
>>>> +            }
>>>> +        }
>>>>           DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
>>>>       }
>>>>   @@ -454,7 +465,7 @@ int amdgpu_dm_irq_resume_early(struct 
>>>> amdgpu_device *adev)
>>>>         /* re-enable short pulse interrupts HW interrupt */
>>>>       for (src = DC_IRQ_SOURCE_HPD1RX; src <= DC_IRQ_SOURCE_HPD6RX; 
>>>> src++) {
>>>> -        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src].head;
>>>> +        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src];
>>>>           hnd_list_h = &adev->dm.irq_handler_list_high_tab[src];
>>>>           if (!list_empty(hnd_list_l) || !list_empty(hnd_list_h))
>>>>               dc_interrupt_set(adev->dm.dc, src, true);
>>>> @@ -480,7 +491,7 @@ int amdgpu_dm_irq_resume_late(struct 
>>>> amdgpu_device *adev)
>>>>        * will be enabled from manage_dm_interrupts on enable CRTC.
>>>>        */
>>>>       for (src = DC_IRQ_SOURCE_HPD1; src <= DC_IRQ_SOURCE_HPD6; 
>>>> src++) {
>>>> -        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src].head;
>>>> +        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src];
>>>>           hnd_list_h = &adev->dm.irq_handler_list_high_tab[src];
>>>>           if (!list_empty(hnd_list_l) || !list_empty(hnd_list_h))
>>>>               dc_interrupt_set(adev->dm.dc, src, true);
>>>> @@ -497,20 +508,53 @@ int amdgpu_dm_irq_resume_late(struct 
>>>> amdgpu_device *adev)
>>>>   static void amdgpu_dm_irq_schedule_work(struct amdgpu_device *adev,
>>>>                       enum dc_irq_source irq_source)
>>>>   {
>>>> -    unsigned long irq_table_flags;
>>>> -    struct work_struct *work = NULL;
>>>>   -    DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
>>>> +    struct  list_head *handler_list = 
>>>> &adev->dm.irq_handler_list_low_tab[irq_source];
>>>> +    struct  amdgpu_dm_irq_handler_data *handler_data;
>>>> +    bool    work_queued = false;
>>>>   -    if 
>>>> (!list_empty(&adev->dm.irq_handler_list_low_tab[irq_source].head))
>>>> -        work = &adev->dm.irq_handler_list_low_tab[irq_source].work;
>>>> +    if (list_empty(handler_list))
>>>> +        return;
>>>>   -    DM_IRQ_TABLE_UNLOCK(adev, irq_table_flags);
>>>> +    list_for_each_entry(handler_data, handler_list, list) {
>>>> +
>>>> +        if (!queue_work(system_highpri_wq, &handler_data->work)) {
>>>> +            continue;
>>>> +        } else {
>>>> +            work_queued = true;
>>>> +            break;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    if (!work_queued) {
>>>> +
>>>> +        struct  amdgpu_dm_irq_handler_data *handler_data_add;
>>>> +        /*get the amdgpu_dm_irq_handler_data of first item pointed 
>>>> by handler_list*/
>>>> +        handler_data = container_of(handler_list->next, struct 
>>>> amdgpu_dm_irq_handler_data, list);
>>>> +
>>>> +        /*allocate a new amdgpu_dm_irq_handler_data*/
>>>> +        handler_data_add = kzalloc(sizeof(*handler_data), 
>>>> GFP_KERNEL);
>>>> +        if (!handler_data_add) {
>>>> +            DRM_ERROR("DM_IRQ: failed to allocate irq handler!\n");
>>>> +            return;
>>>> +        }
>>>> +
>>>> +        /*copy new amdgpu_dm_irq_handler_data members from 
>>>> handler_data*/
>>>> +        handler_data_add->handler       = handler_data->handler;
>>>> +        handler_data_add->handler_arg   = handler_data->handler_arg;
>>>> +        handler_data_add->dm            = handler_data->dm;
>>>> +        handler_data_add->irq_source    = irq_source;
>>>> +
>>>> +        list_add_tail(&handler_data_add->list, handler_list);
>>>
>>>
>>> At what place are you deleting completed work items from the 
>>> handler_list ?
>>>
>>> Andrey
>>>
>> The new allocated work item handler_data_add is put at end of single 
>> list handler_list. It is not deleted, but put into this list. In the 
>> future for same interrupt source handling the previous allocated work 
>> items can be reused.  So we do not have to reallocate new work items 
>> if previous interrupts have not been handled by cpu. On other side 
>> system will keep all the allocated work items at run time. Note, the 
>> new work item allocation only happens when cpu has not handled 
>> previous interrupts yet, and new same type interrupts have came.
>>
>> Thanks
>>
>> Xiaogang
>>
>
> I am still confused, how you avoid executing a second time a handler 
> you previously allocated because first queue_work failed,
> you always start iteration from beginning of the list and you never 
> remove already successfully executed handlers.
>
> Andrey
>
>
Not sure if understand your question. If the first time queuing work 
item success there is no second time queuing the same work item. The 
second work item is a different one although they use same 
handle_hpd_rx_irq. The queued work items are managed by work 
queue(queue_work). Work queue knows each queued work item status: still 
on queue, running, or exist. I look from already allocated work items to 
see if work queue allows me to queue, until find one. If all allocated 
work items cannot be queued, allocate a new work item and put it at the 
list of allocated work items.

Thanks

Xiaogang



>>>
>>>> +
>>>> +        INIT_WORK(&handler_data_add->work, dm_irq_work_func);
>>>>   -    if (work) {
>>>> -        if (!schedule_work(work))
>>>> -            DRM_INFO("amdgpu_dm_irq_schedule_work FAILED src %d\n",
>>>> -                        irq_source);
>>>> +        if (queue_work(system_highpri_wq, &handler_data_add->work))
>>>> +            DRM_DEBUG("__func__: a work_struct is allocated and 
>>>> queued, "
>>>> +                     "src %d\n", irq_source);
>>>> +        else
>>>> +            DRM_ERROR("__func__: a new work_struct cannot be 
>>>> queued, "
>>>> +                      "something is wrong, src %d\n", irq_source);
>>>>       }
>>>>     } 
>>
>>
Andrey Grodzovsky Jan. 19, 2021, 10:29 p.m. UTC | #6
On 1/15/21 2:21 AM, Chen, Xiaogang wrote:
> On 1/14/2021 1:24 AM, Grodzovsky, Andrey wrote:
>>
>> On 1/14/21 12:11 AM, Chen, Xiaogang wrote:
>>> On 1/12/2021 10:54 PM, Grodzovsky, Andrey wrote:
>>>> On 1/4/21 1:01 AM, Xiaogang.Chen wrote:
>>>>> From: Xiaogang Chen <xiaogang.chen@amd.com>
>>>>>
>>>>> amdgpu DM handles INTERRUPT_LOW_IRQ_CONTEXT interrupt(hpd, hpd_rx) by
>>>>> using work queue and uses single work_struct. If previous interrupt
>>>>> has not been handled new interrupts(same type) will be discarded and
>>>>> driver just sends "amdgpu_dm_irq_schedule_work FAILED" message out.
>>>>> If some important hpd, hpd_rx related interrupts are missed by driver
>>>>> the hot (un)plug devices may cause system hang or unstable, such as
>>>>> system resumes from S3 sleep with mst device connected.
>>>>>
>>>>> This patch dynamically allocates new amdgpu_dm_irq_handler_data for
>>>>> new interrupts if previous INTERRUPT_LOW_IRQ_CONTEXT interrupt work
>>>>> has not been handled. So the new interrupt works can be queued to the
>>>>> same workqueue_struct, instead discard the new interrupts.
>>>>> All allocated amdgpu_dm_irq_handler_data are put into a single linked
>>>>> list and will be reused after.
>>>>
>>>> I believe this creates a possible concurrency between already
>>>> executing work item
>>>> and the new incoming one for which you allocate a new work item on
>>>> the fly. While
>>>> handle_hpd_irq is serialized with aconnector->hpd_lock I am seeing
>>>> that for handle_hpd_rx_irq
>>>> it's not locked for MST use case (which is the most frequently used
>>>> with this interrupt).  Did you
>>>> verified that handle_hpd_rx_irq is reentrant ?
>>>>
>>> handle_hpd_rx_irq is put at a work queue. Its execution is serialized
>>> by the work queue. So there is no reentrant.
>>>
>> You are using system_highpri_wq which has the property that it has
>> multiple workers thread pool spread across all the
>> active CPUs, see all work queue definitions here
>> https://elixir.bootlin.com/linux/v5.11-rc3/source/include/linux/workqueue.h#L358
>> I beleieve that what you saying about no chance of reentrnacy would be
>> correct if it would be same work item dequeued for execution
>> while previous instance is still running, see the explanation here -
>> https://elixir.bootlin.com/linux/v5.11-rc3/source/kernel/workqueue.c#L1435.
>> Non reentrancy is guaranteed only for the same work item. If you want
>> non reentrancy (full serializtion) for different work items you should
>> create
>> you own single threaded work-queue using create_singlethread_workqueue
>>
>>
> Thank you. I think the easiest way is using aconnector->hpd_lock at
> handle_hpd_rx_irq to lock for dc_link->type == dc_connection_mst_branch
> case, right? I will do that at next version if you think it is ok.


I am not sure what are the consequences of of using hpd lock there with
regard to other locks acquired in DRM MST code during MST related HPD 
transactions since
i haven't dealt with this for a very long time. Maybe Harry or Nick can advise 
on this ?

Andrey

>
>>> amdgpu_dm_irq_schedule_work does queuing of work(put
>>> handle_hpd_rx_irq into work queue). The first call is
>>> dm_irq_work_func, then call handle_hpd_rx_irq.
>>>>> Signed-off-by: Xiaogang Chen <xiaogang.chen@amd.com>
>>>>> ---
>>>>>    drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h  |  14 +--
>>>>>    .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c  | 114
>>>>> ++++++++++++++-------
>>>>>    2 files changed, 80 insertions(+), 48 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
>>>>> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
>>>>> index c9d82b9..730e540 100644
>>>>> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
>>>>> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
>>>>> @@ -69,18 +69,6 @@ struct common_irq_params {
>>>>>    };
>>>>>      /**
>>>>> - * struct irq_list_head - Linked-list for low context IRQ handlers.
>>>>> - *
>>>>> - * @head: The list_head within &struct handler_data
>>>>> - * @work: A work_struct containing the deferred handler work
>>>>> - */
>>>>> -struct irq_list_head {
>>>>> -    struct list_head head;
>>>>> -    /* In case this interrupt needs post-processing, 'work' will
>>>>> be queued*/
>>>>> -    struct work_struct work;
>>>>> -};
>>>>> -
>>>>> -/**
>>>>>     * struct dm_compressor_info - Buffer info used by frame buffer
>>>>> compression
>>>>>     * @cpu_addr: MMIO cpu addr
>>>>>     * @bo_ptr: Pointer to the buffer object
>>>>> @@ -270,7 +258,7 @@ struct amdgpu_display_manager {
>>>>>         * Note that handlers are called in the same order as they were
>>>>>         * registered (FIFO).
>>>>>         */
>>>>> -    struct irq_list_head
>>>>> irq_handler_list_low_tab[DAL_IRQ_SOURCES_NUMBER];
>>>>> +    struct list_head
>>>>> irq_handler_list_low_tab[DAL_IRQ_SOURCES_NUMBER];
>>>>>          /**
>>>>>         * @irq_handler_list_high_tab:
>>>>> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
>>>>> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
>>>>> index 3577785..ada344a 100644
>>>>> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
>>>>> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
>>>>> @@ -82,6 +82,7 @@ struct amdgpu_dm_irq_handler_data {
>>>>>        struct amdgpu_display_manager *dm;
>>>>>        /* DAL irq source which registered for this interrupt. */
>>>>>        enum dc_irq_source irq_source;
>>>>> +    struct work_struct work;
>>>>>    };
>>>>>      #define DM_IRQ_TABLE_LOCK(adev, flags) \
>>>>> @@ -111,20 +112,10 @@ static void init_handler_common_data(struct
>>>>> amdgpu_dm_irq_handler_data *hcd,
>>>>>     */
>>>>>    static void dm_irq_work_func(struct work_struct *work)
>>>>>    {
>>>>> -    struct irq_list_head *irq_list_head =
>>>>> -        container_of(work, struct irq_list_head, work);
>>>>> -    struct list_head *handler_list = &irq_list_head->head;
>>>>> -    struct amdgpu_dm_irq_handler_data *handler_data;
>>>>> -
>>>>> -    list_for_each_entry(handler_data, handler_list, list) {
>>>>> -        DRM_DEBUG_KMS("DM_IRQ: work_func: for dal_src=%d\n",
>>>>> -                handler_data->irq_source);
>>>>> +    struct amdgpu_dm_irq_handler_data *handler_data =
>>>>> +     container_of(work, struct amdgpu_dm_irq_handler_data, work);
>>>>>    -        DRM_DEBUG_KMS("DM_IRQ: schedule_work: for dal_src=%d\n",
>>>>> -            handler_data->irq_source);
>>>>> -
>>>>> - handler_data->handler(handler_data->handler_arg);
>>>>> -    }
>>>>> +    handler_data->handler(handler_data->handler_arg);
>>>>>          /* Call a DAL subcomponent which registered for interrupt
>>>>> notification
>>>>>         * at INTERRUPT_LOW_IRQ_CONTEXT.
>>>>> @@ -156,7 +147,7 @@ static struct list_head
>>>>> *remove_irq_handler(struct amdgpu_device *adev,
>>>>>            break;
>>>>>        case INTERRUPT_LOW_IRQ_CONTEXT:
>>>>>        default:
>>>>> -        hnd_list =
>>>>> &adev->dm.irq_handler_list_low_tab[irq_source].head;
>>>>> +        hnd_list = &adev->dm.irq_handler_list_low_tab[irq_source];
>>>>>            break;
>>>>>        }
>>>>>    @@ -287,7 +278,8 @@ void *amdgpu_dm_irq_register_interrupt(struct
>>>>> amdgpu_device *adev,
>>>>>            break;
>>>>>        case INTERRUPT_LOW_IRQ_CONTEXT:
>>>>>        default:
>>>>> -        hnd_list =
>>>>> &adev->dm.irq_handler_list_low_tab[irq_source].head;
>>>>> +        hnd_list = &adev->dm.irq_handler_list_low_tab[irq_source];
>>>>> +        INIT_WORK(&handler_data->work, dm_irq_work_func);
>>>>>            break;
>>>>>        }
>>>>>    @@ -369,7 +361,7 @@ void
>>>>> amdgpu_dm_irq_unregister_interrupt(struct amdgpu_device *adev,
>>>>>    int amdgpu_dm_irq_init(struct amdgpu_device *adev)
>>>>>    {
>>>>>        int src;
>>>>> -    struct irq_list_head *lh;
>>>>> +    struct list_head *lh;
>>>>>          DRM_DEBUG_KMS("DM_IRQ\n");
>>>>>    @@ -378,9 +370,7 @@ int amdgpu_dm_irq_init(struct amdgpu_device
>>>>> *adev)
>>>>>        for (src = 0; src < DAL_IRQ_SOURCES_NUMBER; src++) {
>>>>>            /* low context handler list init */
>>>>>            lh = &adev->dm.irq_handler_list_low_tab[src];
>>>>> -        INIT_LIST_HEAD(&lh->head);
>>>>> -        INIT_WORK(&lh->work, dm_irq_work_func);
>>>>> -
>>>>> +        INIT_LIST_HEAD(lh);
>>>>>            /* high context handler init */
>>>>> INIT_LIST_HEAD(&adev->dm.irq_handler_list_high_tab[src]);
>>>>>        }
>>>>> @@ -397,8 +387,11 @@ int amdgpu_dm_irq_init(struct amdgpu_device
>>>>> *adev)
>>>>>    void amdgpu_dm_irq_fini(struct amdgpu_device *adev)
>>>>>    {
>>>>>        int src;
>>>>> -    struct irq_list_head *lh;
>>>>> +    struct list_head *lh;
>>>>> +    struct list_head *entry, *tmp;
>>>>> +    struct amdgpu_dm_irq_handler_data *handler;
>>>>>        unsigned long irq_table_flags;
>>>>> +
>>>>>        DRM_DEBUG_KMS("DM_IRQ: releasing resources.\n");
>>>>>        for (src = 0; src < DAL_IRQ_SOURCES_NUMBER; src++) {
>>>>>            DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
>>>>> @@ -407,7 +400,15 @@ void amdgpu_dm_irq_fini(struct amdgpu_device
>>>>> *adev)
>>>>>             * (because no code can schedule a new one). */
>>>>>            lh = &adev->dm.irq_handler_list_low_tab[src];
>>>>>            DM_IRQ_TABLE_UNLOCK(adev, irq_table_flags);
>>>>> -        flush_work(&lh->work);
>>>>> +
>>>>> +        if (!list_empty(lh)) {
>>>>> +            list_for_each_safe(entry, tmp, lh) {
>>>>> +
>>>>> +                handler = list_entry(entry, struct
>>>>> amdgpu_dm_irq_handler_data,
>>>>> +                                     list);
>>>>> +                flush_work(&handler->work);
>>>>> +            }
>>>>> +        }
>>>>>        }
>>>>>    }
>>>>>    @@ -417,6 +418,8 @@ int amdgpu_dm_irq_suspend(struct
>>>>> amdgpu_device *adev)
>>>>>        struct list_head *hnd_list_h;
>>>>>        struct list_head *hnd_list_l;
>>>>>        unsigned long irq_table_flags;
>>>>> +    struct list_head *entry, *tmp;
>>>>> +    struct amdgpu_dm_irq_handler_data *handler;
>>>>>          DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
>>>>>    @@ -427,14 +430,22 @@ int amdgpu_dm_irq_suspend(struct
>>>>> amdgpu_device *adev)
>>>>>         * will be disabled from manage_dm_interrupts on disable CRTC.
>>>>>         */
>>>>>        for (src = DC_IRQ_SOURCE_HPD1; src <= DC_IRQ_SOURCE_HPD6RX;
>>>>> src++) {
>>>>> -        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src].head;
>>>>> +        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src];
>>>>>            hnd_list_h = &adev->dm.irq_handler_list_high_tab[src];
>>>>>            if (!list_empty(hnd_list_l) || !list_empty(hnd_list_h))
>>>>>                dc_interrupt_set(adev->dm.dc, src, false);
>>>>>              DM_IRQ_TABLE_UNLOCK(adev, irq_table_flags);
>>>>> - flush_work(&adev->dm.irq_handler_list_low_tab[src].work);
>>>>>    +        if (!list_empty(hnd_list_l)) {
>>>>> +
>>>>> +            list_for_each_safe(entry, tmp, hnd_list_l) {
>>>>> +
>>>>> +                handler = list_entry(entry, struct
>>>>> amdgpu_dm_irq_handler_data,
>>>>> +                                     list);
>>>>> +                flush_work(&handler->work);
>>>>> +            }
>>>>> +        }
>>>>>            DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
>>>>>        }
>>>>>    @@ -454,7 +465,7 @@ int amdgpu_dm_irq_resume_early(struct
>>>>> amdgpu_device *adev)
>>>>>          /* re-enable short pulse interrupts HW interrupt */
>>>>>        for (src = DC_IRQ_SOURCE_HPD1RX; src <= DC_IRQ_SOURCE_HPD6RX;
>>>>> src++) {
>>>>> -        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src].head;
>>>>> +        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src];
>>>>>            hnd_list_h = &adev->dm.irq_handler_list_high_tab[src];
>>>>>            if (!list_empty(hnd_list_l) || !list_empty(hnd_list_h))
>>>>>                dc_interrupt_set(adev->dm.dc, src, true);
>>>>> @@ -480,7 +491,7 @@ int amdgpu_dm_irq_resume_late(struct
>>>>> amdgpu_device *adev)
>>>>>         * will be enabled from manage_dm_interrupts on enable CRTC.
>>>>>         */
>>>>>        for (src = DC_IRQ_SOURCE_HPD1; src <= DC_IRQ_SOURCE_HPD6;
>>>>> src++) {
>>>>> -        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src].head;
>>>>> +        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src];
>>>>>            hnd_list_h = &adev->dm.irq_handler_list_high_tab[src];
>>>>>            if (!list_empty(hnd_list_l) || !list_empty(hnd_list_h))
>>>>>                dc_interrupt_set(adev->dm.dc, src, true);
>>>>> @@ -497,20 +508,53 @@ int amdgpu_dm_irq_resume_late(struct
>>>>> amdgpu_device *adev)
>>>>>    static void amdgpu_dm_irq_schedule_work(struct amdgpu_device *adev,
>>>>>                        enum dc_irq_source irq_source)
>>>>>    {
>>>>> -    unsigned long irq_table_flags;
>>>>> -    struct work_struct *work = NULL;
>>>>>    -    DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
>>>>> +    struct  list_head *handler_list =
>>>>> &adev->dm.irq_handler_list_low_tab[irq_source];
>>>>> +    struct  amdgpu_dm_irq_handler_data *handler_data;
>>>>> +    bool    work_queued = false;
>>>>>    -    if
>>>>> (!list_empty(&adev->dm.irq_handler_list_low_tab[irq_source].head))
>>>>> -        work = &adev->dm.irq_handler_list_low_tab[irq_source].work;
>>>>> +    if (list_empty(handler_list))
>>>>> +        return;
>>>>>    -    DM_IRQ_TABLE_UNLOCK(adev, irq_table_flags);
>>>>> +    list_for_each_entry(handler_data, handler_list, list) {
>>>>> +
>>>>> +        if (!queue_work(system_highpri_wq, &handler_data->work)) {
>>>>> +            continue;
>>>>> +        } else {
>>>>> +            work_queued = true;
>>>>> +            break;
>>>>> +        }
>>>>> +    }
>>>>> +
>>>>> +    if (!work_queued) {
>>>>> +
>>>>> +        struct  amdgpu_dm_irq_handler_data *handler_data_add;
>>>>> +        /*get the amdgpu_dm_irq_handler_data of first item pointed
>>>>> by handler_list*/
>>>>> +        handler_data = container_of(handler_list->next, struct
>>>>> amdgpu_dm_irq_handler_data, list);
>>>>> +
>>>>> +        /*allocate a new amdgpu_dm_irq_handler_data*/
>>>>> +        handler_data_add = kzalloc(sizeof(*handler_data),
>>>>> GFP_KERNEL);
>>>>> +        if (!handler_data_add) {
>>>>> +            DRM_ERROR("DM_IRQ: failed to allocate irq handler!\n");
>>>>> +            return;
>>>>> +        }
>>>>> +
>>>>> +        /*copy new amdgpu_dm_irq_handler_data members from
>>>>> handler_data*/
>>>>> +        handler_data_add->handler       = handler_data->handler;
>>>>> +        handler_data_add->handler_arg   = handler_data->handler_arg;
>>>>> +        handler_data_add->dm            = handler_data->dm;
>>>>> +        handler_data_add->irq_source    = irq_source;
>>>>> +
>>>>> +        list_add_tail(&handler_data_add->list, handler_list);
>>>>
>>>> At what place are you deleting completed work items from the
>>>> handler_list ?
>>>>
>>>> Andrey
>>>>
>>> The new allocated work item handler_data_add is put at end of single
>>> list handler_list. It is not deleted, but put into this list. In the
>>> future for same interrupt source handling the previous allocated work
>>> items can be reused.  So we do not have to reallocate new work items
>>> if previous interrupts have not been handled by cpu. On other side
>>> system will keep all the allocated work items at run time. Note, the
>>> new work item allocation only happens when cpu has not handled
>>> previous interrupts yet, and new same type interrupts have came.
>>>
>>> Thanks
>>>
>>> Xiaogang
>>>
>> I am still confused, how you avoid executing a second time a handler
>> you previously allocated because first queue_work failed,
>> you always start iteration from beginning of the list and you never
>> remove already successfully executed handlers.
>>
>> Andrey
>>
>>
> Not sure if understand your question. If the first time queuing work
> item success there is no second time queuing the same work item. The
> second work item is a different one although they use same
> handle_hpd_rx_irq. The queued work items are managed by work
> queue(queue_work). Work queue knows each queued work item status: still
> on queue, running, or exist. I look from already allocated work items to
> see if work queue allows me to queue, until find one. If all allocated
> work items cannot be queued, allocate a new work item and put it at the
> list of allocated work items.
>
> Thanks
>
> Xiaogang
>
>
>
>>>>> +
>>>>> +        INIT_WORK(&handler_data_add->work, dm_irq_work_func);
>>>>>    -    if (work) {
>>>>> -        if (!schedule_work(work))
>>>>> -            DRM_INFO("amdgpu_dm_irq_schedule_work FAILED src %d\n",
>>>>> -                        irq_source);
>>>>> +        if (queue_work(system_highpri_wq, &handler_data_add->work))
>>>>> +            DRM_DEBUG("__func__: a work_struct is allocated and
>>>>> queued, "
>>>>> +                     "src %d\n", irq_source);
>>>>> +        else
>>>>> +            DRM_ERROR("__func__: a new work_struct cannot be
>>>>> queued, "
>>>>> +                      "something is wrong, src %d\n", irq_source);
>>>>>        }
>>>>>      }
>>>
Chen, Xiaogang Jan. 22, 2021, 8:55 p.m. UTC | #7
On 1/19/2021 4:29 PM, Grodzovsky, Andrey wrote:

On 1/15/21 2:21 AM, Chen, Xiaogang wrote:
On 1/14/2021 1:24 AM, Grodzovsky, Andrey wrote:

On 1/14/21 12:11 AM, Chen, Xiaogang wrote:
On 1/12/2021 10:54 PM, Grodzovsky, Andrey wrote:
On 1/4/21 1:01 AM, Xiaogang.Chen wrote:
From: Xiaogang Chen <xiaogang.chen@amd.com><mailto:xiaogang.chen@amd.com>

amdgpu DM handles INTERRUPT_LOW_IRQ_CONTEXT interrupt(hpd, hpd_rx) by
using work queue and uses single work_struct. If previous interrupt
has not been handled new interrupts(same type) will be discarded and
driver just sends "amdgpu_dm_irq_schedule_work FAILED" message out.
If some important hpd, hpd_rx related interrupts are missed by driver
the hot (un)plug devices may cause system hang or unstable, such as
system resumes from S3 sleep with mst device connected.

This patch dynamically allocates new amdgpu_dm_irq_handler_data for
new interrupts if previous INTERRUPT_LOW_IRQ_CONTEXT interrupt work
has not been handled. So the new interrupt works can be queued to the
same workqueue_struct, instead discard the new interrupts.
All allocated amdgpu_dm_irq_handler_data are put into a single linked
list and will be reused after.

I believe this creates a possible concurrency between already
executing work item
and the new incoming one for which you allocate a new work item on
the fly. While
handle_hpd_irq is serialized with aconnector->hpd_lock I am seeing
that for handle_hpd_rx_irq
it's not locked for MST use case (which is the most frequently used
with this interrupt).  Did you
verified that handle_hpd_rx_irq is reentrant ?

handle_hpd_rx_irq is put at a work queue. Its execution is serialized
by the work queue. So there is no reentrant.

You are using system_highpri_wq which has the property that it has
multiple workers thread pool spread across all the
active CPUs, see all work queue definitions here
https://elixir.bootlin.com/linux/v5.11-rc3/source/include/linux/workqueue.h#L358
I beleieve that what you saying about no chance of reentrnacy would be
correct if it would be same work item dequeued for execution
while previous instance is still running, see the explanation here -
https://elixir.bootlin.com/linux/v5.11-rc3/source/kernel/workqueue.c#L1435.
Non reentrancy is guaranteed only for the same work item. If you want
non reentrancy (full serializtion) for different work items you should
create
you own single threaded work-queue using create_singlethread_workqueue


Thank you. I think the easiest way is using aconnector->hpd_lock at
handle_hpd_rx_irq to lock for dc_link->type == dc_connection_mst_branch
case, right? I will do that at next version if you think it is ok.


I am not sure what are the consequences of of using hpd lock there with
regard to other locks acquired in DRM MST code during MST related HPD transactions since
i haven't dealt with this for a very long time. Maybe Harry or Nick can advise on this ?

Andrey


Hi Harry, Nick: would you or someone give review on this patch?

Thanks

Xiaogang

amdgpu_dm_irq_schedule_work does queuing of work(put
handle_hpd_rx_irq into work queue). The first call is
dm_irq_work_func, then call handle_hpd_rx_irq.
Signed-off-by: Xiaogang Chen <xiaogang.chen@amd.com><mailto:xiaogang.chen@amd.com>
---
   drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h  |  14 +--
   .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c  | 114
++++++++++++++-------
   2 files changed, 80 insertions(+), 48 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
index c9d82b9..730e540 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
@@ -69,18 +69,6 @@ struct common_irq_params {
   };
     /**
- * struct irq_list_head - Linked-list for low context IRQ handlers.
- *
- * @head: The list_head within &struct handler_data
- * @work: A work_struct containing the deferred handler work
- */
-struct irq_list_head {
-    struct list_head head;
-    /* In case this interrupt needs post-processing, 'work' will
be queued*/
-    struct work_struct work;
-};
-
-/**
    * struct dm_compressor_info - Buffer info used by frame buffer
compression
    * @cpu_addr: MMIO cpu addr
    * @bo_ptr: Pointer to the buffer object
@@ -270,7 +258,7 @@ struct amdgpu_display_manager {
        * Note that handlers are called in the same order as they were
        * registered (FIFO).
        */
-    struct irq_list_head
irq_handler_list_low_tab[DAL_IRQ_SOURCES_NUMBER];
+    struct list_head
irq_handler_list_low_tab[DAL_IRQ_SOURCES_NUMBER];
         /**
        * @irq_handler_list_high_tab:
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
index 3577785..ada344a 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
@@ -82,6 +82,7 @@ struct amdgpu_dm_irq_handler_data {
       struct amdgpu_display_manager *dm;
       /* DAL irq source which registered for this interrupt. */
       enum dc_irq_source irq_source;
+    struct work_struct work;
   };
     #define DM_IRQ_TABLE_LOCK(adev, flags) \
@@ -111,20 +112,10 @@ static void init_handler_common_data(struct
amdgpu_dm_irq_handler_data *hcd,
    */
   static void dm_irq_work_func(struct work_struct *work)
   {
-    struct irq_list_head *irq_list_head =
-        container_of(work, struct irq_list_head, work);
-    struct list_head *handler_list = &irq_list_head->head;
-    struct amdgpu_dm_irq_handler_data *handler_data;
-
-    list_for_each_entry(handler_data, handler_list, list) {
-        DRM_DEBUG_KMS("DM_IRQ: work_func: for dal_src=%d\n",
-                handler_data->irq_source);
+    struct amdgpu_dm_irq_handler_data *handler_data =
+     container_of(work, struct amdgpu_dm_irq_handler_data, work);
   -        DRM_DEBUG_KMS("DM_IRQ: schedule_work: for dal_src=%d\n",
-            handler_data->irq_source);
-
- handler_data->handler(handler_data->handler_arg);
-    }
+    handler_data->handler(handler_data->handler_arg);
         /* Call a DAL subcomponent which registered for interrupt
notification
        * at INTERRUPT_LOW_IRQ_CONTEXT.
@@ -156,7 +147,7 @@ static struct list_head
*remove_irq_handler(struct amdgpu_device *adev,
           break;
       case INTERRUPT_LOW_IRQ_CONTEXT:
       default:
-        hnd_list =
&adev->dm.irq_handler_list_low_tab[irq_source].head;
+        hnd_list = &adev->dm.irq_handler_list_low_tab[irq_source];
           break;
       }
   @@ -287,7 +278,8 @@ void *amdgpu_dm_irq_register_interrupt(struct
amdgpu_device *adev,
           break;
       case INTERRUPT_LOW_IRQ_CONTEXT:
       default:
-        hnd_list =
&adev->dm.irq_handler_list_low_tab[irq_source].head;
+        hnd_list = &adev->dm.irq_handler_list_low_tab[irq_source];
+        INIT_WORK(&handler_data->work, dm_irq_work_func);
           break;
       }
   @@ -369,7 +361,7 @@ void
amdgpu_dm_irq_unregister_interrupt(struct amdgpu_device *adev,
   int amdgpu_dm_irq_init(struct amdgpu_device *adev)
   {
       int src;
-    struct irq_list_head *lh;
+    struct list_head *lh;
         DRM_DEBUG_KMS("DM_IRQ\n");
   @@ -378,9 +370,7 @@ int amdgpu_dm_irq_init(struct amdgpu_device
*adev)
       for (src = 0; src < DAL_IRQ_SOURCES_NUMBER; src++) {
           /* low context handler list init */
           lh = &adev->dm.irq_handler_list_low_tab[src];
-        INIT_LIST_HEAD(&lh->head);
-        INIT_WORK(&lh->work, dm_irq_work_func);
-
+        INIT_LIST_HEAD(lh);
           /* high context handler init */
INIT_LIST_HEAD(&adev->dm.irq_handler_list_high_tab[src]);
       }
@@ -397,8 +387,11 @@ int amdgpu_dm_irq_init(struct amdgpu_device
*adev)
   void amdgpu_dm_irq_fini(struct amdgpu_device *adev)
   {
       int src;
-    struct irq_list_head *lh;
+    struct list_head *lh;
+    struct list_head *entry, *tmp;
+    struct amdgpu_dm_irq_handler_data *handler;
       unsigned long irq_table_flags;
+
       DRM_DEBUG_KMS("DM_IRQ: releasing resources.\n");
       for (src = 0; src < DAL_IRQ_SOURCES_NUMBER; src++) {
           DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
@@ -407,7 +400,15 @@ void amdgpu_dm_irq_fini(struct amdgpu_device
*adev)
            * (because no code can schedule a new one). */
           lh = &adev->dm.irq_handler_list_low_tab[src];
           DM_IRQ_TABLE_UNLOCK(adev, irq_table_flags);
-        flush_work(&lh->work);
+
+        if (!list_empty(lh)) {
+            list_for_each_safe(entry, tmp, lh) {
+
+                handler = list_entry(entry, struct
amdgpu_dm_irq_handler_data,
+                                     list);
+                flush_work(&handler->work);
+            }
+        }
       }
   }
   @@ -417,6 +418,8 @@ int amdgpu_dm_irq_suspend(struct
amdgpu_device *adev)
       struct list_head *hnd_list_h;
       struct list_head *hnd_list_l;
       unsigned long irq_table_flags;
+    struct list_head *entry, *tmp;
+    struct amdgpu_dm_irq_handler_data *handler;
         DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
   @@ -427,14 +430,22 @@ int amdgpu_dm_irq_suspend(struct
amdgpu_device *adev)
        * will be disabled from manage_dm_interrupts on disable CRTC.
        */
       for (src = DC_IRQ_SOURCE_HPD1; src <= DC_IRQ_SOURCE_HPD6RX;
src++) {
-        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src].head;
+        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src];
           hnd_list_h = &adev->dm.irq_handler_list_high_tab[src];
           if (!list_empty(hnd_list_l) || !list_empty(hnd_list_h))
               dc_interrupt_set(adev->dm.dc, src, false);
             DM_IRQ_TABLE_UNLOCK(adev, irq_table_flags);
- flush_work(&adev->dm.irq_handler_list_low_tab[src].work);
   +        if (!list_empty(hnd_list_l)) {
+
+            list_for_each_safe(entry, tmp, hnd_list_l) {
+
+                handler = list_entry(entry, struct
amdgpu_dm_irq_handler_data,
+                                     list);
+                flush_work(&handler->work);
+            }
+        }
           DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
       }
   @@ -454,7 +465,7 @@ int amdgpu_dm_irq_resume_early(struct
amdgpu_device *adev)
         /* re-enable short pulse interrupts HW interrupt */
       for (src = DC_IRQ_SOURCE_HPD1RX; src <= DC_IRQ_SOURCE_HPD6RX;
src++) {
-        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src].head;
+        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src];
           hnd_list_h = &adev->dm.irq_handler_list_high_tab[src];
           if (!list_empty(hnd_list_l) || !list_empty(hnd_list_h))
               dc_interrupt_set(adev->dm.dc, src, true);
@@ -480,7 +491,7 @@ int amdgpu_dm_irq_resume_late(struct
amdgpu_device *adev)
        * will be enabled from manage_dm_interrupts on enable CRTC.
        */
       for (src = DC_IRQ_SOURCE_HPD1; src <= DC_IRQ_SOURCE_HPD6;
src++) {
-        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src].head;
+        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src];
           hnd_list_h = &adev->dm.irq_handler_list_high_tab[src];
           if (!list_empty(hnd_list_l) || !list_empty(hnd_list_h))
               dc_interrupt_set(adev->dm.dc, src, true);
@@ -497,20 +508,53 @@ int amdgpu_dm_irq_resume_late(struct
amdgpu_device *adev)
   static void amdgpu_dm_irq_schedule_work(struct amdgpu_device *adev,
                       enum dc_irq_source irq_source)
   {
-    unsigned long irq_table_flags;
-    struct work_struct *work = NULL;
   -    DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
+    struct  list_head *handler_list =
&adev->dm.irq_handler_list_low_tab[irq_source];
+    struct  amdgpu_dm_irq_handler_data *handler_data;
+    bool    work_queued = false;
   -    if
(!list_empty(&adev->dm.irq_handler_list_low_tab[irq_source].head))
-        work = &adev->dm.irq_handler_list_low_tab[irq_source].work;
+    if (list_empty(handler_list))
+        return;
   -    DM_IRQ_TABLE_UNLOCK(adev, irq_table_flags);
+    list_for_each_entry(handler_data, handler_list, list) {
+
+        if (!queue_work(system_highpri_wq, &handler_data->work)) {
+            continue;
+        } else {
+            work_queued = true;
+            break;
+        }
+    }
+
+    if (!work_queued) {
+
+        struct  amdgpu_dm_irq_handler_data *handler_data_add;
+        /*get the amdgpu_dm_irq_handler_data of first item pointed
by handler_list*/
+        handler_data = container_of(handler_list->next, struct
amdgpu_dm_irq_handler_data, list);
+
+        /*allocate a new amdgpu_dm_irq_handler_data*/
+        handler_data_add = kzalloc(sizeof(*handler_data),
GFP_KERNEL);
+        if (!handler_data_add) {
+            DRM_ERROR("DM_IRQ: failed to allocate irq handler!\n");
+            return;
+        }
+
+        /*copy new amdgpu_dm_irq_handler_data members from
handler_data*/
+        handler_data_add->handler       = handler_data->handler;
+        handler_data_add->handler_arg   = handler_data->handler_arg;
+        handler_data_add->dm            = handler_data->dm;
+        handler_data_add->irq_source    = irq_source;
+
+        list_add_tail(&handler_data_add->list, handler_list);

At what place are you deleting completed work items from the
handler_list ?

Andrey

The new allocated work item handler_data_add is put at end of single
list handler_list. It is not deleted, but put into this list. In the
future for same interrupt source handling the previous allocated work
items can be reused.  So we do not have to reallocate new work items
if previous interrupts have not been handled by cpu. On other side
system will keep all the allocated work items at run time. Note, the
new work item allocation only happens when cpu has not handled
previous interrupts yet, and new same type interrupts have came.

Thanks

Xiaogang

I am still confused, how you avoid executing a second time a handler
you previously allocated because first queue_work failed,
you always start iteration from beginning of the list and you never
remove already successfully executed handlers.

Andrey


Not sure if understand your question. If the first time queuing work
item success there is no second time queuing the same work item. The
second work item is a different one although they use same
handle_hpd_rx_irq. The queued work items are managed by work
queue(queue_work). Work queue knows each queued work item status: still
on queue, running, or exist. I look from already allocated work items to
see if work queue allows me to queue, until find one. If all allocated
work items cannot be queued, allocate a new work item and put it at the
list of allocated work items.

Thanks

Xiaogang



+
+        INIT_WORK(&handler_data_add->work, dm_irq_work_func);
   -    if (work) {
-        if (!schedule_work(work))
-            DRM_INFO("amdgpu_dm_irq_schedule_work FAILED src %d\n",
-                        irq_source);
+        if (queue_work(system_highpri_wq, &handler_data_add->work))
+            DRM_DEBUG("__func__: a work_struct is allocated and
queued, "
+                     "src %d\n", irq_source);
+        else
+            DRM_ERROR("__func__: a new work_struct cannot be
queued, "
+                      "something is wrong, src %d\n", irq_source);
       }
     }
Aurabindo Pillai Feb. 26, 2021, 10:52 p.m. UTC | #8
On 2021-01-22 3:55 p.m., Chen, Xiaogang wrote:
> On 1/19/2021 4:29 PM, Grodzovsky, Andrey wrote:
>>
>> On 1/15/21 2:21 AM, Chen, Xiaogang wrote:
>>> On 1/14/2021 1:24 AM, Grodzovsky, Andrey wrote:
>>>>
>>>> On 1/14/21 12:11 AM, Chen, Xiaogang wrote:
>>>>> On 1/12/2021 10:54 PM, Grodzovsky, Andrey wrote:
>>>>>> On 1/4/21 1:01 AM, Xiaogang.Chen wrote:
>>>>>>> From: Xiaogang Chen <xiaogang.chen@amd.com>
>>>>>>>
>>>>>>> amdgpu DM handles INTERRUPT_LOW_IRQ_CONTEXT interrupt(hpd, 
>>>>>>> hpd_rx) by
>>>>>>> using work queue and uses single work_struct. If previous interrupt
>>>>>>> has not been handled new interrupts(same type) will be discarded and
>>>>>>> driver just sends "amdgpu_dm_irq_schedule_work FAILED" message out.
>>>>>>> If some important hpd, hpd_rx related interrupts are missed by 
>>>>>>> driver
>>>>>>> the hot (un)plug devices may cause system hang or unstable, such as
>>>>>>> system resumes from S3 sleep with mst device connected.
>>>>>>>
>>>>>>> This patch dynamically allocates new amdgpu_dm_irq_handler_data for
>>>>>>> new interrupts if previous INTERRUPT_LOW_IRQ_CONTEXT interrupt work
>>>>>>> has not been handled. So the new interrupt works can be queued to 
>>>>>>> the
>>>>>>> same workqueue_struct, instead discard the new interrupts.
>>>>>>> All allocated amdgpu_dm_irq_handler_data are put into a single 
>>>>>>> linked
>>>>>>> list and will be reused after.
>>>>>>
>>>>>> I believe this creates a possible concurrency between already
>>>>>> executing work item
>>>>>> and the new incoming one for which you allocate a new work item on
>>>>>> the fly. While
>>>>>> handle_hpd_irq is serialized with aconnector->hpd_lock I am seeing
>>>>>> that for handle_hpd_rx_irq
>>>>>> it's not locked for MST use case (which is the most frequently used
>>>>>> with this interrupt).  Did you
>>>>>> verified that handle_hpd_rx_irq is reentrant ?
>>>>>>
>>>>> handle_hpd_rx_irq is put at a work queue. Its execution is serialized
>>>>> by the work queue. So there is no reentrant.
>>>>>
>>>> You are using system_highpri_wq which has the property that it has
>>>> multiple workers thread pool spread across all the
>>>> active CPUs, see all work queue definitions here
>>>> https://elixir.bootlin.com/linux/v5.11-rc3/source/include/linux/workqueue.h#L358 
>>>>
>>>> I beleieve that what you saying about no chance of reentrnacy would be
>>>> correct if it would be same work item dequeued for execution
>>>> while previous instance is still running, see the explanation here -
>>>> https://elixir.bootlin.com/linux/v5.11-rc3/source/kernel/workqueue.c#L1435. 
>>>>
>>>> Non reentrancy is guaranteed only for the same work item. If you want
>>>> non reentrancy (full serializtion) for different work items you should
>>>> create
>>>> you own single threaded work-queue using create_singlethread_workqueue
>>>>
>>>>
>>> Thank you. I think the easiest way is using aconnector->hpd_lock at
>>> handle_hpd_rx_irq to lock for dc_link->type == dc_connection_mst_branch
>>> case, right? I will do that at next version if you think it is ok.
>>
>>
>> I am not sure what are the consequences of of using hpd lock there with
>> regard to other locks acquired in DRM MST code during MST related HPD 
>> transactions since
>> i haven't dealt with this for a very long time. Maybe Harry or Nick 
>> can advise on this ?
>>
>> Andrey
>>
> Hi Harry, Nick: would you or someone give review on this patch?
> 
> Thanks
> 
> Xiaogang
> 
>>>
>>>>> amdgpu_dm_irq_schedule_work does queuing of work(put
>>>>> handle_hpd_rx_irq into work queue). The first call is
>>>>> dm_irq_work_func, then call handle_hpd_rx_irq.
>>>>>>> Signed-off-by: Xiaogang Chen <xiaogang.chen@amd.com>
>>>>>>> ---
>>>>>>>    drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h  |  14 +--
>>>>>>>    .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c  | 114
>>>>>>> ++++++++++++++-------
>>>>>>>    2 files changed, 80 insertions(+), 48 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
>>>>>>> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
>>>>>>> index c9d82b9..730e540 100644
>>>>>>> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
>>>>>>> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
>>>>>>> @@ -69,18 +69,6 @@ struct common_irq_params {
>>>>>>>    };
>>>>>>>      /**
>>>>>>> - * struct irq_list_head - Linked-list for low context IRQ handlers.
>>>>>>> - *
>>>>>>> - * @head: The list_head within &struct handler_data
>>>>>>> - * @work: A work_struct containing the deferred handler work
>>>>>>> - */
>>>>>>> -struct irq_list_head {
>>>>>>> -    struct list_head head;
>>>>>>> -    /* In case this interrupt needs post-processing, 'work' will
>>>>>>> be queued*/
>>>>>>> -    struct work_struct work;
>>>>>>> -};
>>>>>>> -
>>>>>>> -/**
>>>>>>>     * struct dm_compressor_info - Buffer info used by frame buffer
>>>>>>> compression
>>>>>>>     * @cpu_addr: MMIO cpu addr
>>>>>>>     * @bo_ptr: Pointer to the buffer object
>>>>>>> @@ -270,7 +258,7 @@ struct amdgpu_display_manager {
>>>>>>>         * Note that handlers are called in the same order as they 
>>>>>>> were
>>>>>>>         * registered (FIFO).
>>>>>>>         */
>>>>>>> -    struct irq_list_head
>>>>>>> irq_handler_list_low_tab[DAL_IRQ_SOURCES_NUMBER];
>>>>>>> +    struct list_head
>>>>>>> irq_handler_list_low_tab[DAL_IRQ_SOURCES_NUMBER];
>>>>>>>          /**
>>>>>>>         * @irq_handler_list_high_tab:
>>>>>>> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
>>>>>>> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
>>>>>>> index 3577785..ada344a 100644
>>>>>>> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
>>>>>>> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
>>>>>>> @@ -82,6 +82,7 @@ struct amdgpu_dm_irq_handler_data {
>>>>>>>        struct amdgpu_display_manager *dm;
>>>>>>>        /* DAL irq source which registered for this interrupt. */
>>>>>>>        enum dc_irq_source irq_source;
>>>>>>> +    struct work_struct work;
>>>>>>>    };
>>>>>>>      #define DM_IRQ_TABLE_LOCK(adev, flags) \
>>>>>>> @@ -111,20 +112,10 @@ static void init_handler_common_data(struct
>>>>>>> amdgpu_dm_irq_handler_data *hcd,
>>>>>>>     */
>>>>>>>    static void dm_irq_work_func(struct work_struct *work)
>>>>>>>    {
>>>>>>> -    struct irq_list_head *irq_list_head =
>>>>>>> -        container_of(work, struct irq_list_head, work);
>>>>>>> -    struct list_head *handler_list = &irq_list_head->head;
>>>>>>> -    struct amdgpu_dm_irq_handler_data *handler_data;
>>>>>>> -
>>>>>>> -    list_for_each_entry(handler_data, handler_list, list) {
>>>>>>> -        DRM_DEBUG_KMS("DM_IRQ: work_func: for dal_src=%d\n",
>>>>>>> -                handler_data->irq_source);
>>>>>>> +    struct amdgpu_dm_irq_handler_data *handler_data =
>>>>>>> +     container_of(work, struct amdgpu_dm_irq_handler_data, work);
>>>>>>>    -        DRM_DEBUG_KMS("DM_IRQ: schedule_work: for dal_src=%d\n",
>>>>>>> -            handler_data->irq_source);
>>>>>>> -
>>>>>>> - handler_data->handler(handler_data->handler_arg);
>>>>>>> -    }
>>>>>>> +    handler_data->handler(handler_data->handler_arg);
>>>>>>>          /* Call a DAL subcomponent which registered for interrupt
>>>>>>> notification
>>>>>>>         * at INTERRUPT_LOW_IRQ_CONTEXT.
>>>>>>> @@ -156,7 +147,7 @@ static struct list_head
>>>>>>> *remove_irq_handler(struct amdgpu_device *adev,
>>>>>>>            break;
>>>>>>>        case INTERRUPT_LOW_IRQ_CONTEXT:
>>>>>>>        default:
>>>>>>> -        hnd_list =
>>>>>>> &adev->dm.irq_handler_list_low_tab[irq_source].head;
>>>>>>> +        hnd_list = &adev->dm.irq_handler_list_low_tab[irq_source];
>>>>>>>            break;
>>>>>>>        }
>>>>>>>    @@ -287,7 +278,8 @@ void *amdgpu_dm_irq_register_interrupt(struct
>>>>>>> amdgpu_device *adev,
>>>>>>>            break;
>>>>>>>        case INTERRUPT_LOW_IRQ_CONTEXT:
>>>>>>>        default:
>>>>>>> -        hnd_list =
>>>>>>> &adev->dm.irq_handler_list_low_tab[irq_source].head;
>>>>>>> +        hnd_list = &adev->dm.irq_handler_list_low_tab[irq_source];
>>>>>>> +        INIT_WORK(&handler_data->work, dm_irq_work_func);
>>>>>>>            break;
>>>>>>>        }
>>>>>>>    @@ -369,7 +361,7 @@ void
>>>>>>> amdgpu_dm_irq_unregister_interrupt(struct amdgpu_device *adev,
>>>>>>>    int amdgpu_dm_irq_init(struct amdgpu_device *adev)
>>>>>>>    {
>>>>>>>        int src;
>>>>>>> -    struct irq_list_head *lh;
>>>>>>> +    struct list_head *lh;
>>>>>>>          DRM_DEBUG_KMS("DM_IRQ\n");
>>>>>>>    @@ -378,9 +370,7 @@ int amdgpu_dm_irq_init(struct amdgpu_device
>>>>>>> *adev)
>>>>>>>        for (src = 0; src < DAL_IRQ_SOURCES_NUMBER; src++) {
>>>>>>>            /* low context handler list init */
>>>>>>>            lh = &adev->dm.irq_handler_list_low_tab[src];
>>>>>>> -        INIT_LIST_HEAD(&lh->head);
>>>>>>> -        INIT_WORK(&lh->work, dm_irq_work_func);
>>>>>>> -
>>>>>>> +        INIT_LIST_HEAD(lh);
>>>>>>>            /* high context handler init */
>>>>>>> INIT_LIST_HEAD(&adev->dm.irq_handler_list_high_tab[src]);
>>>>>>>        }
>>>>>>> @@ -397,8 +387,11 @@ int amdgpu_dm_irq_init(struct amdgpu_device
>>>>>>> *adev)
>>>>>>>    void amdgpu_dm_irq_fini(struct amdgpu_device *adev)
>>>>>>>    {
>>>>>>>        int src;
>>>>>>> -    struct irq_list_head *lh;
>>>>>>> +    struct list_head *lh;
>>>>>>> +    struct list_head *entry, *tmp;
>>>>>>> +    struct amdgpu_dm_irq_handler_data *handler;
>>>>>>>        unsigned long irq_table_flags;
>>>>>>> +
>>>>>>>        DRM_DEBUG_KMS("DM_IRQ: releasing resources.\n");
>>>>>>>        for (src = 0; src < DAL_IRQ_SOURCES_NUMBER; src++) {
>>>>>>>            DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
>>>>>>> @@ -407,7 +400,15 @@ void amdgpu_dm_irq_fini(struct amdgpu_device
>>>>>>> *adev)
>>>>>>>             * (because no code can schedule a new one). */
>>>>>>>            lh = &adev->dm.irq_handler_list_low_tab[src];
>>>>>>>            DM_IRQ_TABLE_UNLOCK(adev, irq_table_flags);
>>>>>>> -        flush_work(&lh->work);
>>>>>>> +
>>>>>>> +        if (!list_empty(lh)) {
>>>>>>> +            list_for_each_safe(entry, tmp, lh) {
>>>>>>> +

Extra newline not necessary.

>>>>>>> +                handler = list_entry(entry, struct
>>>>>>> amdgpu_dm_irq_handler_data,
>>>>>>> +                                     list);
>>>>>>> +                flush_work(&handler->work);
>>>>>>> +            }
>>>>>>> +        }
>>>>>>>        }
>>>>>>>    }
>>>>>>>    @@ -417,6 +418,8 @@ int amdgpu_dm_irq_suspend(struct
>>>>>>> amdgpu_device *adev)
>>>>>>>        struct list_head *hnd_list_h;
>>>>>>>        struct list_head *hnd_list_l;
>>>>>>>        unsigned long irq_table_flags;
>>>>>>> +    struct list_head *entry, *tmp;
>>>>>>> +    struct amdgpu_dm_irq_handler_data *handler;
>>>>>>>          DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
>>>>>>>    @@ -427,14 +430,22 @@ int amdgpu_dm_irq_suspend(struct
>>>>>>> amdgpu_device *adev)
>>>>>>>         * will be disabled from manage_dm_interrupts on disable 
>>>>>>> CRTC.
>>>>>>>         */
>>>>>>>        for (src = DC_IRQ_SOURCE_HPD1; src <= DC_IRQ_SOURCE_HPD6RX;
>>>>>>> src++) {
>>>>>>> -        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src].head;
>>>>>>> +        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src];
>>>>>>>            hnd_list_h = &adev->dm.irq_handler_list_high_tab[src];
>>>>>>>            if (!list_empty(hnd_list_l) || !list_empty(hnd_list_h))
>>>>>>>                dc_interrupt_set(adev->dm.dc, src, false);
>>>>>>>              DM_IRQ_TABLE_UNLOCK(adev, irq_table_flags);
>>>>>>> - flush_work(&adev->dm.irq_handler_list_low_tab[src].work);
>>>>>>>    +        if (!list_empty(hnd_list_l)) {
>>>>>>> +

Extra newline

>>>>>>> +            list_for_each_safe(entry, tmp, hnd_list_l) {
>>>>>>> +
>>>>>>> +                handler = list_entry(entry, struct
>>>>>>> amdgpu_dm_irq_handler_data,
>>>>>>> +                                     list);
>>>>>>> +                flush_work(&handler->work);
>>>>>>> +            }
>>>>>>> +        }
>>>>>>>            DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
>>>>>>>        }
>>>>>>>    @@ -454,7 +465,7 @@ int amdgpu_dm_irq_resume_early(struct
>>>>>>> amdgpu_device *adev)
>>>>>>>          /* re-enable short pulse interrupts HW interrupt */
>>>>>>>        for (src = DC_IRQ_SOURCE_HPD1RX; src <= DC_IRQ_SOURCE_HPD6RX;
>>>>>>> src++) {
>>>>>>> -        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src].head;
>>>>>>> +        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src];
>>>>>>>            hnd_list_h = &adev->dm.irq_handler_list_high_tab[src];
>>>>>>>            if (!list_empty(hnd_list_l) || !list_empty(hnd_list_h))
>>>>>>>                dc_interrupt_set(adev->dm.dc, src, true);
>>>>>>> @@ -480,7 +491,7 @@ int amdgpu_dm_irq_resume_late(struct
>>>>>>> amdgpu_device *adev)
>>>>>>>         * will be enabled from manage_dm_interrupts on enable CRTC.
>>>>>>>         */
>>>>>>>        for (src = DC_IRQ_SOURCE_HPD1; src <= DC_IRQ_SOURCE_HPD6;
>>>>>>> src++) {
>>>>>>> -        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src].head;
>>>>>>> +        hnd_list_l = &adev->dm.irq_handler_list_low_tab[src];
>>>>>>>            hnd_list_h = &adev->dm.irq_handler_list_high_tab[src];
>>>>>>>            if (!list_empty(hnd_list_l) || !list_empty(hnd_list_h))
>>>>>>>                dc_interrupt_set(adev->dm.dc, src, true);
>>>>>>> @@ -497,20 +508,53 @@ int amdgpu_dm_irq_resume_late(struct
>>>>>>> amdgpu_device *adev)
>>>>>>>    static void amdgpu_dm_irq_schedule_work(struct amdgpu_device 
>>>>>>> *adev,
>>>>>>>                        enum dc_irq_source irq_source)
>>>>>>>    {
>>>>>>> -    unsigned long irq_table_flags;
>>>>>>> -    struct work_struct *work = NULL;
>>>>>>>    -    DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
>>>>>>> +    struct  list_head *handler_list =
>>>>>>> &adev->dm.irq_handler_list_low_tab[irq_source];
>>>>>>> +    struct  amdgpu_dm_irq_handler_data *handler_data;
>>>>>>> +    bool    work_queued = false;
>>>>>>>    -    if
>>>>>>> (!list_empty(&adev->dm.irq_handler_list_low_tab[irq_source].head))
>>>>>>> -        work = &adev->dm.irq_handler_list_low_tab[irq_source].work;
>>>>>>> +    if (list_empty(handler_list))
>>>>>>> +        return;
>>>>>>>    -    DM_IRQ_TABLE_UNLOCK(adev, irq_table_flags);
>>>>>>> +    list_for_each_entry(handler_data, handler_list, list) {
>>>>>>> +
>>>>>>> +        if (!queue_work(system_highpri_wq, &handler_data->work)) {
>>>>>>> +            continue;
>>>>>>> +        } else {
>>>>>>> +            work_queued = true;
>>>>>>> +            break;
>>>>>>> +        }
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    if (!work_queued) {
>>>>>>> +
>>>>>>> +        struct  amdgpu_dm_irq_handler_data *handler_data_add;
>>>>>>> +        /*get the amdgpu_dm_irq_handler_data of first item pointed
>>>>>>> by handler_list*/
>>>>>>> +        handler_data = container_of(handler_list->next, struct
>>>>>>> amdgpu_dm_irq_handler_data, list);
>>>>>>> +
>>>>>>> +        /*allocate a new amdgpu_dm_irq_handler_data*/
>>>>>>> +        handler_data_add = kzalloc(sizeof(*handler_data),
>>>>>>> GFP_KERNEL);
>>>>>>> +        if (!handler_data_add) {
>>>>>>> +            DRM_ERROR("DM_IRQ: failed to allocate irq handler!\n");
>>>>>>> +            return;
>>>>>>> +        }
>>>>>>> +
>>>>>>> +        /*copy new amdgpu_dm_irq_handler_data members from
>>>>>>> handler_data*/
>>>>>>> +        handler_data_add->handler       = handler_data->handler;
>>>>>>> +        handler_data_add->handler_arg   = 
>>>>>>> handler_data->handler_arg;
>>>>>>> +        handler_data_add->dm            = handler_data->dm;
>>>>>>> +        handler_data_add->irq_source    = irq_source;
>>>>>>> +
>>>>>>> +        list_add_tail(&handler_data_add->list, handler_list);
>>>>>>
>>>>>> At what place are you deleting completed work items from the
>>>>>> handler_list ?
>>>>>>
>>>>>> Andrey
>>>>>>
>>>>> The new allocated work item handler_data_add is put at end of single
>>>>> list handler_list. It is not deleted, but put into this list. In the
>>>>> future for same interrupt source handling the previous allocated work
>>>>> items can be reused.  So we do not have to reallocate new work items
>>>>> if previous interrupts have not been handled by cpu. On other side
>>>>> system will keep all the allocated work items at run time. Note, the
>>>>> new work item allocation only happens when cpu has not handled
>>>>> previous interrupts yet, and new same type interrupts have came.
>>>>>
>>>>> Thanks
>>>>>
>>>>> Xiaogang
>>>>>
>>>> I am still confused, how you avoid executing a second time a handler
>>>> you previously allocated because first queue_work failed,
>>>> you always start iteration from beginning of the list and you never
>>>> remove already successfully executed handlers.
>>>>
>>>> Andrey
>>>>
>>>>
>>> Not sure if understand your question. If the first time queuing work
>>> item success there is no second time queuing the same work item. The
>>> second work item is a different one although they use same
>>> handle_hpd_rx_irq. The queued work items are managed by work
>>> queue(queue_work). Work queue knows each queued work item status: still
>>> on queue, running, or exist. I look from already allocated work items to
>>> see if work queue allows me to queue, until find one. If all allocated
>>> work items cannot be queued, allocate a new work item and put it at the
>>> list of allocated work items.
>>>
>>> Thanks
>>>
>>> Xiaogang
>>>
>>>
>>>
>>>>>>> +
>>>>>>> +        INIT_WORK(&handler_data_add->work, dm_irq_work_func);
>>>>>>>    -    if (work) {
>>>>>>> -        if (!schedule_work(work))
>>>>>>> -            DRM_INFO("amdgpu_dm_irq_schedule_work FAILED src %d\n",
>>>>>>> -                        irq_source);
>>>>>>> +        if (queue_work(system_highpri_wq, &handler_data_add->work))
>>>>>>> +            DRM_DEBUG("__func__: a work_struct is allocated and
>>>>>>> queued, "
>>>>>>> +                     "src %d\n", irq_source);
>>>>>>> +        else
>>>>>>> +            DRM_ERROR("__func__: a new work_struct cannot be
>>>>>>> queued, "
>>>>>>> +                      "something is wrong, src %d\n", irq_source);

A better error message that points to the failure would be nice, 
something along the lines of  "Failed to queue work for handling 
interrupt request from display"

With the comments addressed, this series is:

Reviewed-by: Aurabindo Pillai <aurabindo.pillai at amd.com>

>>>>>>>        }
>>>>>>>      }
>>>>>
> 
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
index c9d82b9..730e540 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
@@ -69,18 +69,6 @@  struct common_irq_params {
 };
 
 /**
- * struct irq_list_head - Linked-list for low context IRQ handlers.
- *
- * @head: The list_head within &struct handler_data
- * @work: A work_struct containing the deferred handler work
- */
-struct irq_list_head {
-	struct list_head head;
-	/* In case this interrupt needs post-processing, 'work' will be queued*/
-	struct work_struct work;
-};
-
-/**
  * struct dm_compressor_info - Buffer info used by frame buffer compression
  * @cpu_addr: MMIO cpu addr
  * @bo_ptr: Pointer to the buffer object
@@ -270,7 +258,7 @@  struct amdgpu_display_manager {
 	 * Note that handlers are called in the same order as they were
 	 * registered (FIFO).
 	 */
-	struct irq_list_head irq_handler_list_low_tab[DAL_IRQ_SOURCES_NUMBER];
+	struct list_head irq_handler_list_low_tab[DAL_IRQ_SOURCES_NUMBER];
 
 	/**
 	 * @irq_handler_list_high_tab:
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
index 3577785..ada344a 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
@@ -82,6 +82,7 @@  struct amdgpu_dm_irq_handler_data {
 	struct amdgpu_display_manager *dm;
 	/* DAL irq source which registered for this interrupt. */
 	enum dc_irq_source irq_source;
+	struct work_struct work;
 };
 
 #define DM_IRQ_TABLE_LOCK(adev, flags) \
@@ -111,20 +112,10 @@  static void init_handler_common_data(struct amdgpu_dm_irq_handler_data *hcd,
  */
 static void dm_irq_work_func(struct work_struct *work)
 {
-	struct irq_list_head *irq_list_head =
-		container_of(work, struct irq_list_head, work);
-	struct list_head *handler_list = &irq_list_head->head;
-	struct amdgpu_dm_irq_handler_data *handler_data;
-
-	list_for_each_entry(handler_data, handler_list, list) {
-		DRM_DEBUG_KMS("DM_IRQ: work_func: for dal_src=%d\n",
-				handler_data->irq_source);
+	struct amdgpu_dm_irq_handler_data *handler_data =
+	 container_of(work, struct amdgpu_dm_irq_handler_data, work);
 
-		DRM_DEBUG_KMS("DM_IRQ: schedule_work: for dal_src=%d\n",
-			handler_data->irq_source);
-
-		handler_data->handler(handler_data->handler_arg);
-	}
+	handler_data->handler(handler_data->handler_arg);
 
 	/* Call a DAL subcomponent which registered for interrupt notification
 	 * at INTERRUPT_LOW_IRQ_CONTEXT.
@@ -156,7 +147,7 @@  static struct list_head *remove_irq_handler(struct amdgpu_device *adev,
 		break;
 	case INTERRUPT_LOW_IRQ_CONTEXT:
 	default:
-		hnd_list = &adev->dm.irq_handler_list_low_tab[irq_source].head;
+		hnd_list = &adev->dm.irq_handler_list_low_tab[irq_source];
 		break;
 	}
 
@@ -287,7 +278,8 @@  void *amdgpu_dm_irq_register_interrupt(struct amdgpu_device *adev,
 		break;
 	case INTERRUPT_LOW_IRQ_CONTEXT:
 	default:
-		hnd_list = &adev->dm.irq_handler_list_low_tab[irq_source].head;
+		hnd_list = &adev->dm.irq_handler_list_low_tab[irq_source];
+		INIT_WORK(&handler_data->work, dm_irq_work_func);
 		break;
 	}
 
@@ -369,7 +361,7 @@  void amdgpu_dm_irq_unregister_interrupt(struct amdgpu_device *adev,
 int amdgpu_dm_irq_init(struct amdgpu_device *adev)
 {
 	int src;
-	struct irq_list_head *lh;
+	struct list_head *lh;
 
 	DRM_DEBUG_KMS("DM_IRQ\n");
 
@@ -378,9 +370,7 @@  int amdgpu_dm_irq_init(struct amdgpu_device *adev)
 	for (src = 0; src < DAL_IRQ_SOURCES_NUMBER; src++) {
 		/* low context handler list init */
 		lh = &adev->dm.irq_handler_list_low_tab[src];
-		INIT_LIST_HEAD(&lh->head);
-		INIT_WORK(&lh->work, dm_irq_work_func);
-
+		INIT_LIST_HEAD(lh);
 		/* high context handler init */
 		INIT_LIST_HEAD(&adev->dm.irq_handler_list_high_tab[src]);
 	}
@@ -397,8 +387,11 @@  int amdgpu_dm_irq_init(struct amdgpu_device *adev)
 void amdgpu_dm_irq_fini(struct amdgpu_device *adev)
 {
 	int src;
-	struct irq_list_head *lh;
+	struct list_head *lh;
+	struct list_head *entry, *tmp;
+	struct amdgpu_dm_irq_handler_data *handler;
 	unsigned long irq_table_flags;
+
 	DRM_DEBUG_KMS("DM_IRQ: releasing resources.\n");
 	for (src = 0; src < DAL_IRQ_SOURCES_NUMBER; src++) {
 		DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
@@ -407,7 +400,15 @@  void amdgpu_dm_irq_fini(struct amdgpu_device *adev)
 		 * (because no code can schedule a new one). */
 		lh = &adev->dm.irq_handler_list_low_tab[src];
 		DM_IRQ_TABLE_UNLOCK(adev, irq_table_flags);
-		flush_work(&lh->work);
+
+		if (!list_empty(lh)) {
+			list_for_each_safe(entry, tmp, lh) {
+
+				handler = list_entry(entry, struct amdgpu_dm_irq_handler_data,
+									 list);
+				flush_work(&handler->work);
+			}
+		}
 	}
 }
 
@@ -417,6 +418,8 @@  int amdgpu_dm_irq_suspend(struct amdgpu_device *adev)
 	struct list_head *hnd_list_h;
 	struct list_head *hnd_list_l;
 	unsigned long irq_table_flags;
+	struct list_head *entry, *tmp;
+	struct amdgpu_dm_irq_handler_data *handler;
 
 	DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
 
@@ -427,14 +430,22 @@  int amdgpu_dm_irq_suspend(struct amdgpu_device *adev)
 	 * will be disabled from manage_dm_interrupts on disable CRTC.
 	 */
 	for (src = DC_IRQ_SOURCE_HPD1; src <= DC_IRQ_SOURCE_HPD6RX; src++) {
-		hnd_list_l = &adev->dm.irq_handler_list_low_tab[src].head;
+		hnd_list_l = &adev->dm.irq_handler_list_low_tab[src];
 		hnd_list_h = &adev->dm.irq_handler_list_high_tab[src];
 		if (!list_empty(hnd_list_l) || !list_empty(hnd_list_h))
 			dc_interrupt_set(adev->dm.dc, src, false);
 
 		DM_IRQ_TABLE_UNLOCK(adev, irq_table_flags);
-		flush_work(&adev->dm.irq_handler_list_low_tab[src].work);
 
+		if (!list_empty(hnd_list_l)) {
+
+			list_for_each_safe(entry, tmp, hnd_list_l) {
+
+				handler = list_entry(entry, struct amdgpu_dm_irq_handler_data,
+									 list);
+				flush_work(&handler->work);
+			}
+		}
 		DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
 	}
 
@@ -454,7 +465,7 @@  int amdgpu_dm_irq_resume_early(struct amdgpu_device *adev)
 
 	/* re-enable short pulse interrupts HW interrupt */
 	for (src = DC_IRQ_SOURCE_HPD1RX; src <= DC_IRQ_SOURCE_HPD6RX; src++) {
-		hnd_list_l = &adev->dm.irq_handler_list_low_tab[src].head;
+		hnd_list_l = &adev->dm.irq_handler_list_low_tab[src];
 		hnd_list_h = &adev->dm.irq_handler_list_high_tab[src];
 		if (!list_empty(hnd_list_l) || !list_empty(hnd_list_h))
 			dc_interrupt_set(adev->dm.dc, src, true);
@@ -480,7 +491,7 @@  int amdgpu_dm_irq_resume_late(struct amdgpu_device *adev)
 	 * will be enabled from manage_dm_interrupts on enable CRTC.
 	 */
 	for (src = DC_IRQ_SOURCE_HPD1; src <= DC_IRQ_SOURCE_HPD6; src++) {
-		hnd_list_l = &adev->dm.irq_handler_list_low_tab[src].head;
+		hnd_list_l = &adev->dm.irq_handler_list_low_tab[src];
 		hnd_list_h = &adev->dm.irq_handler_list_high_tab[src];
 		if (!list_empty(hnd_list_l) || !list_empty(hnd_list_h))
 			dc_interrupt_set(adev->dm.dc, src, true);
@@ -497,20 +508,53 @@  int amdgpu_dm_irq_resume_late(struct amdgpu_device *adev)
 static void amdgpu_dm_irq_schedule_work(struct amdgpu_device *adev,
 					enum dc_irq_source irq_source)
 {
-	unsigned long irq_table_flags;
-	struct work_struct *work = NULL;
 
-	DM_IRQ_TABLE_LOCK(adev, irq_table_flags);
+	struct  list_head *handler_list = &adev->dm.irq_handler_list_low_tab[irq_source];
+	struct  amdgpu_dm_irq_handler_data *handler_data;
+	bool    work_queued = false;
 
-	if (!list_empty(&adev->dm.irq_handler_list_low_tab[irq_source].head))
-		work = &adev->dm.irq_handler_list_low_tab[irq_source].work;
+	if (list_empty(handler_list))
+		return;
 
-	DM_IRQ_TABLE_UNLOCK(adev, irq_table_flags);
+	list_for_each_entry(handler_data, handler_list, list) {
+
+		if (!queue_work(system_highpri_wq, &handler_data->work)) {
+			continue;
+		} else {
+			work_queued = true;
+			break;
+		}
+	}
+
+	if (!work_queued) {
+
+		struct  amdgpu_dm_irq_handler_data *handler_data_add;
+		/*get the amdgpu_dm_irq_handler_data of first item pointed by handler_list*/
+		handler_data = container_of(handler_list->next, struct amdgpu_dm_irq_handler_data, list);
+
+		/*allocate a new amdgpu_dm_irq_handler_data*/
+		handler_data_add = kzalloc(sizeof(*handler_data), GFP_KERNEL);
+		if (!handler_data_add) {
+			DRM_ERROR("DM_IRQ: failed to allocate irq handler!\n");
+			return;
+		}
+
+		/*copy new amdgpu_dm_irq_handler_data members from handler_data*/
+		handler_data_add->handler       = handler_data->handler;
+		handler_data_add->handler_arg   = handler_data->handler_arg;
+		handler_data_add->dm            = handler_data->dm;
+		handler_data_add->irq_source    = irq_source;
+
+		list_add_tail(&handler_data_add->list, handler_list);
+
+		INIT_WORK(&handler_data_add->work, dm_irq_work_func);
 
-	if (work) {
-		if (!schedule_work(work))
-			DRM_INFO("amdgpu_dm_irq_schedule_work FAILED src %d\n",
-						irq_source);
+		if (queue_work(system_highpri_wq, &handler_data_add->work))
+			DRM_DEBUG("__func__: a work_struct is allocated and queued, "
+					 "src %d\n", irq_source);
+		else
+			DRM_ERROR("__func__: a new work_struct cannot be queued, "
+					  "something is wrong, src %d\n", irq_source);
 	}
 
 }