diff mbox series

[RFC,1/4] ACPI: APEI: Add support to notify the vendor specific HW errors

Message ID 20190812101149.26036-2-shiju.jose@huawei.com (mailing list archive)
State RFC, archived
Headers show
Series ACPI: APEI: Add support to notify the vendor specific HW errors | expand

Commit Message

Shiju Jose Aug. 12, 2019, 10:11 a.m. UTC
Presently the vendor specific HW errors, in the non-standard format,
are not reported to the vendor drivers for the recovery.

This patch adds support to notify the vendor specific HW errors to the
registered kernel drivers.

Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
---
 drivers/acpi/apei/ghes.c | 118 +++++++++++++++++++++++++++++++++++++++++++++--
 include/acpi/ghes.h      |  47 +++++++++++++++++++
 2 files changed, 160 insertions(+), 5 deletions(-)

Comments

James Morse Aug. 21, 2019, 5:23 p.m. UTC | #1
Hi,

On 12/08/2019 11:11, Shiju Jose wrote:
> Presently the vendor specific HW errors, in the non-standard format,
> are not reported to the vendor drivers for the recovery.
> 
> This patch adds support to notify the vendor specific HW errors to the
> registered kernel drivers.

> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index a66e00f..374d197 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -477,6 +477,77 @@ static void ghes_handle_aer(struct acpi_hest_generic_data *gdata)
>  #endif
>  }
>  
> +struct ghes_error_notify {
> +	struct list_head list;> +	struct rcu_head	rcu_head;
> +	guid_t sec_type; /* guid of the error record */

> +	error_handle handle; /* error handler function */

ghes_error_handler_t error_handler; ?


> +	void *data; /* handler driver's private data if any */
> +};
> +
> +/* List to store the registered error handling functions */
> +static DEFINE_MUTEX(ghes_error_notify_mutex);
> +static LIST_HEAD(ghes_error_notify_list);

> +static refcount_t ghes_ref_count;

I don't think this refcount is needed.


> +/**
> + * ghes_error_notify_register - register an error handling function
> + * for the hw errors.
> + * @sec_type: sec_type of the corresponding CPER to be notified.
> + * @handle: pointer to the error handling function.
> + * @data: handler driver's private data.
> + *
> + * return 0 : SUCCESS, non-zero : FAIL
> + */
> +int ghes_error_notify_register(guid_t sec_type, error_handle handle, void *data)
> +{
> +	struct ghes_error_notify *err_notify;
> +
> +	mutex_lock(&ghes_error_notify_mutex);
> +	err_notify = kzalloc(sizeof(*err_notify), GFP_KERNEL);
> +	if (!err_notify)
> +		return -ENOMEM;

Leaving the mutex locked.
You may as well allocate the memory before taking the lock.


> +
> +	err_notify->handle = handle;
> +	guid_copy(&err_notify->sec_type, &sec_type);
> +	err_notify->data = data;
> +	list_add_rcu(&err_notify->list, &ghes_error_notify_list);
> +	mutex_unlock(&ghes_error_notify_mutex);
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(ghes_error_notify_register);

Could we leave exporting this to modules until there is a user?


> +/**
> + * ghes_error_notify_unregister - unregister an error handling function.
> + * @sec_type: sec_type of the corresponding CPER.
> + * @handle: pointer to the error handling function.
> + *
> + * return none.
> + */
> +void ghes_error_notify_unregister(guid_t sec_type, error_handle handle)

Why do we need the handle(r) a second time? Surely there can only be one callback for a
given guid.


> +{
> +	struct ghes_error_notify *err_notify;
> +	bool found = 0;
> +
> +	mutex_lock(&ghes_error_notify_mutex);
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(err_notify, &ghes_error_notify_list, list) {
> +		if (guid_equal(&err_notify->sec_type, &sec_type) &&
> +		    err_notify->handle == handle) {
> +			list_del_rcu(&err_notify->list);
> +			found = 1;
> +			break;
> +		}
> +	}
> +	rcu_read_unlock();

> +	synchronize_rcu();

Is this for the kfree()? Please keep them together so its obvious what its for.
Putting it outside the mutex will also save any contended waiter some time.


> +	mutex_unlock(&ghes_error_notify_mutex);
> +	if (found)
> +		kfree(err_notify);
> +}
> +EXPORT_SYMBOL_GPL(ghes_error_notify_unregister);
> +

>  static void ghes_do_proc(struct ghes *ghes,
>  			 const struct acpi_hest_generic_status *estatus)
>  {> @@ -512,11 +585,29 @@ static void ghes_do_proc(struct ghes *ghes,
>  
>  			log_arm_hw_error(err);
>  		} else {
> -			void *err = acpi_hest_get_payload(gdata);
> -
> -			log_non_standard_event(sec_type, fru_id, fru_text,
> -					       sec_sev, err,
> -					       gdata->error_data_length);

> +			rcu_read_lock();
> +			list_for_each_entry_rcu(err_notify,
> +						&ghes_error_notify_list, list) {
> +				if (guid_equal(&err_notify->sec_type,
> +					       sec_type)) {

> +					/* The notification is called in the
> +					 * interrupt context, thus the handler
> +					 * functions should be take care of it.
> +					 */

I read this as "the handler will be called", which doesn't seem to be a useful comment.


> +					err_notify->handle(gdata, sev,
> +							   err_notify->data);
> +					is_notify = 1;

					break;

> +				}
> +			}
> +			rcu_read_unlock();

> +			if (!is_notify) {

if (!found) Seems more natural.


> +				void *err = acpi_hest_get_payload(gdata);
> +
> +				log_non_standard_event(sec_type, fru_id,
> +						       fru_text, sec_sev, err,
> +						       gdata->error_data_length);
> +			}

This is tricky to read as its so bunched up. Please pull it out into a separate function.
ghes_handle_non_standard_event() ?


Because you skip log_non_standard_event(), rasdaemon will no longer see these in
user-space. For any kernel consumer of these, we need to know we aren't breaking the
user-space component.


>  		}
>  	}
>  }
> @@ -1217,6 +1308,11 @@ static int ghes_probe(struct platform_device *ghes_dev)
>  
>  	ghes_edac_register(ghes, &ghes_dev->dev);
>  
> +	if (!refcount_read(&ghes_ref_count))
> +		refcount_set(&ghes_ref_count, 1);

What stops this from racing with itself if two ghes platform devices are probed at the
same time?

If the refcount needs initialising, please do it in ghes_init()....

> +	else
> +		refcount_inc(&ghes_ref_count);

.. but I don't think this refcount is needed.


>  	/* Handle any pending errors right away */
>  	spin_lock_irqsave(&ghes_notify_lock_irq, flags);
>  	ghes_proc(ghes);

> @@ -1279,6 +1376,17 @@ static int ghes_remove(struct platform_device *ghes_dev)
>  
>  	ghes_fini(ghes);
>  
> +	if (refcount_dec_and_test(&ghes_ref_count) &&
> +	    !list_empty(&ghes_error_notify_list)) {
> +		mutex_lock(&ghes_error_notify_mutex);> +		list_for_each_entry_safe(err_notify, tmp,
> +					 &ghes_error_notify_list, list) {
> +			list_del_rcu(&err_notify->list);
> +			kfree_rcu(err_notify, rcu_head);
> +		}
> +		mutex_unlock(&ghes_error_notify_mutex);
> +	}

... If someone unregisters, and re-registers all the GHES platform devices, the last one
out flushes the vendor-specific error handlers away. Then we re-probe the devices again,
but this time the vendor-specific error handlers don't work.

As you have an add/remove API for drivers, its up to drivers to cleanup when they are
removed. The comings and goings of GHES platform devices isn't relevant.


>  	ghes_edac_unregister(ghes);
>  
>  	kfree(ghes);
> diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
> index e3f1cdd..d480537 100644
> --- a/include/acpi/ghes.h
> +++ b/include/acpi/ghes.h
> @@ -50,6 +50,53 @@ enum {
>  	GHES_SEV_PANIC = 0x3,
>  };
>  
> +/**
> + * error_handle - error handling function for the hw errors.

Fatal errors get dealt with earlier, so drivers will never see them.
| error handling function for non-fatal hardware errors.


> + * This handle function is called in the interrupt context.

As this overrides ghes's logging of the error, we should mention:
| The handler is responsible for any logging of the error.


> + * @gdata: acpi_hest_generic_data.
> + * @sev: error severity of the entire error event defined in the
> + * ACPI spec table generic error status block.
> + * @data: handler driver's private data.
> + *
> + * return : none.
> + */
> +typedef void (*error_handle)(struct acpi_hest_generic_data *gdata, int sev,
> +			     void *data);


Thanks,

James
Shiju Jose Aug. 22, 2019, 4:57 p.m. UTC | #2
Hi James,

>-----Original Message-----
>From: James Morse [mailto:james.morse@arm.com]
>Sent: 21 August 2019 18:24
>To: Shiju Jose <shiju.jose@huawei.com>
>Cc: linux-acpi@vger.kernel.org; linux-edac@vger.kernel.org; linux-
>kernel@vger.kernel.org; rjw@rjwysocki.net; lenb@kernel.org;
>tony.luck@intel.com; bp@alien8.de; baicar@os.amperecomputing.com;
>Linuxarm <linuxarm@huawei.com>; Jonathan Cameron
><jonathan.cameron@huawei.com>; tanxiaofei <tanxiaofei@huawei.com>
>Subject: Re: [PATCH RFC 1/4] ACPI: APEI: Add support to notify the vendor
>specific HW errors
>
>Hi,
>
>On 12/08/2019 11:11, Shiju Jose wrote:
>> Presently the vendor specific HW errors, in the non-standard format,
>> are not reported to the vendor drivers for the recovery.
>>
>> This patch adds support to notify the vendor specific HW errors to the
>> registered kernel drivers.
>
>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index
>> a66e00f..374d197 100644
>> --- a/drivers/acpi/apei/ghes.c
>> +++ b/drivers/acpi/apei/ghes.c
>> @@ -477,6 +477,77 @@ static void ghes_handle_aer(struct
>> acpi_hest_generic_data *gdata)  #endif  }
>>
>> +struct ghes_error_notify {
>> +	struct list_head list;> +	struct rcu_head	rcu_head;
>> +	guid_t sec_type; /* guid of the error record */
>
>> +	error_handle handle; /* error handler function */
>
>ghes_error_handler_t error_handler; ?
Sure.

>
>
>> +	void *data; /* handler driver's private data if any */ };
>> +
>> +/* List to store the registered error handling functions */ static
>> +DEFINE_MUTEX(ghes_error_notify_mutex);
>> +static LIST_HEAD(ghes_error_notify_list);
>
>> +static refcount_t ghes_ref_count;
>
>I don't think this refcount is needed.
refcount was added to register standard error handlers with this notification method one time when
multiple ghes platform devices are probed.
 
>
>
>> +/**
>> + * ghes_error_notify_register - register an error handling function
>> + * for the hw errors.
>> + * @sec_type: sec_type of the corresponding CPER to be notified.
>> + * @handle: pointer to the error handling function.
>> + * @data: handler driver's private data.
>> + *
>> + * return 0 : SUCCESS, non-zero : FAIL  */ int
>> +ghes_error_notify_register(guid_t sec_type, error_handle handle, void
>> +*data) {
>> +	struct ghes_error_notify *err_notify;
>> +
>> +	mutex_lock(&ghes_error_notify_mutex);
>> +	err_notify = kzalloc(sizeof(*err_notify), GFP_KERNEL);
>> +	if (!err_notify)
>> +		return -ENOMEM;
>
>Leaving the mutex locked.
>You may as well allocate the memory before taking the lock.
Good spot. I will fix.

>
>
>> +
>> +	err_notify->handle = handle;
>> +	guid_copy(&err_notify->sec_type, &sec_type);
>> +	err_notify->data = data;
>> +	list_add_rcu(&err_notify->list, &ghes_error_notify_list);
>> +	mutex_unlock(&ghes_error_notify_mutex);
>> +
>> +	return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(ghes_error_notify_register);
>
>Could we leave exporting this to modules until there is a user?
>
>
>> +/**
>> + * ghes_error_notify_unregister - unregister an error handling function.
>> + * @sec_type: sec_type of the corresponding CPER.
>> + * @handle: pointer to the error handling function.
>> + *
>> + * return none.
>> + */
>> +void ghes_error_notify_unregister(guid_t sec_type, error_handle
>> +handle)
>
>Why do we need the handle(r) a second time? Surely there can only be one
>callback for a given guid.
There is a possibility of sharing the guid between drivers if the non-standard error section format is common
for more than one devices if the error data to be reported is in the same format.
 
>
>
>> +{
>> +	struct ghes_error_notify *err_notify;
>> +	bool found = 0;
>> +
>> +	mutex_lock(&ghes_error_notify_mutex);
>> +	rcu_read_lock();
>> +	list_for_each_entry_rcu(err_notify, &ghes_error_notify_list, list) {
>> +		if (guid_equal(&err_notify->sec_type, &sec_type) &&
>> +		    err_notify->handle == handle) {
>> +			list_del_rcu(&err_notify->list);
>> +			found = 1;
>> +			break;
>> +		}
>> +	}
>> +	rcu_read_unlock();
>
>> +	synchronize_rcu();
>
>Is this for the kfree()? Please keep them together so its obvious what its for.
>Putting it outside the mutex will also save any contended waiter some time.
Yes. I will move synchronize_rcu () just before kfree. 
 
>
>
>> +	mutex_unlock(&ghes_error_notify_mutex);
>> +	if (found)
>> +		kfree(err_notify);
>> +}
>> +EXPORT_SYMBOL_GPL(ghes_error_notify_unregister);
>> +
>
>>  static void ghes_do_proc(struct ghes *ghes,
>>  			 const struct acpi_hest_generic_status *estatus)  {>
>@@ -512,11
>> +585,29 @@ static void ghes_do_proc(struct ghes *ghes,
>>
>>  			log_arm_hw_error(err);
>>  		} else {
>> -			void *err = acpi_hest_get_payload(gdata);
>> -
>> -			log_non_standard_event(sec_type, fru_id, fru_text,
>> -					       sec_sev, err,
>> -					       gdata->error_data_length);
>
>> +			rcu_read_lock();
>> +			list_for_each_entry_rcu(err_notify,
>> +						&ghes_error_notify_list, list) {
>> +				if (guid_equal(&err_notify->sec_type,
>> +					       sec_type)) {
>
>> +					/* The notification is called in the
>> +					 * interrupt context, thus the handler
>> +					 * functions should be take care of it.
>> +					 */
>
>I read this as "the handler will be called", which doesn't seem to be a useful
>comment.
Ok. I will correct the comment.
>
>
>> +					err_notify->handle(gdata, sev,
>> +							   err_notify->data);
>> +					is_notify = 1;
>
>					break;
>
>> +				}
>> +			}
>> +			rcu_read_unlock();
>
>> +			if (!is_notify) {
>
>if (!found) Seems more natural.
Ok. I will change to "is_notify"  to "found".

>
>
>> +				void *err = acpi_hest_get_payload(gdata);
>> +
>> +				log_non_standard_event(sec_type, fru_id,
>> +						       fru_text, sec_sev, err,
>> +						       gdata->error_data_length);
>> +			}
>
>This is tricky to read as its so bunched up. Please pull it out into a separate
>function.
>ghes_handle_non_standard_event() ?
Ok. I will add to new ghes_handle_non_standard_event() function.

>
>
>Because you skip log_non_standard_event(), rasdaemon will no longer see
>these in user-space. For any kernel consumer of these, we need to know we
>aren't breaking the user-space component.
>
>
>>  		}
>>  	}
>>  }
>> @@ -1217,6 +1308,11 @@ static int ghes_probe(struct platform_device
>> *ghes_dev)
>>
>>  	ghes_edac_register(ghes, &ghes_dev->dev);
>>
>> +	if (!refcount_read(&ghes_ref_count))
>> +		refcount_set(&ghes_ref_count, 1);
>
>What stops this from racing with itself if two ghes platform devices are probed
>at the same time?
yes. It is an issue.
>
>If the refcount needs initialising, please do it in ghes_init()....
refcount was added to register the standard error handlers to the notification
method only for the first time when the ghes device probed multiple times.
I will check is it possible to avoid using refcount by moving the above registration
of standard error handlers to the ghes_init().
>
>> +	else
>> +		refcount_inc(&ghes_ref_count);
>
>.. but I don't think this refcount is needed.
>
>
>>  	/* Handle any pending errors right away */
>>  	spin_lock_irqsave(&ghes_notify_lock_irq, flags);
>>  	ghes_proc(ghes);
>
>> @@ -1279,6 +1376,17 @@ static int ghes_remove(struct platform_device
>> *ghes_dev)
>>
>>  	ghes_fini(ghes);
>>
>> +	if (refcount_dec_and_test(&ghes_ref_count) &&
>> +	    !list_empty(&ghes_error_notify_list)) {
>> +		mutex_lock(&ghes_error_notify_mutex);> +
>	list_for_each_entry_safe(err_notify, tmp,
>> +					 &ghes_error_notify_list, list) {
>> +			list_del_rcu(&err_notify->list);
>> +			kfree_rcu(err_notify, rcu_head);
>> +		}
>> +		mutex_unlock(&ghes_error_notify_mutex);
>> +	}
>
>... If someone unregisters, and re-registers all the GHES platform devices, the
>last one out flushes the vendor-specific error handlers away. Then we re-probe
>the devices again, but this time the vendor-specific error handlers don't work.
>
>As you have an add/remove API for drivers, its up to drivers to cleanup when
>they are removed. The comings and goings of GHES platform devices isn't
>relevant.
Ok. Got it. I will either keep the unregister for the standard error handlers only
if the standard error handlers can be part of the notification method or remove completely
if the registration can be done in the ghes_init().    
>
>
>>  	ghes_edac_unregister(ghes);
>>
>>  	kfree(ghes);
>> diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h index
>> e3f1cdd..d480537 100644
>> --- a/include/acpi/ghes.h
>> +++ b/include/acpi/ghes.h
>> @@ -50,6 +50,53 @@ enum {
>>  	GHES_SEV_PANIC = 0x3,
>>  };
>>
>> +/**
>> + * error_handle - error handling function for the hw errors.
>
>Fatal errors get dealt with earlier, so drivers will never see them.
>| error handling function for non-fatal hardware errors.
Ok. I will change the comment as recoverable HW errors.
>
>
>> + * This handle function is called in the interrupt context.
>
>As this overrides ghes's logging of the error, we should mention:
>| The handler is responsible for any logging of the error.
Ok. I will add in the comment.
>
>
>> + * @gdata: acpi_hest_generic_data.
>> + * @sev: error severity of the entire error event defined in the
>> + * ACPI spec table generic error status block.
>> + * @data: handler driver's private data.
>> + *
>> + * return : none.
>> + */
>> +typedef void (*error_handle)(struct acpi_hest_generic_data *gdata, int sev,
>> +			     void *data);
>
>
>Thanks,
>
>James
Thanks,
Shiju
diff mbox series

Patch

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index a66e00f..374d197 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -477,6 +477,77 @@  static void ghes_handle_aer(struct acpi_hest_generic_data *gdata)
 #endif
 }
 
+struct ghes_error_notify {
+	struct list_head list;
+	struct rcu_head	rcu_head;
+	guid_t sec_type; /* guid of the error record */
+	error_handle handle; /* error handler function */
+	void *data; /* handler driver's private data if any */
+};
+
+/* List to store the registered error handling functions */
+static DEFINE_MUTEX(ghes_error_notify_mutex);
+static LIST_HEAD(ghes_error_notify_list);
+static refcount_t ghes_ref_count;
+
+/**
+ * ghes_error_notify_register - register an error handling function
+ * for the hw errors.
+ * @sec_type: sec_type of the corresponding CPER to be notified.
+ * @handle: pointer to the error handling function.
+ * @data: handler driver's private data.
+ *
+ * return 0 : SUCCESS, non-zero : FAIL
+ */
+int ghes_error_notify_register(guid_t sec_type, error_handle handle, void *data)
+{
+	struct ghes_error_notify *err_notify;
+
+	mutex_lock(&ghes_error_notify_mutex);
+	err_notify = kzalloc(sizeof(*err_notify), GFP_KERNEL);
+	if (!err_notify)
+		return -ENOMEM;
+
+	err_notify->handle = handle;
+	guid_copy(&err_notify->sec_type, &sec_type);
+	err_notify->data = data;
+	list_add_rcu(&err_notify->list, &ghes_error_notify_list);
+	mutex_unlock(&ghes_error_notify_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ghes_error_notify_register);
+
+/**
+ * ghes_error_notify_unregister - unregister an error handling function.
+ * @sec_type: sec_type of the corresponding CPER.
+ * @handle: pointer to the error handling function.
+ *
+ * return none.
+ */
+void ghes_error_notify_unregister(guid_t sec_type, error_handle handle)
+{
+	struct ghes_error_notify *err_notify;
+	bool found = 0;
+
+	mutex_lock(&ghes_error_notify_mutex);
+	rcu_read_lock();
+	list_for_each_entry_rcu(err_notify, &ghes_error_notify_list, list) {
+		if (guid_equal(&err_notify->sec_type, &sec_type) &&
+		    err_notify->handle == handle) {
+			list_del_rcu(&err_notify->list);
+			found = 1;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	synchronize_rcu();
+	mutex_unlock(&ghes_error_notify_mutex);
+	if (found)
+		kfree(err_notify);
+}
+EXPORT_SYMBOL_GPL(ghes_error_notify_unregister);
+
 static void ghes_do_proc(struct ghes *ghes,
 			 const struct acpi_hest_generic_status *estatus)
 {
@@ -485,6 +556,8 @@  static void ghes_do_proc(struct ghes *ghes,
 	guid_t *sec_type;
 	guid_t *fru_id = &NULL_UUID_LE;
 	char *fru_text = "";
+	bool is_notify = 0;
+	struct ghes_error_notify *err_notify;
 
 	sev = ghes_severity(estatus->error_severity);
 	apei_estatus_for_each_section(estatus, gdata) {
@@ -512,11 +585,29 @@  static void ghes_do_proc(struct ghes *ghes,
 
 			log_arm_hw_error(err);
 		} else {
-			void *err = acpi_hest_get_payload(gdata);
-
-			log_non_standard_event(sec_type, fru_id, fru_text,
-					       sec_sev, err,
-					       gdata->error_data_length);
+			rcu_read_lock();
+			list_for_each_entry_rcu(err_notify,
+						&ghes_error_notify_list, list) {
+				if (guid_equal(&err_notify->sec_type,
+					       sec_type)) {
+					/* The notification is called in the
+					 * interrupt context, thus the handler
+					 * functions should be take care of it.
+					 */
+					err_notify->handle(gdata, sev,
+							   err_notify->data);
+					is_notify = 1;
+				}
+			}
+			rcu_read_unlock();
+
+			if (!is_notify) {
+				void *err = acpi_hest_get_payload(gdata);
+
+				log_non_standard_event(sec_type, fru_id,
+						       fru_text, sec_sev, err,
+						       gdata->error_data_length);
+			}
 		}
 	}
 }
@@ -1217,6 +1308,11 @@  static int ghes_probe(struct platform_device *ghes_dev)
 
 	ghes_edac_register(ghes, &ghes_dev->dev);
 
+	if (!refcount_read(&ghes_ref_count))
+		refcount_set(&ghes_ref_count, 1);
+	else
+		refcount_inc(&ghes_ref_count);
+
 	/* Handle any pending errors right away */
 	spin_lock_irqsave(&ghes_notify_lock_irq, flags);
 	ghes_proc(ghes);
@@ -1237,6 +1333,7 @@  static int ghes_remove(struct platform_device *ghes_dev)
 	int rc;
 	struct ghes *ghes;
 	struct acpi_hest_generic *generic;
+	struct ghes_error_notify *err_notify, *tmp;
 
 	ghes = platform_get_drvdata(ghes_dev);
 	generic = ghes->generic;
@@ -1279,6 +1376,17 @@  static int ghes_remove(struct platform_device *ghes_dev)
 
 	ghes_fini(ghes);
 
+	if (refcount_dec_and_test(&ghes_ref_count) &&
+	    !list_empty(&ghes_error_notify_list)) {
+		mutex_lock(&ghes_error_notify_mutex);
+		list_for_each_entry_safe(err_notify, tmp,
+					 &ghes_error_notify_list, list) {
+			list_del_rcu(&err_notify->list);
+			kfree_rcu(err_notify, rcu_head);
+		}
+		mutex_unlock(&ghes_error_notify_mutex);
+	}
+
 	ghes_edac_unregister(ghes);
 
 	kfree(ghes);
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index e3f1cdd..d480537 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -50,6 +50,53 @@  enum {
 	GHES_SEV_PANIC = 0x3,
 };
 
+/**
+ * error_handle - error handling function for the hw errors.
+ * This handle function is called in the interrupt context.
+ * @gdata: acpi_hest_generic_data.
+ * @sev: error severity of the entire error event defined in the
+ * ACPI spec table generic error status block.
+ * @data: handler driver's private data.
+ *
+ * return : none.
+ */
+typedef void (*error_handle)(struct acpi_hest_generic_data *gdata, int sev,
+			     void *data);
+
+#ifdef CONFIG_ACPI_APEI_GHES
+/**
+ * ghes_error_notify_register - register an error handling function
+ * for the hw errors.
+ * @sec_type: sec_type of the corresponding CPER to be notified.
+ * @handle: pointer to the error handling function.
+ * @data: handler driver's private data.
+ *
+ * return : 0 - SUCCESS, non-zero - FAIL.
+ */
+int ghes_error_notify_register(guid_t sec_type, error_handle handle,
+			       void *data);
+
+/**
+ * ghes_error_notify_unregister - unregister an error handling function
+ * for the hw errors.
+ * @sec_type: sec_type of the corresponding CPER.
+ * @handle: pointer to the error handling function.
+ *
+ * return none.
+ */
+void ghes_error_notify_unregister(guid_t sec_type, error_handle handle);
+
+#else
+int ghes_error_notify_register(guid_t sec_type, error_handle handle, void *data)
+{
+	return -ENODEV;
+}
+
+void ghes_error_notify_unregister(guid_t sec_type, error_handle handle)
+{
+}
+#endif
+
 int ghes_estatus_pool_init(int num_ghes);
 
 /* From drivers/edac/ghes_edac.c */