diff mbox series

[v4,07/13] firmware: arm_scmi: Add notification dispatch and delivery

Message ID 20200304162558.48836-8-cristian.marussi@arm.com (mailing list archive)
State New, archived
Headers show
Series SCMI Notifications Core Support | expand

Commit Message

Cristian Marussi March 4, 2020, 4:25 p.m. UTC
Add core SCMI Notifications dispatch and delivery support logic which is
able, at first, to dispatch well-known received events from the RX ISR to
the dedicated deferred worker, and then, from there, to final deliver the
events to the registered users' callbacks.

Dispatch and delivery is just added here, still not enabled.

Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
---
V3 --> V4
- dispatcher now handles dequeuing of events in chunks (header+payload):
  handling of these in_flight events let us remove one unneeded memcpy
  on RX interrupt path (scmi_notify)
- deferred dispatcher now access their own per-protocol handlers' table
  reducing locking contention on the RX path
V2 --> V3
- exposing wq in sysfs via WQ_SYSFS
V1 --> V2
- splitted out of V1 patch 04
- moved from IDR maps to real HashTables to store event_handlers
- simplified delivery logic
---
 drivers/firmware/arm_scmi/notify.c | 334 ++++++++++++++++++++++++++++-
 drivers/firmware/arm_scmi/notify.h |   9 +
 2 files changed, 342 insertions(+), 1 deletion(-)

Comments

Jonathan Cameron March 9, 2020, 12:26 p.m. UTC | #1
On Wed, 4 Mar 2020 16:25:52 +0000
Cristian Marussi <cristian.marussi@arm.com> wrote:

> Add core SCMI Notifications dispatch and delivery support logic which is
> able, at first, to dispatch well-known received events from the RX ISR to
> the dedicated deferred worker, and then, from there, to final deliver the
> events to the registered users' callbacks.
> 
> Dispatch and delivery is just added here, still not enabled.
> 
> Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>

Hmm.  Doing that magic in_flight stuff looks fine, but it feels like
the wrong way to approach a problem which is down to the lack of
atomicity of the kfifo_in pair.   Could we just make that atomic via
a bit of custom manipulation of the kfifo?

The snag is that stuff isn't exported from the innards of kfifo...

Maybe what you have here is the best option.

Jonathan

> ---
> V3 --> V4
> - dispatcher now handles dequeuing of events in chunks (header+payload):
>   handling of these in_flight events let us remove one unneeded memcpy
>   on RX interrupt path (scmi_notify)
> - deferred dispatcher now access their own per-protocol handlers' table
>   reducing locking contention on the RX path
> V2 --> V3
> - exposing wq in sysfs via WQ_SYSFS
> V1 --> V2
> - splitted out of V1 patch 04
> - moved from IDR maps to real HashTables to store event_handlers
> - simplified delivery logic
> ---
>  drivers/firmware/arm_scmi/notify.c | 334 ++++++++++++++++++++++++++++-
>  drivers/firmware/arm_scmi/notify.h |   9 +
>  2 files changed, 342 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/firmware/arm_scmi/notify.c b/drivers/firmware/arm_scmi/notify.c
> index d6c08cce3c63..0854d48d5886 100644
> --- a/drivers/firmware/arm_scmi/notify.c
> +++ b/drivers/firmware/arm_scmi/notify.c
> @@ -44,6 +44,27 @@
>   * as described in the SCMI Protocol specification, while src_id represents an
>   * optional, protocol dependent, source identifier (like domain_id, perf_id
>   * or sensor_id and so forth).
> + *
> + * Upon reception of a notification message from the platform the SCMI RX ISR
> + * passes the received message payload and some ancillary information (including
> + * an arrival timestamp in nanoseconds) to the core via @scmi_notify() which
> + * pushes the event-data itself on a protocol-dedicated kfifo queue for further
> + * deferred processing as specified in @scmi_events_dispatcher().
> + *
> + * Each protocol has it own dedicated work_struct and worker which, once kicked
> + * by the ISR, takes care to empty its own dedicated queue, deliverying the
> + * queued items into the proper notification-chain: notifications processing can
> + * proceed concurrently on distinct workers only between events belonging to
> + * different protocols while delivery of events within the same protocol is
> + * still strictly sequentially ordered by time of arrival.
> + *
> + * Events' information is then extracted from the SCMI Notification messages and
> + * conveyed, converted into a custom per-event report struct, as the void *data
> + * param to the user callback provided by the registered notifier_block, so that
> + * from the user perspective his callback will look invoked like:
> + *
> + * int user_cb(struct notifier_block *nb, unsigned long event_id, void *report)
> + *
>   */
>  
>  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> @@ -64,6 +85,7 @@
>  #include <linux/scmi_protocol.h>
>  #include <linux/slab.h>
>  #include <linux/types.h>
> +#include <linux/workqueue.h>
>  
>  #include "notify.h"
>  
> @@ -143,6 +165,8 @@
>  #define REVT_NOTIFY_ENABLE(revt, ...)	\
>  	((revt)->proto->ops->set_notify_enabled((revt)->proto->ni->handle,     \
>  						__VA_ARGS__))
> +#define REVT_FILL_REPORT(revt, ...)	\
> +	((revt)->proto->ops->fill_custom_report(__VA_ARGS__))
>  
>  struct scmi_registered_protocol_events_desc;
>  
> @@ -158,6 +182,7 @@ struct scmi_registered_protocol_events_desc;
>   *		 and protocols are allowed to register their supported events
>   * @enabled: A flag to indicate events can be enabled and start flowing
>   * @init_work: A work item to perform final initializations of pending handlers
> + * @notify_wq: A reference to the allocated Kernel cmwq
>   * @pending_mtx: A mutex to protect @pending_events_handlers
>   * @registered_protocols: An statically allocated array containing pointers to
>   *			  all the registered protocol-level specific information
> @@ -173,6 +198,8 @@ struct scmi_notify_instance {
>  
>  	struct work_struct				init_work;
>  
> +	struct workqueue_struct				*notify_wq;
> +
>  	struct mutex					pending_mtx;
>  	struct scmi_registered_protocol_events_desc	**registered_protocols;
>  	DECLARE_HASHTABLE(pending_events_handlers, 8);
> @@ -186,11 +213,15 @@ struct scmi_notify_instance {
>   * @sz: Size in bytes of the related kfifo
>   * @qbuf: Pre-allocated buffer of @sz bytes to be used by the kfifo
>   * @kfifo: A dedicated Kernel kfifo descriptor
> + * @notify_work: A custom work item bound to this queue
> + * @wq: A reference to the associated workqueue
>   */
>  struct events_queue {
>  	size_t				sz;
>  	u8				*qbuf;
>  	struct kfifo			kfifo;
> +	struct work_struct		notify_work;
> +	struct workqueue_struct		*wq;
>  };
>  
>  /**
> @@ -316,8 +347,249 @@ struct scmi_event_handler {
>  
>  #define IS_HNDL_PENDING(hndl)	((hndl)->r_evt == NULL)
>  
> +static struct scmi_event_handler *
> +scmi_get_active_handler(struct scmi_notify_instance *ni, u32 evt_key);
> +static void scmi_put_active_handler(struct scmi_notify_instance *ni,
> +				    struct scmi_event_handler *hndl);
>  static void scmi_put_handler_unlocked(struct scmi_notify_instance *ni,
>  				      struct scmi_event_handler *hndl);
> +
> +/**
> + * scmi_lookup_and_call_event_chain  - Lookup the proper chain and call it
> + *
> + * @ni: A reference to the notification instance to use
> + * @evt_key: The key to use to lookup the related notification chain
> + * @report: The customized event-specific report to pass down to the callbacks
> + *	    as their *data parameter.
> + */
> +static inline void
> +scmi_lookup_and_call_event_chain(struct scmi_notify_instance *ni,
> +				 u32 evt_key, void *report)
> +{
> +	int ret;
> +	struct scmi_event_handler *hndl;
> +
> +	/* Here ensure the event handler cannot vanish while using it */
> +	hndl = scmi_get_active_handler(ni, evt_key);
> +	if (IS_ERR_OR_NULL(hndl))
> +		return;
> +
> +	ret = blocking_notifier_call_chain(&hndl->chain,
> +					   KEY_XTRACT_EVT_ID(evt_key),
> +					   report);
> +	/* Notifiers are NOT supposed to cut the chain ... */
> +	WARN_ON_ONCE(ret & NOTIFY_STOP_MASK);
> +
> +	scmi_put_active_handler(ni, hndl);
> +}
> +
> +/**
> + * scmi_process_event_header  - Dequeue and process an event header
> + *
> + * Read an event header from the protocol queue into the dedicated scratch
> + * buffer and looks for a matching registered event; in case an anomalously
> + * sized read is detected just flush the queue.
> + *
> + * @eq: The queue to use
> + * @pd: The protocol descriptor to use
> + *
> + * Returns:
> + *  - a reference to the matching registered event when found
> + *  - ERR_PTR(-EINVAL) when NO registered event could be found
> + *  - NULL when the queue is empty
> + */
> +static inline struct scmi_registered_event *
> +scmi_process_event_header(struct events_queue *eq,
> +			  struct scmi_registered_protocol_events_desc *pd)
> +{
> +	unsigned int outs;
> +	struct scmi_registered_event *r_evt;
> +
> +	outs = kfifo_out(&eq->kfifo, pd->eh,
> +			 sizeof(struct scmi_event_header));
> +	if (!outs)
> +		return NULL;
> +	if (outs != sizeof(struct scmi_event_header)) {
> +		pr_err("SCMI Notifications: corrupted EVT header. Flush.\n");
> +		kfifo_reset_out(&eq->kfifo);
> +		return NULL;
> +	}
> +
> +	r_evt = SCMI_GET_REVT_FROM_PD(pd, pd->eh->evt_id);
> +	if (!r_evt)
> +		r_evt = ERR_PTR(-EINVAL);
> +
> +	return r_evt;
> +}
> +
> +/**
> + * scmi_process_event_payload  - Dequeue and process an event payload
> + *
> + * Read an event payload from the protocol queue into the dedicated scratch
> + * buffer, fills a custom report and then look for matching event handlers and
> + * call them; skip any unknown event (as marked by scmi_process_event_header())
> + * and in case an anomalously sized read is detected just flush the queue.
> + *
> + * @eq: The queue to use
> + * @pd: The protocol descriptor to use
> + * @r_evt: The registered event descriptor to use
> + *
> + * Return: False when the queue is empty
> + */
> +static inline bool
> +scmi_process_event_payload(struct events_queue *eq,
> +			   struct scmi_registered_protocol_events_desc *pd,
> +			   struct scmi_registered_event *r_evt)
> +{
> +	u32 src_id, key;
> +	unsigned int outs;
> +	void *report = NULL;
> +
> +	outs = kfifo_out(&eq->kfifo, pd->eh->payld, pd->eh->payld_sz);
> +	if (unlikely(!outs))
> +		return false;
> +
> +	/* Any in-flight event has now been officially processed */
> +	pd->in_flight = NULL;
> +
> +	if (unlikely(outs != pd->eh->payld_sz)) {
> +		pr_err("SCMI Notifications: corrupted EVT Payload. Flush.\n");
> +		kfifo_reset_out(&eq->kfifo);
> +		return false;
> +	}
> +
> +	if (IS_ERR(r_evt)) {
> +		pr_warn("SCMI Notifications: SKIP UNKNOWN EVT - proto:%X  evt:%d\n",
> +			pd->id, pd->eh->evt_id);
> +		return true;
> +	}
> +
> +	report = REVT_FILL_REPORT(r_evt, pd->eh->evt_id, pd->eh->timestamp,
> +				  pd->eh->payld, pd->eh->payld_sz,
> +				  r_evt->report, &src_id);
> +	if (!report) {
> +		pr_err("SCMI Notifications: Report not available - proto:%X  evt:%d\n",
> +		       pd->id, pd->eh->evt_id);
> +		return true;
> +	}
> +
> +	/* At first search for a generic ALL src_ids handler... */
> +	key = MAKE_ALL_SRCS_KEY(pd->id, pd->eh->evt_id);
> +	scmi_lookup_and_call_event_chain(pd->ni, key, report);
> +
> +	/* ...then search for any specific src_id */
> +	key = MAKE_HASH_KEY(pd->id, pd->eh->evt_id, src_id);
> +	scmi_lookup_and_call_event_chain(pd->ni, key, report);
> +
> +	return true;
> +}
> +
> +/**
> + * scmi_events_dispatcher  - Common worker logic for all work items.
> + *
> + *  1. dequeue one pending RX notification (queued in SCMI RX ISR context)
> + *  2. generate a custom event report from the received event message
> + *  3. lookup for any registered ALL_SRC_IDs handler
> + *     - > call the related notification chain passing in the report
> + *  4. lookup for any registered specific SRC_ID handler
> + *     - > call the related notification chain passing in the report
> + *
> + * Note that:
> + * - a dedicated per-protocol kfifo queue is used: in this way an anomalous
> + *   flood of events cannot saturate other protocols' queues.
> + *
> + * - each per-protocol queue is associated to a distinct work_item, which
> + *   means, in turn, that:
> + *   + all protocols can process their dedicated queues concurrently
> + *     (since notify_wq:max_active != 1)
> + *   + anyway at most one worker instance is allowed to run on the same queue
> + *     concurrently: this ensures that we can have only one concurrent
> + *     reader/writer on the associated kfifo, so that we can use it lock-less
> + *
> + * @work: The work item to use, which is associated to a dedicated events_queue
> + */
> +static void scmi_events_dispatcher(struct work_struct *work)
> +{
> +	struct events_queue *eq;
> +	struct scmi_registered_protocol_events_desc *pd;
> +	struct scmi_registered_event *r_evt;
> +
> +	eq = container_of(work, struct events_queue, notify_work);
> +	pd = container_of(eq, struct scmi_registered_protocol_events_desc,
> +			  equeue);
> +	/*
> +	 * In order to keep the queue lock-less and the number of memcopies
> +	 * to the bare minimum needed, the dispatcher accounts for the
> +	 * possibility of per-protocol in-flight events: i.e. an event whose
> +	 * reception could end up being split across two subsequent runs of this
> +	 * worker, first the header, then the payload.
> +	 */
> +	do {
> +		if (likely(!pd->in_flight)) {
> +			r_evt = scmi_process_event_header(eq, pd);
> +			if (!r_evt)
> +				break;
> +			pd->in_flight = r_evt;
> +		} else {
> +			r_evt = pd->in_flight;
> +		}
> +	} while (scmi_process_event_payload(eq, pd, r_evt));
> +}
> +
> +/**
> + * scmi_notify  - Queues a notification for further deferred processing
> + *
> + * This is called in interrupt context to queue a received event for
> + * deferred processing.
> + *
> + * @handle: The handle identifying the platform instance from which the
> + *	    dispatched event is generated
> + * @proto_id: Protocol ID
> + * @evt_id: Event ID (msgID)
> + * @buf: Event Message Payload (without the header)
> + * @len: Event Message Payload size
> + * @ts: RX Timestamp in nanoseconds (boottime)
> + *
> + * Return: 0 on Success
> + */
> +int scmi_notify(const struct scmi_handle *handle, u8 proto_id, u8 evt_id,
> +		const void *buf, size_t len, u64 ts)
> +{
> +	struct scmi_registered_event *r_evt;
> +	struct scmi_event_header eh;
> +	struct scmi_notify_instance *ni = handle->notify_priv;
> +
> +	/* Ensure atomic value is updated */
> +	smp_mb__before_atomic();
> +	if (unlikely(!atomic_read(&ni->enabled)))
> +		return 0;
> +
> +	r_evt = SCMI_GET_REVT(ni, proto_id, evt_id);
> +	if (unlikely(!r_evt))
> +		return -EINVAL;
> +
> +	if (unlikely(len > r_evt->evt->max_payld_sz)) {
> +		pr_err("SCMI Notifications: discard badly sized message\n");
> +		return -EINVAL;
> +	}
> +	if (unlikely(kfifo_avail(&r_evt->proto->equeue.kfifo) <
> +		     sizeof(eh) + len)) {
> +		pr_warn("SCMI Notifications: queue full dropping proto_id:%d  evt_id:%d  ts:%lld\n",
> +			proto_id, evt_id, ts);
> +		return -ENOMEM;
> +	}
> +
> +	eh.timestamp = ts;
> +	eh.evt_id = evt_id;
> +	eh.payld_sz = len;
> +	kfifo_in(&r_evt->proto->equeue.kfifo, &eh, sizeof(eh));

I'd add a comment that this potential race here is the reason (I think) for all
the inflight handling above.

Either that or create a kfifo_in_pair_unsafe that just makes these atomic by only
updating the kfifo->in point after adding both parts.

It will be as simple as (I think, kfifo magic always give me a headache).
{
	struct __kfifo *__kfifo = &kfifo->kfifo;
	kfifo_copy_in(fifo, &eh, sizeof(eh), fifo->in);
	kfifo_copy_in(fifo, &buf, len, fifo->in + sizeof(eh));
	fifo->in += len + sizeof(eh);
}

It's unsafe because crazy things will happen if there isn't enough room, but you
can't get there in this code because of the check above and we are making
horrendous assumptions about the kfifo type.

> +	kfifo_in(&r_evt->proto->equeue.kfifo, buf, len);
> +	queue_work(r_evt->proto->equeue.wq,
> +		   &r_evt->proto->equeue.notify_work);
> +
> +	return 0;
> +}
> +
>  /**
>   * scmi_initialize_events_queue  - Allocate/Initialize a kfifo buffer
>   *
> @@ -332,12 +604,21 @@ static void scmi_put_handler_unlocked(struct scmi_notify_instance *ni,
>  static int scmi_initialize_events_queue(struct scmi_notify_instance *ni,
>  					struct events_queue *equeue, size_t sz)
>  {
> +	int ret = 0;

ret looks to be always initialized below.

> +
>  	equeue->qbuf = devm_kzalloc(ni->handle->dev, sz, GFP_KERNEL);
>  	if (!equeue->qbuf)
>  		return -ENOMEM;
>  	equeue->sz = sz;
>  
> -	return kfifo_init(&equeue->kfifo, equeue->qbuf, equeue->sz);
> +	ret = kfifo_init(&equeue->kfifo, equeue->qbuf, equeue->sz);
> +	if (ret)
> +		return ret;
> +
> +	INIT_WORK(&equeue->notify_work, scmi_events_dispatcher);
> +	equeue->wq = ni->notify_wq;
> +
> +	return ret;
>  }
>  
>  /**
> @@ -740,6 +1021,38 @@ scmi_get_or_create_handler(struct scmi_notify_instance *ni, u32 evt_key)
>  	return __scmi_event_handler_get_ops(ni, evt_key, true);
>  }
>  
> +/**
> + * scmi_get_active_handler  - Helper to get active handlers only
> + *
> + * Search for the desired handler matching the key only in the per-protocol
> + * table of registered handlers: this is called only from the dispatching path
> + * so want to be as quick as possible and do not care about pending.
> + *
> + * @ni: A reference to the notification instance to use
> + * @evt_key: The event key to use
> + *
> + * Return: A properly refcounted active handler
> + */
> +static struct scmi_event_handler *
> +scmi_get_active_handler(struct scmi_notify_instance *ni, u32 evt_key)
> +{
> +	struct scmi_registered_event *r_evt;
> +	struct scmi_event_handler *hndl = NULL;
> +
> +	r_evt = SCMI_GET_REVT(ni, KEY_XTRACT_PROTO_ID(evt_key),
> +			      KEY_XTRACT_EVT_ID(evt_key));
> +	if (likely(r_evt)) {
> +		mutex_lock(&r_evt->proto->registered_mtx);
> +		hndl = KEY_FIND(r_evt->proto->registered_events_handlers,
> +				hndl, evt_key);
> +		if (likely(hndl))
> +			refcount_inc(&hndl->users);
> +		mutex_unlock(&r_evt->proto->registered_mtx);
> +	}
> +
> +	return hndl;
> +}
> +
>  /**
>   * __scmi_enable_evt  - Enable/disable events generation
>   *
> @@ -861,6 +1174,16 @@ static void scmi_put_handler(struct scmi_notify_instance *ni,
>  	mutex_unlock(&ni->pending_mtx);
>  }
>  
> +static void scmi_put_active_handler(struct scmi_notify_instance *ni,
> +					  struct scmi_event_handler *hndl)
> +{
> +	struct scmi_registered_event *r_evt = hndl->r_evt;
> +
> +	mutex_lock(&r_evt->proto->registered_mtx);
> +	scmi_put_handler_unlocked(ni, hndl);
> +	mutex_unlock(&r_evt->proto->registered_mtx);
> +}
> +
>  /**
>   * scmi_event_handler_enable_events  - Enable events associated to an handler
>   *
> @@ -1087,6 +1410,12 @@ int scmi_notification_init(struct scmi_handle *handle)
>  	ni->gid = gid;
>  	ni->handle = handle;
>  
> +	ni->notify_wq = alloc_workqueue("scmi_notify",
> +					WQ_UNBOUND | WQ_FREEZABLE | WQ_SYSFS,
> +					0);
> +	if (!ni->notify_wq)
> +		goto err;
> +
>  	ni->registered_protocols = devm_kcalloc(handle->dev, SCMI_MAX_PROTO,
>  						sizeof(char *), GFP_KERNEL);
>  	if (!ni->registered_protocols)
> @@ -1133,6 +1462,9 @@ void scmi_notification_exit(struct scmi_handle *handle)
>  	/* Ensure atomic values are updated */
>  	smp_mb__after_atomic();
>  
> +	/* Destroy while letting pending work complete */
> +	destroy_workqueue(ni->notify_wq);
> +
>  	devres_release_group(ni->handle->dev, ni->gid);
>  
>  	pr_info("SCMI Notifications Core Shutdown.\n");
> diff --git a/drivers/firmware/arm_scmi/notify.h b/drivers/firmware/arm_scmi/notify.h
> index f765acda2311..6cd386649d5a 100644
> --- a/drivers/firmware/arm_scmi/notify.h
> +++ b/drivers/firmware/arm_scmi/notify.h
> @@ -51,10 +51,17 @@ struct scmi_event {
>   *			using the proper custom protocol commands.
>   *			Return true if at least one the required src_id
>   *			has been successfully enabled/disabled
> + * @fill_custom_report: fills a custom event report from the provided
> + *			event message payld identifying the event
> + *			specific src_id.
> + *			Return NULL on failure otherwise @report now fully
> + *			populated
>   */
>  struct scmi_protocol_event_ops {
>  	bool (*set_notify_enabled)(const struct scmi_handle *handle,
>  				   u8 evt_id, u32 src_id, bool enabled);
> +	void *(*fill_custom_report)(u8 evt_id, u64 timestamp, const void *payld,
> +				    size_t payld_sz, void *report, u32 *src_id);
>  };
>  
>  int scmi_notification_init(struct scmi_handle *handle);
> @@ -65,5 +72,7 @@ int scmi_register_protocol_events(const struct scmi_handle *handle,
>  				  const struct scmi_protocol_event_ops *ops,
>  				  const struct scmi_event *evt, int num_events,
>  				  int num_sources);
> +int scmi_notify(const struct scmi_handle *handle, u8 proto_id, u8 evt_id,
> +		const void *buf, size_t len, u64 ts);
>  
>  #endif /* _SCMI_NOTIFY_H */
Cristian Marussi March 9, 2020, 4:37 p.m. UTC | #2
Hi

On 09/03/2020 12:26, Jonathan Cameron wrote:
> On Wed, 4 Mar 2020 16:25:52 +0000
> Cristian Marussi <cristian.marussi@arm.com> wrote:
> 
>> Add core SCMI Notifications dispatch and delivery support logic which is
>> able, at first, to dispatch well-known received events from the RX ISR to
>> the dedicated deferred worker, and then, from there, to final deliver the
>> events to the registered users' callbacks.
>>
>> Dispatch and delivery is just added here, still not enabled.
>>
>> Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
> 
> Hmm.  Doing that magic in_flight stuff looks fine, but it feels like
> the wrong way to approach a problem which is down to the lack of
> atomicity of the kfifo_in pair.   Could we just make that atomic via
> a bit of custom manipulation of the kfifo?
> 
> The snag is that stuff isn't exported from the innards of kfifo...

My initial approach till v3 was to collate header and payload in a pre-allocated
scratch buffer and then doing a single kfifo_in so as to avoid to worry about workqueue
empting the kfifo and going to sleep right after a header has been read and a payload is
in flight, but that, as pointed out indirectly by Jim Quinlan led to an unneeded memcpy...
in fact I was copying in/out the fifo a total of 2*h + 3*p bytes, instead with this handling
I can avoid such intermediate collation step and stick to the bare minimum needed 2*h + 2*p
bytes memcopies.

On one side I was worried to make the code complex to avoid just a few bytes of memcpy, on the other
side the redundant memcpy is on the ISR side and also I cannot assume that the unneded p bytes
copied there are necessarily small ... being SMCI extensible you could possibly add a proprietary
(or not) protocol with jumbo payloads of KBs so that the p-bytes redundant copy is no more so
negligible.

At the end I did not find so horrible and complex the new in flight handling (tested introducing
horrible mdelays in between the kfifo_inS inside the ISR...), so I went for that.

> 
> Maybe what you have here is the best option.
> 

I like the solution you propose down below, but the fact that it relies on the inner kfifo function
is in fact a show stopper being based on the inernal api (and I have not found other viable ways to
abuse the kfifo API :D ... as of now)...I wonder if it is not worth propose upstream (not in this series)
a generic kfifo "light" scatter/gather in/out interface for this particular usecase; _kfifo_dma* seem to use
the full fledged scatter/gather kernel structs, but that's certainly overkill for this scenario.

Thanks

Cristian

> Jonathan
> 
>> ---
>> V3 --> V4
>> - dispatcher now handles dequeuing of events in chunks (header+payload):
>>   handling of these in_flight events let us remove one unneeded memcpy
>>   on RX interrupt path (scmi_notify)
>> - deferred dispatcher now access their own per-protocol handlers' table
>>   reducing locking contention on the RX path
>> V2 --> V3
>> - exposing wq in sysfs via WQ_SYSFS
>> V1 --> V2
>> - splitted out of V1 patch 04
>> - moved from IDR maps to real HashTables to store event_handlers
>> - simplified delivery logic
>> ---
>>  drivers/firmware/arm_scmi/notify.c | 334 ++++++++++++++++++++++++++++-
>>  drivers/firmware/arm_scmi/notify.h |   9 +
>>  2 files changed, 342 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/firmware/arm_scmi/notify.c b/drivers/firmware/arm_scmi/notify.c
>> index d6c08cce3c63..0854d48d5886 100644
>> --- a/drivers/firmware/arm_scmi/notify.c
>> +++ b/drivers/firmware/arm_scmi/notify.c
>> @@ -44,6 +44,27 @@
>>   * as described in the SCMI Protocol specification, while src_id represents an
>>   * optional, protocol dependent, source identifier (like domain_id, perf_id
>>   * or sensor_id and so forth).
>> + *
>> + * Upon reception of a notification message from the platform the SCMI RX ISR
[snip]

>> +	if (unlikely(len > r_evt->evt->max_payld_sz)) {
>> +		pr_err("SCMI Notifications: discard badly sized message\n");
>> +		return -EINVAL;
>> +	}
>> +	if (unlikely(kfifo_avail(&r_evt->proto->equeue.kfifo) <
>> +		     sizeof(eh) + len)) {
>> +		pr_warn("SCMI Notifications: queue full dropping proto_id:%d  evt_id:%d  ts:%lld\n",
>> +			proto_id, evt_id, ts);
>> +		return -ENOMEM;
>> +	}
>> +
>> +	eh.timestamp = ts;
>> +	eh.evt_id = evt_id;
>> +	eh.payld_sz = len;
>> +	kfifo_in(&r_evt->proto->equeue.kfifo, &eh, sizeof(eh));
> 
> I'd add a comment that this potential race here is the reason (I think) for all
> the inflight handling above.
> 
> Either that or create a kfifo_in_pair_unsafe that just makes these atomic by only
> updating the kfifo->in point after adding both parts.
> 
> It will be as simple as (I think, kfifo magic always give me a headache).
> {
> 	struct __kfifo *__kfifo = &kfifo->kfifo;
> 	kfifo_copy_in(fifo, &eh, sizeof(eh), fifo->in);
> 	kfifo_copy_in(fifo, &buf, len, fifo->in + sizeof(eh));
> 	fifo->in += len + sizeof(eh);
> }
> 
> It's unsafe because crazy things will happen if there isn't enough room, but you
> can't get there in this code because of the check above and we are making
> horrendous assumptions about the kfifo type.
> 

As said above.
>> +	kfifo_in(&r_evt->proto->equeue.kfifo, buf, len);
>> +	queue_work(r_evt->proto->equeue.wq,
>> +		   &r_evt->proto->equeue.notify_work);
>> +
>> +	return 0;
>> +}
>> +
>>  /**
>>   * scmi_initialize_events_queue  - Allocate/Initialize a kfifo buffer
>>   *
>> @@ -332,12 +604,21 @@ static void scmi_put_handler_unlocked(struct scmi_notify_instance *ni,
>>  static int scmi_initialize_events_queue(struct scmi_notify_instance *ni,
>>  					struct events_queue *equeue, size_t sz)
>>  {
>> +	int ret = 0;
> 
> ret looks to be always initialized below.
> 

Right.
>> +
>>  	equeue->qbuf = devm_kzalloc(ni->handle->dev, sz, GFP_KERNEL);
>>  	if (!equeue->qbuf)
>>  		return -ENOMEM;
>>  	equeue->sz = sz;
>>  
>> -	return kfifo_init(&equeue->kfifo, equeue->qbuf, equeue->sz);
>> +	ret = kfifo_init(&equeue->kfifo, equeue->qbuf, equeue->sz);
>> +	if (ret)
>> +		return ret;
>> +
>> +	INIT_WORK(&equeue->notify_work, scmi_events_dispatcher);
>> +	equeue->wq = ni->notify_wq;
>> +
>> +	return ret;
>>  }
>>  
>>  /**
>> @@ -740,6 +1021,38 @@ scmi_get_or_create_handler(struct scmi_notify_instance *ni, u32 evt_key)
>>  	return __scmi_event_handler_get_ops(ni, evt_key, true);
>>  }
>>  
>> +/**
>> + * scmi_get_active_handler  - Helper to get active handlers only
>> + *
>> + * Search for the desired handler matching the key only in the per-protocol
>> + * table of registered handlers: this is called only from the dispatching path
>> + * so want to be as quick as possible and do not care about pending.
>> + *
>> + * @ni: A reference to the notification instance to use
>> + * @evt_key: The event key to use
>> + *
>> + * Return: A properly refcounted active handler
>> + */
>> +static struct scmi_event_handler *
>> +scmi_get_active_handler(struct scmi_notify_instance *ni, u32 evt_key)
>> +{
>> +	struct scmi_registered_event *r_evt;
>> +	struct scmi_event_handler *hndl = NULL;
>> +
>> +	r_evt = SCMI_GET_REVT(ni, KEY_XTRACT_PROTO_ID(evt_key),
>> +			      KEY_XTRACT_EVT_ID(evt_key));
>> +	if (likely(r_evt)) {
>> +		mutex_lock(&r_evt->proto->registered_mtx);
>> +		hndl = KEY_FIND(r_evt->proto->registered_events_handlers,
>> +				hndl, evt_key);
>> +		if (likely(hndl))
>> +			refcount_inc(&hndl->users);
>> +		mutex_unlock(&r_evt->proto->registered_mtx);
>> +	}
>> +
>> +	return hndl;
>> +}
>> +
>>  /**
>>   * __scmi_enable_evt  - Enable/disable events generation
>>   *
>> @@ -861,6 +1174,16 @@ static void scmi_put_handler(struct scmi_notify_instance *ni,
>>  	mutex_unlock(&ni->pending_mtx);
>>  }
>>  
>> +static void scmi_put_active_handler(struct scmi_notify_instance *ni,
>> +					  struct scmi_event_handler *hndl)
>> +{
>> +	struct scmi_registered_event *r_evt = hndl->r_evt;
>> +
>> +	mutex_lock(&r_evt->proto->registered_mtx);
>> +	scmi_put_handler_unlocked(ni, hndl);
>> +	mutex_unlock(&r_evt->proto->registered_mtx);
>> +}
>> +
>>  /**
>>   * scmi_event_handler_enable_events  - Enable events associated to an handler
>>   *
>> @@ -1087,6 +1410,12 @@ int scmi_notification_init(struct scmi_handle *handle)
>>  	ni->gid = gid;
>>  	ni->handle = handle;
>>  
>> +	ni->notify_wq = alloc_workqueue("scmi_notify",
>> +					WQ_UNBOUND | WQ_FREEZABLE | WQ_SYSFS,
>> +					0);
>> +	if (!ni->notify_wq)
>> +		goto err;
>> +
>>  	ni->registered_protocols = devm_kcalloc(handle->dev, SCMI_MAX_PROTO,
>>  						sizeof(char *), GFP_KERNEL);
>>  	if (!ni->registered_protocols)
>> @@ -1133,6 +1462,9 @@ void scmi_notification_exit(struct scmi_handle *handle)
>>  	/* Ensure atomic values are updated */
>>  	smp_mb__after_atomic();
>>  
>> +	/* Destroy while letting pending work complete */
>> +	destroy_workqueue(ni->notify_wq);
>> +
>>  	devres_release_group(ni->handle->dev, ni->gid);
>>  
>>  	pr_info("SCMI Notifications Core Shutdown.\n");
>> diff --git a/drivers/firmware/arm_scmi/notify.h b/drivers/firmware/arm_scmi/notify.h
>> index f765acda2311..6cd386649d5a 100644
>> --- a/drivers/firmware/arm_scmi/notify.h
>> +++ b/drivers/firmware/arm_scmi/notify.h
>> @@ -51,10 +51,17 @@ struct scmi_event {
>>   *			using the proper custom protocol commands.
>>   *			Return true if at least one the required src_id
>>   *			has been successfully enabled/disabled
>> + * @fill_custom_report: fills a custom event report from the provided
>> + *			event message payld identifying the event
>> + *			specific src_id.
>> + *			Return NULL on failure otherwise @report now fully
>> + *			populated
>>   */
>>  struct scmi_protocol_event_ops {
>>  	bool (*set_notify_enabled)(const struct scmi_handle *handle,
>>  				   u8 evt_id, u32 src_id, bool enabled);
>> +	void *(*fill_custom_report)(u8 evt_id, u64 timestamp, const void *payld,
>> +				    size_t payld_sz, void *report, u32 *src_id);
>>  };
>>  
>>  int scmi_notification_init(struct scmi_handle *handle);
>> @@ -65,5 +72,7 @@ int scmi_register_protocol_events(const struct scmi_handle *handle,
>>  				  const struct scmi_protocol_event_ops *ops,
>>  				  const struct scmi_event *evt, int num_events,
>>  				  int num_sources);
>> +int scmi_notify(const struct scmi_handle *handle, u8 proto_id, u8 evt_id,
>> +		const void *buf, size_t len, u64 ts);
>>  
>>  #endif /* _SCMI_NOTIFY_H */
> 
>
Jonathan Cameron March 10, 2020, 10:01 a.m. UTC | #3
On Mon, 9 Mar 2020 16:37:53 +0000
Cristian Marussi <cristian.marussi@arm.com> wrote:

> Hi
> 
> On 09/03/2020 12:26, Jonathan Cameron wrote:
> > On Wed, 4 Mar 2020 16:25:52 +0000
> > Cristian Marussi <cristian.marussi@arm.com> wrote:
> >   
> >> Add core SCMI Notifications dispatch and delivery support logic which is
> >> able, at first, to dispatch well-known received events from the RX ISR to
> >> the dedicated deferred worker, and then, from there, to final deliver the
> >> events to the registered users' callbacks.
> >>
> >> Dispatch and delivery is just added here, still not enabled.
> >>
> >> Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>  
> > 
> > Hmm.  Doing that magic in_flight stuff looks fine, but it feels like
> > the wrong way to approach a problem which is down to the lack of
> > atomicity of the kfifo_in pair.   Could we just make that atomic via
> > a bit of custom manipulation of the kfifo?
> > 
> > The snag is that stuff isn't exported from the innards of kfifo...  
> 
> My initial approach till v3 was to collate header and payload in a pre-allocated
> scratch buffer and then doing a single kfifo_in so as to avoid to worry about workqueue
> empting the kfifo and going to sleep right after a header has been read and a payload is
> in flight, but that, as pointed out indirectly by Jim Quinlan led to an unneeded memcpy...
> in fact I was copying in/out the fifo a total of 2*h + 3*p bytes, instead with this handling
> I can avoid such intermediate collation step and stick to the bare minimum needed 2*h + 2*p
> bytes memcopies.
> 
> On one side I was worried to make the code complex to avoid just a few bytes of memcpy, on the other
> side the redundant memcpy is on the ISR side and also I cannot assume that the unneded p bytes
> copied there are necessarily small ... being SMCI extensible you could possibly add a proprietary
> (or not) protocol with jumbo payloads of KBs so that the p-bytes redundant copy is no more so
> negligible.
> 
> At the end I did not find so horrible and complex the new in flight handling (tested introducing
> horrible mdelays in between the kfifo_inS inside the ISR...), so I went for that.
> 
> > 
> > Maybe what you have here is the best option.
> >   
> 
> I like the solution you propose down below, but the fact that it relies on the inner kfifo function
> is in fact a show stopper being based on the inernal api (and I have not found other viable ways to
> abuse the kfifo API :D ... as of now)...I wonder if it is not worth propose upstream (not in this series)
> a generic kfifo "light" scatter/gather in/out interface for this particular usecase; _kfifo_dma* seem to use
> the full fledged scatter/gather kernel structs, but that's certainly overkill for this scenario.

Seems sensible to me.  Whether via scatterlists is needed, I'm not sure.
Seems to me that most usecases will be header / payload like you have here.

The fiddly stuff in kfifo is always dealing with handling both variable and fixed
size versions.  Here we probably just want to reject the fixed size option at
compile time as it doesn't make sense.

Certainly worth exploring if the kfifo maintainers will allow this sort of interface.

Jonathan

> 
> Thanks
> 
> Cristian
> 
> > Jonathan
> >   
> >> ---
> >> V3 --> V4
> >> - dispatcher now handles dequeuing of events in chunks (header+payload):
> >>   handling of these in_flight events let us remove one unneeded memcpy
> >>   on RX interrupt path (scmi_notify)
> >> - deferred dispatcher now access their own per-protocol handlers' table
> >>   reducing locking contention on the RX path
> >> V2 --> V3
> >> - exposing wq in sysfs via WQ_SYSFS
> >> V1 --> V2
> >> - splitted out of V1 patch 04
> >> - moved from IDR maps to real HashTables to store event_handlers
> >> - simplified delivery logic
> >> ---
> >>  drivers/firmware/arm_scmi/notify.c | 334 ++++++++++++++++++++++++++++-
> >>  drivers/firmware/arm_scmi/notify.h |   9 +
> >>  2 files changed, 342 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/drivers/firmware/arm_scmi/notify.c b/drivers/firmware/arm_scmi/notify.c
> >> index d6c08cce3c63..0854d48d5886 100644
> >> --- a/drivers/firmware/arm_scmi/notify.c
> >> +++ b/drivers/firmware/arm_scmi/notify.c
> >> @@ -44,6 +44,27 @@
> >>   * as described in the SCMI Protocol specification, while src_id represents an
> >>   * optional, protocol dependent, source identifier (like domain_id, perf_id
> >>   * or sensor_id and so forth).
> >> + *
> >> + * Upon reception of a notification message from the platform the SCMI RX ISR  
> [snip]
> 
> >> +	if (unlikely(len > r_evt->evt->max_payld_sz)) {
> >> +		pr_err("SCMI Notifications: discard badly sized message\n");
> >> +		return -EINVAL;
> >> +	}
> >> +	if (unlikely(kfifo_avail(&r_evt->proto->equeue.kfifo) <
> >> +		     sizeof(eh) + len)) {
> >> +		pr_warn("SCMI Notifications: queue full dropping proto_id:%d  evt_id:%d  ts:%lld\n",
> >> +			proto_id, evt_id, ts);
> >> +		return -ENOMEM;
> >> +	}
> >> +
> >> +	eh.timestamp = ts;
> >> +	eh.evt_id = evt_id;
> >> +	eh.payld_sz = len;
> >> +	kfifo_in(&r_evt->proto->equeue.kfifo, &eh, sizeof(eh));  
> > 
> > I'd add a comment that this potential race here is the reason (I think) for all
> > the inflight handling above.
> > 
> > Either that or create a kfifo_in_pair_unsafe that just makes these atomic by only
> > updating the kfifo->in point after adding both parts.
> > 
> > It will be as simple as (I think, kfifo magic always give me a headache).
> > {
> > 	struct __kfifo *__kfifo = &kfifo->kfifo;
> > 	kfifo_copy_in(fifo, &eh, sizeof(eh), fifo->in);
> > 	kfifo_copy_in(fifo, &buf, len, fifo->in + sizeof(eh));
> > 	fifo->in += len + sizeof(eh);
> > }
> > 
> > It's unsafe because crazy things will happen if there isn't enough room, but you
> > can't get there in this code because of the check above and we are making
> > horrendous assumptions about the kfifo type.
> >   
> 
> As said above.
> >> +	kfifo_in(&r_evt->proto->equeue.kfifo, buf, len);
> >> +	queue_work(r_evt->proto->equeue.wq,
> >> +		   &r_evt->proto->equeue.notify_work);
> >> +
> >> +	return 0;
> >> +}
> >> +
> >>  /**
> >>   * scmi_initialize_events_queue  - Allocate/Initialize a kfifo buffer
> >>   *
> >> @@ -332,12 +604,21 @@ static void scmi_put_handler_unlocked(struct scmi_notify_instance *ni,
> >>  static int scmi_initialize_events_queue(struct scmi_notify_instance *ni,
> >>  					struct events_queue *equeue, size_t sz)
> >>  {
> >> +	int ret = 0;  
> > 
> > ret looks to be always initialized below.
> >   
> 
> Right.
> >> +
> >>  	equeue->qbuf = devm_kzalloc(ni->handle->dev, sz, GFP_KERNEL);
> >>  	if (!equeue->qbuf)
> >>  		return -ENOMEM;
> >>  	equeue->sz = sz;
> >>  
> >> -	return kfifo_init(&equeue->kfifo, equeue->qbuf, equeue->sz);
> >> +	ret = kfifo_init(&equeue->kfifo, equeue->qbuf, equeue->sz);
> >> +	if (ret)
> >> +		return ret;
> >> +
> >> +	INIT_WORK(&equeue->notify_work, scmi_events_dispatcher);
> >> +	equeue->wq = ni->notify_wq;
> >> +
> >> +	return ret;
> >>  }
> >>  
> >>  /**
> >> @@ -740,6 +1021,38 @@ scmi_get_or_create_handler(struct scmi_notify_instance *ni, u32 evt_key)
> >>  	return __scmi_event_handler_get_ops(ni, evt_key, true);
> >>  }
> >>  
> >> +/**
> >> + * scmi_get_active_handler  - Helper to get active handlers only
> >> + *
> >> + * Search for the desired handler matching the key only in the per-protocol
> >> + * table of registered handlers: this is called only from the dispatching path
> >> + * so want to be as quick as possible and do not care about pending.
> >> + *
> >> + * @ni: A reference to the notification instance to use
> >> + * @evt_key: The event key to use
> >> + *
> >> + * Return: A properly refcounted active handler
> >> + */
> >> +static struct scmi_event_handler *
> >> +scmi_get_active_handler(struct scmi_notify_instance *ni, u32 evt_key)
> >> +{
> >> +	struct scmi_registered_event *r_evt;
> >> +	struct scmi_event_handler *hndl = NULL;
> >> +
> >> +	r_evt = SCMI_GET_REVT(ni, KEY_XTRACT_PROTO_ID(evt_key),
> >> +			      KEY_XTRACT_EVT_ID(evt_key));
> >> +	if (likely(r_evt)) {
> >> +		mutex_lock(&r_evt->proto->registered_mtx);
> >> +		hndl = KEY_FIND(r_evt->proto->registered_events_handlers,
> >> +				hndl, evt_key);
> >> +		if (likely(hndl))
> >> +			refcount_inc(&hndl->users);
> >> +		mutex_unlock(&r_evt->proto->registered_mtx);
> >> +	}
> >> +
> >> +	return hndl;
> >> +}
> >> +
> >>  /**
> >>   * __scmi_enable_evt  - Enable/disable events generation
> >>   *
> >> @@ -861,6 +1174,16 @@ static void scmi_put_handler(struct scmi_notify_instance *ni,
> >>  	mutex_unlock(&ni->pending_mtx);
> >>  }
> >>  
> >> +static void scmi_put_active_handler(struct scmi_notify_instance *ni,
> >> +					  struct scmi_event_handler *hndl)
> >> +{
> >> +	struct scmi_registered_event *r_evt = hndl->r_evt;
> >> +
> >> +	mutex_lock(&r_evt->proto->registered_mtx);
> >> +	scmi_put_handler_unlocked(ni, hndl);
> >> +	mutex_unlock(&r_evt->proto->registered_mtx);
> >> +}
> >> +
> >>  /**
> >>   * scmi_event_handler_enable_events  - Enable events associated to an handler
> >>   *
> >> @@ -1087,6 +1410,12 @@ int scmi_notification_init(struct scmi_handle *handle)
> >>  	ni->gid = gid;
> >>  	ni->handle = handle;
> >>  
> >> +	ni->notify_wq = alloc_workqueue("scmi_notify",
> >> +					WQ_UNBOUND | WQ_FREEZABLE | WQ_SYSFS,
> >> +					0);
> >> +	if (!ni->notify_wq)
> >> +		goto err;
> >> +
> >>  	ni->registered_protocols = devm_kcalloc(handle->dev, SCMI_MAX_PROTO,
> >>  						sizeof(char *), GFP_KERNEL);
> >>  	if (!ni->registered_protocols)
> >> @@ -1133,6 +1462,9 @@ void scmi_notification_exit(struct scmi_handle *handle)
> >>  	/* Ensure atomic values are updated */
> >>  	smp_mb__after_atomic();
> >>  
> >> +	/* Destroy while letting pending work complete */
> >> +	destroy_workqueue(ni->notify_wq);
> >> +
> >>  	devres_release_group(ni->handle->dev, ni->gid);
> >>  
> >>  	pr_info("SCMI Notifications Core Shutdown.\n");
> >> diff --git a/drivers/firmware/arm_scmi/notify.h b/drivers/firmware/arm_scmi/notify.h
> >> index f765acda2311..6cd386649d5a 100644
> >> --- a/drivers/firmware/arm_scmi/notify.h
> >> +++ b/drivers/firmware/arm_scmi/notify.h
> >> @@ -51,10 +51,17 @@ struct scmi_event {
> >>   *			using the proper custom protocol commands.
> >>   *			Return true if at least one the required src_id
> >>   *			has been successfully enabled/disabled
> >> + * @fill_custom_report: fills a custom event report from the provided
> >> + *			event message payld identifying the event
> >> + *			specific src_id.
> >> + *			Return NULL on failure otherwise @report now fully
> >> + *			populated
> >>   */
> >>  struct scmi_protocol_event_ops {
> >>  	bool (*set_notify_enabled)(const struct scmi_handle *handle,
> >>  				   u8 evt_id, u32 src_id, bool enabled);
> >> +	void *(*fill_custom_report)(u8 evt_id, u64 timestamp, const void *payld,
> >> +				    size_t payld_sz, void *report, u32 *src_id);
> >>  };
> >>  
> >>  int scmi_notification_init(struct scmi_handle *handle);
> >> @@ -65,5 +72,7 @@ int scmi_register_protocol_events(const struct scmi_handle *handle,
> >>  				  const struct scmi_protocol_event_ops *ops,
> >>  				  const struct scmi_event *evt, int num_events,
> >>  				  int num_sources);
> >> +int scmi_notify(const struct scmi_handle *handle, u8 proto_id, u8 evt_id,
> >> +		const void *buf, size_t len, u64 ts);
> >>  
> >>  #endif /* _SCMI_NOTIFY_H */  
> > 
> >   
>
Lukasz Luba March 12, 2020, 1:51 p.m. UTC | #4
Hi Cristian,

just one comment below...

On 3/4/20 4:25 PM, Cristian Marussi wrote:
> Add core SCMI Notifications dispatch and delivery support logic which is
> able, at first, to dispatch well-known received events from the RX ISR to
> the dedicated deferred worker, and then, from there, to final deliver the
> events to the registered users' callbacks.
> 
> Dispatch and delivery is just added here, still not enabled.
> 
> Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
> ---
> V3 --> V4
> - dispatcher now handles dequeuing of events in chunks (header+payload):
>    handling of these in_flight events let us remove one unneeded memcpy
>    on RX interrupt path (scmi_notify)
> - deferred dispatcher now access their own per-protocol handlers' table
>    reducing locking contention on the RX path
> V2 --> V3
> - exposing wq in sysfs via WQ_SYSFS
> V1 --> V2
> - splitted out of V1 patch 04
> - moved from IDR maps to real HashTables to store event_handlers
> - simplified delivery logic
> ---
>   drivers/firmware/arm_scmi/notify.c | 334 ++++++++++++++++++++++++++++-
>   drivers/firmware/arm_scmi/notify.h |   9 +
>   2 files changed, 342 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/firmware/arm_scmi/notify.c b/drivers/firmware/arm_scmi/notify.c

[snip]

> +
> +/**
> + * scmi_notify  - Queues a notification for further deferred processing
> + *
> + * This is called in interrupt context to queue a received event for
> + * deferred processing.
> + *
> + * @handle: The handle identifying the platform instance from which the
> + *	    dispatched event is generated
> + * @proto_id: Protocol ID
> + * @evt_id: Event ID (msgID)
> + * @buf: Event Message Payload (without the header)
> + * @len: Event Message Payload size
> + * @ts: RX Timestamp in nanoseconds (boottime)
> + *
> + * Return: 0 on Success
> + */
> +int scmi_notify(const struct scmi_handle *handle, u8 proto_id, u8 evt_id,
> +		const void *buf, size_t len, u64 ts)
> +{
> +	struct scmi_registered_event *r_evt;
> +	struct scmi_event_header eh;
> +	struct scmi_notify_instance *ni = handle->notify_priv;
> +
> +	/* Ensure atomic value is updated */
> +	smp_mb__before_atomic();
> +	if (unlikely(!atomic_read(&ni->enabled)))
> +		return 0;
> +
> +	r_evt = SCMI_GET_REVT(ni, proto_id, evt_id);
> +	if (unlikely(!r_evt))
> +		return -EINVAL;
> +
> +	if (unlikely(len > r_evt->evt->max_payld_sz)) {
> +		pr_err("SCMI Notifications: discard badly sized message\n");
> +		return -EINVAL;
> +	}
> +	if (unlikely(kfifo_avail(&r_evt->proto->equeue.kfifo) <
> +		     sizeof(eh) + len)) {
> +		pr_warn("SCMI Notifications: queue full dropping proto_id:%d  evt_id:%d  ts:%lld\n",
> +			proto_id, evt_id, ts);
> +		return -ENOMEM;
> +	}
> +
> +	eh.timestamp = ts;
> +	eh.evt_id = evt_id;
> +	eh.payld_sz = len;
> +	kfifo_in(&r_evt->proto->equeue.kfifo, &eh, sizeof(eh));
> +	kfifo_in(&r_evt->proto->equeue.kfifo, buf, len);
> +	queue_work(r_evt->proto->equeue.wq,
> +		   &r_evt->proto->equeue.notify_work);

Is it safe to ignore the return value from the queue_work here?

Regards,
Lukasz
Lukasz Luba March 12, 2020, 2:06 p.m. UTC | #5
On 3/12/20 1:51 PM, Lukasz Luba wrote:
> Hi Cristian,
> 
> just one comment below...
> 
> On 3/4/20 4:25 PM, Cristian Marussi wrote:
>> Add core SCMI Notifications dispatch and delivery support logic which is
>> able, at first, to dispatch well-known received events from the RX ISR to
>> the dedicated deferred worker, and then, from there, to final deliver the
>> events to the registered users' callbacks.
>>
>> Dispatch and delivery is just added here, still not enabled.
>>
>> Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
>> ---
>> V3 --> V4
>> - dispatcher now handles dequeuing of events in chunks (header+payload):
>>    handling of these in_flight events let us remove one unneeded memcpy
>>    on RX interrupt path (scmi_notify)
>> - deferred dispatcher now access their own per-protocol handlers' table
>>    reducing locking contention on the RX path
>> V2 --> V3
>> - exposing wq in sysfs via WQ_SYSFS
>> V1 --> V2
>> - splitted out of V1 patch 04
>> - moved from IDR maps to real HashTables to store event_handlers
>> - simplified delivery logic
>> ---
>>   drivers/firmware/arm_scmi/notify.c | 334 ++++++++++++++++++++++++++++-
>>   drivers/firmware/arm_scmi/notify.h |   9 +
>>   2 files changed, 342 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/firmware/arm_scmi/notify.c 
>> b/drivers/firmware/arm_scmi/notify.c
> 
> [snip]
> 
>> +
>> +/**
>> + * scmi_notify  - Queues a notification for further deferred processing
>> + *
>> + * This is called in interrupt context to queue a received event for
>> + * deferred processing.
>> + *
>> + * @handle: The handle identifying the platform instance from which the
>> + *        dispatched event is generated
>> + * @proto_id: Protocol ID
>> + * @evt_id: Event ID (msgID)
>> + * @buf: Event Message Payload (without the header)
>> + * @len: Event Message Payload size
>> + * @ts: RX Timestamp in nanoseconds (boottime)
>> + *
>> + * Return: 0 on Success
>> + */
>> +int scmi_notify(const struct scmi_handle *handle, u8 proto_id, u8 
>> evt_id,
>> +        const void *buf, size_t len, u64 ts)
>> +{
>> +    struct scmi_registered_event *r_evt;
>> +    struct scmi_event_header eh;
>> +    struct scmi_notify_instance *ni = handle->notify_priv;
>> +
>> +    /* Ensure atomic value is updated */
>> +    smp_mb__before_atomic();
>> +    if (unlikely(!atomic_read(&ni->enabled)))
>> +        return 0;
>> +
>> +    r_evt = SCMI_GET_REVT(ni, proto_id, evt_id);
>> +    if (unlikely(!r_evt))
>> +        return -EINVAL;
>> +
>> +    if (unlikely(len > r_evt->evt->max_payld_sz)) {
>> +        pr_err("SCMI Notifications: discard badly sized message\n");
>> +        return -EINVAL;
>> +    }
>> +    if (unlikely(kfifo_avail(&r_evt->proto->equeue.kfifo) <
>> +             sizeof(eh) + len)) {
>> +        pr_warn("SCMI Notifications: queue full dropping proto_id:%d  
>> evt_id:%d  ts:%lld\n",
>> +            proto_id, evt_id, ts);
>> +        return -ENOMEM;
>> +    }
>> +
>> +    eh.timestamp = ts;
>> +    eh.evt_id = evt_id;
>> +    eh.payld_sz = len;
>> +    kfifo_in(&r_evt->proto->equeue.kfifo, &eh, sizeof(eh));
>> +    kfifo_in(&r_evt->proto->equeue.kfifo, buf, len);
>> +    queue_work(r_evt->proto->equeue.wq,
>> +           &r_evt->proto->equeue.notify_work);
> 
> Is it safe to ignore the return value from the queue_work here?

and also from the kfifo_in

> 
> Regards,
> Lukasz
> 
>
Cristian Marussi March 12, 2020, 6:34 p.m. UTC | #6
On 12/03/2020 13:51, Lukasz Luba wrote:
> Hi Cristian,
> 
> just one comment below...

Hi Lukasz

Thanks for the review

> 
> On 3/4/20 4:25 PM, Cristian Marussi wrote:
>> Add core SCMI Notifications dispatch and delivery support logic which is
>> able, at first, to dispatch well-known received events from the RX ISR to
>> the dedicated deferred worker, and then, from there, to final deliver the
>> events to the registered users' callbacks.
>>
>> Dispatch and delivery is just added here, still not enabled.
>>
>> Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
>> ---
>> V3 --> V4
>> - dispatcher now handles dequeuing of events in chunks (header+payload):
>>    handling of these in_flight events let us remove one unneeded memcpy
>>    on RX interrupt path (scmi_notify)
>> - deferred dispatcher now access their own per-protocol handlers' table
>>    reducing locking contention on the RX path
>> V2 --> V3
>> - exposing wq in sysfs via WQ_SYSFS
>> V1 --> V2
>> - splitted out of V1 patch 04
>> - moved from IDR maps to real HashTables to store event_handlers
>> - simplified delivery logic
>> ---
>>   drivers/firmware/arm_scmi/notify.c | 334 ++++++++++++++++++++++++++++-
>>   drivers/firmware/arm_scmi/notify.h |   9 +
>>   2 files changed, 342 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/firmware/arm_scmi/notify.c b/drivers/firmware/arm_scmi/notify.c
> 
> [snip]
> 
>> +
>> +/**
>> + * scmi_notify  - Queues a notification for further deferred processing
>> + *
>> + * This is called in interrupt context to queue a received event for
>> + * deferred processing.
>> + *
>> + * @handle: The handle identifying the platform instance from which the
>> + *	    dispatched event is generated
>> + * @proto_id: Protocol ID
>> + * @evt_id: Event ID (msgID)
>> + * @buf: Event Message Payload (without the header)
>> + * @len: Event Message Payload size
>> + * @ts: RX Timestamp in nanoseconds (boottime)
>> + *
>> + * Return: 0 on Success
>> + */
>> +int scmi_notify(const struct scmi_handle *handle, u8 proto_id, u8 evt_id,
>> +		const void *buf, size_t len, u64 ts)
>> +{
>> +	struct scmi_registered_event *r_evt;
>> +	struct scmi_event_header eh;
>> +	struct scmi_notify_instance *ni = handle->notify_priv;
>> +
>> +	/* Ensure atomic value is updated */
>> +	smp_mb__before_atomic();
>> +	if (unlikely(!atomic_read(&ni->enabled)))
>> +		return 0;
>> +
>> +	r_evt = SCMI_GET_REVT(ni, proto_id, evt_id);
>> +	if (unlikely(!r_evt))
>> +		return -EINVAL;
>> +
>> +	if (unlikely(len > r_evt->evt->max_payld_sz)) {
>> +		pr_err("SCMI Notifications: discard badly sized message\n");
>> +		return -EINVAL;
>> +	}
>> +	if (unlikely(kfifo_avail(&r_evt->proto->equeue.kfifo) <
>> +		     sizeof(eh) + len)) {
>> +		pr_warn("SCMI Notifications: queue full dropping proto_id:%d  evt_id:%d  ts:%lld\n",
>> +			proto_id, evt_id, ts);
>> +		return -ENOMEM;
>> +	}
>> +
>> +	eh.timestamp = ts;
>> +	eh.evt_id = evt_id;
>> +	eh.payld_sz = len;
>> +	kfifo_in(&r_evt->proto->equeue.kfifo, &eh, sizeof(eh));
>> +	kfifo_in(&r_evt->proto->equeue.kfifo, buf, len);
>> +	queue_work(r_evt->proto->equeue.wq,
>> +		   &r_evt->proto->equeue.notify_work);
> 
> Is it safe to ignore the return value from the queue_work here?
> 

In fact yes, we do not want to care: it returns true or false depending on the
fact that the specific work was or not already queued, and we just rely on
this behavior to keep kicking the worker only when needed but never kick
more than one instance of it per-queue (so that there's only one reader
wq and one writer here in the scmi_notify)...explaining better:

1. we push an event (hdr+payld) to the protocol queue if we found that there was
enough space on the queue

2a. if at the time of the kfifo_in( ) the worker was already running
(queue not empty) it will process our new event sooner or later and here
the queue_work will return false, but we do not care in fact ... we
tried to kick it just in case

2b. if instead at the time of the kfifo_in() the queue was empty the worker would
have probably already gone to the sleep and this queue_work() will return true and
so this time it will effectively wake up the worker to process our items

The important thing here is that we are sure to wakeup the worker when needed
but we are equally sure we are never causing the scheduling of more than one worker
thread consuming from the same queue (because that would break the one reader/one writer
assumption which let us use the fifo in a lockless manner): this is possible because
queue_work checks if the required work item is already pending and in such a case backs
out returning false and we have one work_item (notify_work) defined per-protocol and
so per-queue.

Now probably I wrote too much of an explanation and confuse stuff more ... :D

Regards

Cristian

> Regards,
> Lukasz
> 
>
Cristian Marussi March 12, 2020, 7:24 p.m. UTC | #7
On 12/03/2020 14:06, Lukasz Luba wrote:
> 
> 
> On 3/12/20 1:51 PM, Lukasz Luba wrote:
>> Hi Cristian,
>>

Hi Lukasz

>> just one comment below...
>>
>> On 3/4/20 4:25 PM, Cristian Marussi wrote:
>>> Add core SCMI Notifications dispatch and delivery support logic which is
>>> able, at first, to dispatch well-known received events from the RX ISR to
>>> the dedicated deferred worker, and then, from there, to final deliver the
>>> events to the registered users' callbacks.
>>>
>>> Dispatch and delivery is just added here, still not enabled.
>>>
>>> Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
>>> ---
>>> V3 --> V4
>>> - dispatcher now handles dequeuing of events in chunks (header+payload):
>>>    handling of these in_flight events let us remove one unneeded memcpy
>>>    on RX interrupt path (scmi_notify)
>>> - deferred dispatcher now access their own per-protocol handlers' table
>>>    reducing locking contention on the RX path
>>> V2 --> V3
>>> - exposing wq in sysfs via WQ_SYSFS
>>> V1 --> V2
>>> - splitted out of V1 patch 04
>>> - moved from IDR maps to real HashTables to store event_handlers
>>> - simplified delivery logic
>>> ---
>>>   drivers/firmware/arm_scmi/notify.c | 334 ++++++++++++++++++++++++++++-
>>>   drivers/firmware/arm_scmi/notify.h |   9 +
>>>   2 files changed, 342 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/firmware/arm_scmi/notify.c 
>>> b/drivers/firmware/arm_scmi/notify.c
>>
>> [snip]
>>
>>> +
>>> +/**
>>> + * scmi_notify  - Queues a notification for further deferred processing
>>> + *
>>> + * This is called in interrupt context to queue a received event for
>>> + * deferred processing.
>>> + *
>>> + * @handle: The handle identifying the platform instance from which the
>>> + *        dispatched event is generated
>>> + * @proto_id: Protocol ID
>>> + * @evt_id: Event ID (msgID)
>>> + * @buf: Event Message Payload (without the header)
>>> + * @len: Event Message Payload size
>>> + * @ts: RX Timestamp in nanoseconds (boottime)
>>> + *
>>> + * Return: 0 on Success
>>> + */
>>> +int scmi_notify(const struct scmi_handle *handle, u8 proto_id, u8 
>>> evt_id,
>>> +        const void *buf, size_t len, u64 ts)
>>> +{
>>> +    struct scmi_registered_event *r_evt;
>>> +    struct scmi_event_header eh;
>>> +    struct scmi_notify_instance *ni = handle->notify_priv;
>>> +
>>> +    /* Ensure atomic value is updated */
>>> +    smp_mb__before_atomic();
>>> +    if (unlikely(!atomic_read(&ni->enabled)))
>>> +        return 0;
>>> +
>>> +    r_evt = SCMI_GET_REVT(ni, proto_id, evt_id);
>>> +    if (unlikely(!r_evt))
>>> +        return -EINVAL;
>>> +
>>> +    if (unlikely(len > r_evt->evt->max_payld_sz)) {
>>> +        pr_err("SCMI Notifications: discard badly sized message\n");
>>> +        return -EINVAL;
>>> +    }
>>> +    if (unlikely(kfifo_avail(&r_evt->proto->equeue.kfifo) <
>>> +             sizeof(eh) + len)) {
>>> +        pr_warn("SCMI Notifications: queue full dropping proto_id:%d  
>>> evt_id:%d  ts:%lld\n",
>>> +            proto_id, evt_id, ts);
>>> +        return -ENOMEM;
>>> +    }
>>> +
>>> +    eh.timestamp = ts;
>>> +    eh.evt_id = evt_id;
>>> +    eh.payld_sz = len;
>>> +    kfifo_in(&r_evt->proto->equeue.kfifo, &eh, sizeof(eh));
>>> +    kfifo_in(&r_evt->proto->equeue.kfifo, buf, len);
>>> +    queue_work(r_evt->proto->equeue.wq,
>>> +           &r_evt->proto->equeue.notify_work);
>>
>> Is it safe to ignore the return value from the queue_work here?
> 
> and also from the kfifo_in
> 

kfifo_in returns the number of effectively written bytes (using __kfifo_in),
possibly capped to the effectively maximum available space in the fifo, BUT since I
absolutely cannot afford to write an incomplete/truncated event into the queue, I check
that in advance and backout on queue full:

if (unlikely(kfifo_avail(&r_evt->proto->equeue.kfifo) < sizeof(eh) + len)) {
	return -ENOMEM;

and given that the ISR scmi_notify() is the only possible writer on this queue
I can be sure that the kfifo_in() will succeed in writing the required number of
bytes after the above check...so I don't need to check the return value.

Regards

Cristian

>>
>> Regards,
>> Lukasz
>>
>>
Lukasz Luba March 12, 2020, 8:57 p.m. UTC | #8
On 3/12/20 7:24 PM, Cristian Marussi wrote:
> On 12/03/2020 14:06, Lukasz Luba wrote:
>>
>>
>> On 3/12/20 1:51 PM, Lukasz Luba wrote:
>>> Hi Cristian,
>>>
> 
> Hi Lukasz
> 
>>> just one comment below...
>>>
>>> On 3/4/20 4:25 PM, Cristian Marussi wrote:
>>>> Add core SCMI Notifications dispatch and delivery support logic which is
>>>> able, at first, to dispatch well-known received events from the RX ISR to
>>>> the dedicated deferred worker, and then, from there, to final deliver the
>>>> events to the registered users' callbacks.
>>>>
>>>> Dispatch and delivery is just added here, still not enabled.
>>>>
>>>> Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
>>>> ---
>>>> V3 --> V4
>>>> - dispatcher now handles dequeuing of events in chunks (header+payload):
>>>>     handling of these in_flight events let us remove one unneeded memcpy
>>>>     on RX interrupt path (scmi_notify)
>>>> - deferred dispatcher now access their own per-protocol handlers' table
>>>>     reducing locking contention on the RX path
>>>> V2 --> V3
>>>> - exposing wq in sysfs via WQ_SYSFS
>>>> V1 --> V2
>>>> - splitted out of V1 patch 04
>>>> - moved from IDR maps to real HashTables to store event_handlers
>>>> - simplified delivery logic
>>>> ---
>>>>    drivers/firmware/arm_scmi/notify.c | 334 ++++++++++++++++++++++++++++-
>>>>    drivers/firmware/arm_scmi/notify.h |   9 +
>>>>    2 files changed, 342 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/drivers/firmware/arm_scmi/notify.c
>>>> b/drivers/firmware/arm_scmi/notify.c
>>>
>>> [snip]
>>>
>>>> +
>>>> +/**
>>>> + * scmi_notify  - Queues a notification for further deferred processing
>>>> + *
>>>> + * This is called in interrupt context to queue a received event for
>>>> + * deferred processing.
>>>> + *
>>>> + * @handle: The handle identifying the platform instance from which the
>>>> + *        dispatched event is generated
>>>> + * @proto_id: Protocol ID
>>>> + * @evt_id: Event ID (msgID)
>>>> + * @buf: Event Message Payload (without the header)
>>>> + * @len: Event Message Payload size
>>>> + * @ts: RX Timestamp in nanoseconds (boottime)
>>>> + *
>>>> + * Return: 0 on Success
>>>> + */
>>>> +int scmi_notify(const struct scmi_handle *handle, u8 proto_id, u8
>>>> evt_id,
>>>> +        const void *buf, size_t len, u64 ts)
>>>> +{
>>>> +    struct scmi_registered_event *r_evt;
>>>> +    struct scmi_event_header eh;
>>>> +    struct scmi_notify_instance *ni = handle->notify_priv;
>>>> +
>>>> +    /* Ensure atomic value is updated */
>>>> +    smp_mb__before_atomic();
>>>> +    if (unlikely(!atomic_read(&ni->enabled)))
>>>> +        return 0;
>>>> +
>>>> +    r_evt = SCMI_GET_REVT(ni, proto_id, evt_id);
>>>> +    if (unlikely(!r_evt))
>>>> +        return -EINVAL;
>>>> +
>>>> +    if (unlikely(len > r_evt->evt->max_payld_sz)) {
>>>> +        pr_err("SCMI Notifications: discard badly sized message\n");
>>>> +        return -EINVAL;
>>>> +    }
>>>> +    if (unlikely(kfifo_avail(&r_evt->proto->equeue.kfifo) <
>>>> +             sizeof(eh) + len)) {
>>>> +        pr_warn("SCMI Notifications: queue full dropping proto_id:%d
>>>> evt_id:%d  ts:%lld\n",
>>>> +            proto_id, evt_id, ts);
>>>> +        return -ENOMEM;
>>>> +    }
>>>> +
>>>> +    eh.timestamp = ts;
>>>> +    eh.evt_id = evt_id;
>>>> +    eh.payld_sz = len;
>>>> +    kfifo_in(&r_evt->proto->equeue.kfifo, &eh, sizeof(eh));
>>>> +    kfifo_in(&r_evt->proto->equeue.kfifo, buf, len);
>>>> +    queue_work(r_evt->proto->equeue.wq,
>>>> +           &r_evt->proto->equeue.notify_work);
>>>
>>> Is it safe to ignore the return value from the queue_work here?
>>
>> and also from the kfifo_in
>>
> 
> kfifo_in returns the number of effectively written bytes (using __kfifo_in),
> possibly capped to the effectively maximum available space in the fifo, BUT since I
> absolutely cannot afford to write an incomplete/truncated event into the queue, I check
> that in advance and backout on queue full:
> 
> if (unlikely(kfifo_avail(&r_evt->proto->equeue.kfifo) < sizeof(eh) + len)) {
> 	return -ENOMEM;
> 
> and given that the ISR scmi_notify() is the only possible writer on this queue

Yes, your are right, no other IRQ will show up for this channel till
we exit mailbox rx callback and clean the bits.

> I can be sure that the kfifo_in() will succeed in writing the required number of
> bytes after the above check...so I don't need to check the return value.
> 
> Regards
> 
> Cristian
> 
>>>
>>> Regards,
>>> Lukasz
>>>
>>>
>
Lukasz Luba March 12, 2020, 9:43 p.m. UTC | #9
On 3/12/20 6:34 PM, Cristian Marussi wrote:
> On 12/03/2020 13:51, Lukasz Luba wrote:
>> Hi Cristian,
>>
>> just one comment below...
> 
> Hi Lukasz
> 
> Thanks for the review
> 
>>
>> On 3/4/20 4:25 PM, Cristian Marussi wrote:
>>> Add core SCMI Notifications dispatch and delivery support logic which is
>>> able, at first, to dispatch well-known received events from the RX ISR to
>>> the dedicated deferred worker, and then, from there, to final deliver the
>>> events to the registered users' callbacks.
>>>
>>> Dispatch and delivery is just added here, still not enabled.
>>>
>>> Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
>>> ---
>>> V3 --> V4
>>> - dispatcher now handles dequeuing of events in chunks (header+payload):
>>>     handling of these in_flight events let us remove one unneeded memcpy
>>>     on RX interrupt path (scmi_notify)
>>> - deferred dispatcher now access their own per-protocol handlers' table
>>>     reducing locking contention on the RX path
>>> V2 --> V3
>>> - exposing wq in sysfs via WQ_SYSFS
>>> V1 --> V2
>>> - splitted out of V1 patch 04
>>> - moved from IDR maps to real HashTables to store event_handlers
>>> - simplified delivery logic
>>> ---
>>>    drivers/firmware/arm_scmi/notify.c | 334 ++++++++++++++++++++++++++++-
>>>    drivers/firmware/arm_scmi/notify.h |   9 +
>>>    2 files changed, 342 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/firmware/arm_scmi/notify.c b/drivers/firmware/arm_scmi/notify.c
>>
>> [snip]
>>
>>> +
>>> +/**
>>> + * scmi_notify  - Queues a notification for further deferred processing
>>> + *
>>> + * This is called in interrupt context to queue a received event for
>>> + * deferred processing.
>>> + *
>>> + * @handle: The handle identifying the platform instance from which the
>>> + *	    dispatched event is generated
>>> + * @proto_id: Protocol ID
>>> + * @evt_id: Event ID (msgID)
>>> + * @buf: Event Message Payload (without the header)
>>> + * @len: Event Message Payload size
>>> + * @ts: RX Timestamp in nanoseconds (boottime)
>>> + *
>>> + * Return: 0 on Success
>>> + */
>>> +int scmi_notify(const struct scmi_handle *handle, u8 proto_id, u8 evt_id,
>>> +		const void *buf, size_t len, u64 ts)
>>> +{
>>> +	struct scmi_registered_event *r_evt;
>>> +	struct scmi_event_header eh;
>>> +	struct scmi_notify_instance *ni = handle->notify_priv;
>>> +
>>> +	/* Ensure atomic value is updated */
>>> +	smp_mb__before_atomic();
>>> +	if (unlikely(!atomic_read(&ni->enabled)))
>>> +		return 0;
>>> +
>>> +	r_evt = SCMI_GET_REVT(ni, proto_id, evt_id);
>>> +	if (unlikely(!r_evt))
>>> +		return -EINVAL;
>>> +
>>> +	if (unlikely(len > r_evt->evt->max_payld_sz)) {
>>> +		pr_err("SCMI Notifications: discard badly sized message\n");
>>> +		return -EINVAL;
>>> +	}
>>> +	if (unlikely(kfifo_avail(&r_evt->proto->equeue.kfifo) <
>>> +		     sizeof(eh) + len)) {
>>> +		pr_warn("SCMI Notifications: queue full dropping proto_id:%d  evt_id:%d  ts:%lld\n",
>>> +			proto_id, evt_id, ts);
>>> +		return -ENOMEM;
>>> +	}
>>> +
>>> +	eh.timestamp = ts;
>>> +	eh.evt_id = evt_id;
>>> +	eh.payld_sz = len;
>>> +	kfifo_in(&r_evt->proto->equeue.kfifo, &eh, sizeof(eh));
>>> +	kfifo_in(&r_evt->proto->equeue.kfifo, buf, len);
>>> +	queue_work(r_evt->proto->equeue.wq,
>>> +		   &r_evt->proto->equeue.notify_work);
>>
>> Is it safe to ignore the return value from the queue_work here?
>>
> 
> In fact yes, we do not want to care: it returns true or false depending on the
> fact that the specific work was or not already queued, and we just rely on
> this behavior to keep kicking the worker only when needed but never kick
> more than one instance of it per-queue (so that there's only one reader
> wq and one writer here in the scmi_notify)...explaining better:
> 
> 1. we push an event (hdr+payld) to the protocol queue if we found that there was
> enough space on the queue
> 
> 2a. if at the time of the kfifo_in( ) the worker was already running
> (queue not empty) it will process our new event sooner or later and here
> the queue_work will return false, but we do not care in fact ... we
> tried to kick it just in case
> 
> 2b. if instead at the time of the kfifo_in() the queue was empty the worker would
> have probably already gone to the sleep and this queue_work() will return true and
> so this time it will effectively wake up the worker to process our items
> 
> The important thing here is that we are sure to wakeup the worker when needed
> but we are equally sure we are never causing the scheduling of more than one worker
> thread consuming from the same queue (because that would break the one reader/one writer
> assumption which let us use the fifo in a lockless manner): this is possible because
> queue_work checks if the required work item is already pending and in such a case backs
> out returning false and we have one work_item (notify_work) defined per-protocol and
> so per-queue.

I see. That's a good assumption: one work_item per protocol and simplify
the locking. What if there would be an edge case scenario when the
consumer (work_item) has handled the last item (there was NULL from 
scmi_process_event_header()), while in meantime scmi_notify put into
the fifo new event but couldn't kick the queue_work. Would it stay there
till the next IRQ which triggers queue_work to consume two events (one
potentially a bit old)? Or we can ignore such race situation assuming
that cleaning of work item is instant and kfifo_in is slow?

> 
> Now probably I wrote too much of an explanation and confuse stuff more ... :D

No, thank you for the detailed explanation. I will continue my review.

Regards,
Lukasz
Cristian Marussi March 16, 2020, 2:46 p.m. UTC | #10
On Thu, Mar 12, 2020 at 09:43:31PM +0000, Lukasz Luba wrote:
> 
> 
> On 3/12/20 6:34 PM, Cristian Marussi wrote:
> > On 12/03/2020 13:51, Lukasz Luba wrote:
> > > Hi Cristian,
> > > 
Hi Lukasz

> > > just one comment below...
[snip]
> > > > +	eh.timestamp = ts;
> > > > +	eh.evt_id = evt_id;
> > > > +	eh.payld_sz = len;
> > > > +	kfifo_in(&r_evt->proto->equeue.kfifo, &eh, sizeof(eh));
> > > > +	kfifo_in(&r_evt->proto->equeue.kfifo, buf, len);
> > > > +	queue_work(r_evt->proto->equeue.wq,
> > > > +		   &r_evt->proto->equeue.notify_work);
> > > 
> > > Is it safe to ignore the return value from the queue_work here?
> > > 
> > 
> > In fact yes, we do not want to care: it returns true or false depending on the
> > fact that the specific work was or not already queued, and we just rely on
> > this behavior to keep kicking the worker only when needed but never kick
> > more than one instance of it per-queue (so that there's only one reader
> > wq and one writer here in the scmi_notify)...explaining better:
> > 
> > 1. we push an event (hdr+payld) to the protocol queue if we found that there was
> > enough space on the queue
> > 
> > 2a. if at the time of the kfifo_in( ) the worker was already running
> > (queue not empty) it will process our new event sooner or later and here
> > the queue_work will return false, but we do not care in fact ... we
> > tried to kick it just in case
> > 
> > 2b. if instead at the time of the kfifo_in() the queue was empty the worker would
> > have probably already gone to the sleep and this queue_work() will return true and
> > so this time it will effectively wake up the worker to process our items
> > 
> > The important thing here is that we are sure to wakeup the worker when needed
> > but we are equally sure we are never causing the scheduling of more than one worker
> > thread consuming from the same queue (because that would break the one reader/one writer
> > assumption which let us use the fifo in a lockless manner): this is possible because
> > queue_work checks if the required work item is already pending and in such a case backs
> > out returning false and we have one work_item (notify_work) defined per-protocol and
> > so per-queue.
> 
> I see. That's a good assumption: one work_item per protocol and simplify
> the locking. What if there would be an edge case scenario when the
> consumer (work_item) has handled the last item (there was NULL from
> scmi_process_event_header()), while in meantime scmi_notify put into
> the fifo new event but couldn't kick the queue_work. Would it stay there
> till the next IRQ which triggers queue_work to consume two events (one
> potentially a bit old)? Or we can ignore such race situation assuming
> that cleaning of work item is instant and kfifo_in is slow?
> 

In fact, this is a very good point, since between the moment the worker
determines that the queue is empty and the moment in which the worker
effectively exits (and it's marked as no more pending by the Kernel cmwq)
there is a window of opportunity for a race in which the ISR could fill
the queue with one more event and then fail to kick with queue_work() since
the work is in fact still nominally marked as pending from the point of view
of Kernel cmwq, as below:

ISR (core N)		|	WQ (core N+1)		cmwq flags	       	queued events
------------------------------------------------------------------------------------------------
			| if (queue_is_empty)		- WORK_PENDING		0 events queued
			+     ...			- WORK_PENDING		0 events queued
			+ } while (scmi_process_event_payload);
			+}// worker function exit 
kfifo_in()		+     ...cmwq backing out	- WORK_PENDING		1 events queued
kfifo_in()		+     ...cmwq backing out	- WORK_PENDING		1 events queued
queue_work()		+     ...cmwq backing out	- WORK_PENDING		1 events queued
  -> FALSE (pending)	+     ...cmwq backing out	- WORK_PENDING		1 events queued
			+     ...cmwq backing out	- WORK_PENDING		1 events queued
			+     ...cmwq backing out	- WORK_PENDING		1 events queued
			| ---- WORKER THREAD EXIT	- !WORK_PENDING		1 events queued
			| 		 		- !WORK_PENDING		1 events queued
kfifo_in()		|     				- !WORK_PENDING		2 events queued
kfifo_in()		|  				- !WORK_PENDING		2 events queued
queue_work()		|     				- !WORK_PENDING		2 events queued
   -> TRUE		| --- WORKER ENTER		- WORK_PENDING		2 events queued
			|  				- WORK_PENDING		2 events consumed
		
where effectively the last event queued won't be consumed till the next
iteration once another event is queued.

Given the fact that the ISR and the dedicated WQ on an SMP run effectively
in parallel I do not think unfortunately that we can simply count on the fact
the worker exit is faster than the kifos_in, enough to close the race window
opportunity. (even if rare)

On the other side considering the impact of such scenario, I can imagine that
it's not simply that we could only have a delayed delivery, but we must consider
that if the delayed event is effectively the last one ever it would remain
undelivered forever; this is particularly worrying in a scenario in which such
last event is particularly important: imagine a system shutdown where a last
system-power-off remains undelivered.

As a consequence I think this rare racy condition should be addressed somehow.

Looking at this scenario, it seems the classic situation in which you want to
use some sort of completion to avoid missing out on events delivery, BUT in our
usecase:

- placing the workers loaned from cmwq into an unbounded wait_for_completion()
  once the queue is empty seems not the best to use resources (and probably
  frowned upon)....using a few dedicated kernel threads to simply let them idle
  waiting most of the time seems equally frowned upon (I could be wrong...))
- the needed complete() in the ISR would introduce a spinlock_irqsave into the
  interrupt path (there's already one inside queue_work in fact) so it is not
  desirable, at least not if used on a regular base (for each event notified)

So I was thinking to try to reduce sensibly the above race window, more
than eliminate it completely, by adding an early flag to be checked under
specific conditions in order to retry the queue_work a few times when the race
is hit, something like:

ISR (core N)		|	WQ (core N+1)
-------------------------------------------------------------------------------
			| atomic_set(&exiting, 0);
			|
			| do {
			|	...
			| 	if (queue_is_empty)		- WORK_PENDING		0 events queued
			+          atomic_set(&exiting, 1)	- WORK_PENDING		0 events queued
static int cnt=3	|          --> breakout of while	- WORK_PENDING		0 events queued
kfifo_in()		|	....
			| } while (scmi_process_event_payload);
kfifo_in()		|
exiting = atomic_read()	|     ...cmwq backing out		- WORK_PENDING		1 events queued
do {			|     ...cmwq backing out		- WORK_PENDING		1 events queued
    ret = queue_work() 	|     ...cmwq backing out		- WORK_PENDING		1 events queued
    if (ret || !exiting)|     ...cmwq backing out		- WORK_PENDING		1 events queued
	break;		|     ...cmwq backing out		- WORK_PENDING		1 events queued
    mdelay(5);		|     ...cmwq backing out		- WORK_PENDING		1 events queued
    exiting =		|     ...cmwq backing out		- WORK_PENDING		1 events queued
      atomic_read;	|     ...cmwq backing out		- WORK_PENDING		1 events queued
} while (--cnt);	|     ...cmwq backing out		- WORK_PENDING		1 events queued
			| ---- WORKER EXIT 			- !WORK_PENDING		0 events queued

like down below between the scissors.

Not tested or tried....I could be missing something...and the mdelay is horrible (and not
the cleanest thing you've ever seen probably :D)...I'll have a chat with Sudeep too.

> > 
> > Now probably I wrote too much of an explanation and confuse stuff more ... :D
> 
> No, thank you for the detailed explanation. I will continue my review.
> 

Thanks

Regards

Cristian



-->8-----------------
diff --git a/drivers/firmware/arm_scmi/notify.c b/drivers/firmware/arm_scmi/notify.c
index 9eb6b8b71bac..8719e077358c 100644
--- a/drivers/firmware/arm_scmi/notify.c
+++ b/drivers/firmware/arm_scmi/notify.c
@@ -223,6 +223,7 @@ struct scmi_notify_instance {
  */
 struct events_queue {
 	size_t				sz;
+	atomic_t			exiting;
 	struct kfifo			kfifo;
 	struct work_struct		notify_work;
 	struct workqueue_struct		*wq;
@@ -406,11 +407,16 @@ scmi_process_event_header(struct events_queue *eq,
 
 	outs = kfifo_out(&eq->kfifo, pd->eh,
 			 sizeof(struct scmi_event_header));
-	if (!outs)
+	if (!outs) {
+		atomic_set(&eq->exiting, 1);
+		smp_mb__after_atomic();
 		return NULL;
+	}
 	if (outs != sizeof(struct scmi_event_header)) {
 		pr_err("SCMI Notifications: corrupted EVT header. Flush.\n");
 		kfifo_reset_out(&eq->kfifo);
+		atomic_set(&eq->exiting, 1);
+		smp_mb__after_atomic();
 		return NULL;
 	}
 
@@ -446,6 +452,8 @@ scmi_process_event_payload(struct events_queue *eq,
 	outs = kfifo_out(&eq->kfifo, pd->eh->payld, pd->eh->payld_sz);
 	if (unlikely(!outs)) {
 		pr_warn("--- EMPTY !!!!\n");
+		atomic_set(&eq->exiting, 1);
+		smp_mb__after_atomic();
 		return false;
 	}
 
@@ -455,6 +463,8 @@ scmi_process_event_payload(struct events_queue *eq,
 	if (unlikely(outs != pd->eh->payld_sz)) {
 		pr_err("SCMI Notifications: corrupted EVT Payload. Flush.\n");
 		kfifo_reset_out(&eq->kfifo);
+		atomic_set(&eq->exiting, 1);
+		smp_mb__after_atomic();
 		return false;
 	}
 
@@ -526,6 +536,8 @@ static void scmi_events_dispatcher(struct work_struct *work)
 	mdelay(200);
 
 	eq = container_of(work, struct events_queue, notify_work);
+	atomic_set(&eq->exiting, 0);
+	smp_mb__after_atomic();
 	pd = container_of(eq, struct scmi_registered_protocol_events_desc,
 			  equeue);
 	/*
@@ -579,6 +591,8 @@ static void scmi_events_dispatcher(struct work_struct *work)
 int scmi_notify(const struct scmi_handle *handle, u8 proto_id, u8 evt_id,
 		const void *buf, size_t len, u64 ts)
 {
+	bool exiting;
+	static u8 cnt = 3;
 	struct scmi_registered_event *r_evt;
 	struct scmi_event_header eh;
 	struct scmi_notify_instance *ni = handle->notify_priv;
@@ -616,8 +630,20 @@ int scmi_notify(const struct scmi_handle *handle, u8 proto_id, u8 evt_id,
 	kfifo_in(&r_evt->proto->equeue.kfifo, &eh, sizeof(eh));
 	mdelay(30);
 	kfifo_in(&r_evt->proto->equeue.kfifo, buf, len);
-	queue_work(r_evt->proto->equeue.wq,
-		   &r_evt->proto->equeue.notify_work);
+
+	smp_mb__before_atomic();
+	exiting = atomic_read(&r_evt->proto->equeue.exiting);
+	do {
+		bool ret;
+
+		ret = queue_work(r_evt->proto->equeue.wq,
+				 &r_evt->proto->equeue.notify_work);
+		if (likely(ret || !exiting))
+			break;
+		mdelay(5);
+		smp_mb__before_atomic();
+		exiting = atomic_read(&r_evt->proto->equeue.exiting);
+	} while (--cnt);
 
 	return 0;
 }
@@ -655,6 +681,7 @@ static int scmi_initialize_events_queue(struct scmi_notify_instance *ni,
 				       &equeue->kfifo);
 	if (ret)
 		return ret;
+	atomic_set(&equeue->exiting, 0);
 
 	INIT_WORK(&equeue->notify_work, scmi_events_dispatcher);
 	equeue->wq = ni->notify_wq;
--<8-----------------------
Lukasz Luba March 18, 2020, 8:26 a.m. UTC | #11
Hi Cristian,

On 3/16/20 2:46 PM, Cristian Marussi wrote:
> On Thu, Mar 12, 2020 at 09:43:31PM +0000, Lukasz Luba wrote:
>>
>>
>> On 3/12/20 6:34 PM, Cristian Marussi wrote:
>>> On 12/03/2020 13:51, Lukasz Luba wrote:
>>>> Hi Cristian,
>>>>
> Hi Lukasz
> 
>>>> just one comment below...
> [snip]
>>>>> +	eh.timestamp = ts;
>>>>> +	eh.evt_id = evt_id;
>>>>> +	eh.payld_sz = len;
>>>>> +	kfifo_in(&r_evt->proto->equeue.kfifo, &eh, sizeof(eh));
>>>>> +	kfifo_in(&r_evt->proto->equeue.kfifo, buf, len);
>>>>> +	queue_work(r_evt->proto->equeue.wq,
>>>>> +		   &r_evt->proto->equeue.notify_work);
>>>>
>>>> Is it safe to ignore the return value from the queue_work here?
>>>>
>>>
>>> In fact yes, we do not want to care: it returns true or false depending on the
>>> fact that the specific work was or not already queued, and we just rely on
>>> this behavior to keep kicking the worker only when needed but never kick
>>> more than one instance of it per-queue (so that there's only one reader
>>> wq and one writer here in the scmi_notify)...explaining better:
>>>
>>> 1. we push an event (hdr+payld) to the protocol queue if we found that there was
>>> enough space on the queue
>>>
>>> 2a. if at the time of the kfifo_in( ) the worker was already running
>>> (queue not empty) it will process our new event sooner or later and here
>>> the queue_work will return false, but we do not care in fact ... we
>>> tried to kick it just in case
>>>
>>> 2b. if instead at the time of the kfifo_in() the queue was empty the worker would
>>> have probably already gone to the sleep and this queue_work() will return true and
>>> so this time it will effectively wake up the worker to process our items
>>>
>>> The important thing here is that we are sure to wakeup the worker when needed
>>> but we are equally sure we are never causing the scheduling of more than one worker
>>> thread consuming from the same queue (because that would break the one reader/one writer
>>> assumption which let us use the fifo in a lockless manner): this is possible because
>>> queue_work checks if the required work item is already pending and in such a case backs
>>> out returning false and we have one work_item (notify_work) defined per-protocol and
>>> so per-queue.
>>
>> I see. That's a good assumption: one work_item per protocol and simplify
>> the locking. What if there would be an edge case scenario when the
>> consumer (work_item) has handled the last item (there was NULL from
>> scmi_process_event_header()), while in meantime scmi_notify put into
>> the fifo new event but couldn't kick the queue_work. Would it stay there
>> till the next IRQ which triggers queue_work to consume two events (one
>> potentially a bit old)? Or we can ignore such race situation assuming
>> that cleaning of work item is instant and kfifo_in is slow?
>>
> 
> In fact, this is a very good point, since between the moment the worker
> determines that the queue is empty and the moment in which the worker
> effectively exits (and it's marked as no more pending by the Kernel cmwq)
> there is a window of opportunity for a race in which the ISR could fill
> the queue with one more event and then fail to kick with queue_work() since
> the work is in fact still nominally marked as pending from the point of view
> of Kernel cmwq, as below:
> 
> ISR (core N)		|	WQ (core N+1)		cmwq flags	       	queued events
> ------------------------------------------------------------------------------------------------
> 			| if (queue_is_empty)		- WORK_PENDING		0 events queued
> 			+     ...			- WORK_PENDING		0 events queued
> 			+ } while (scmi_process_event_payload);
> 			+}// worker function exit
> kfifo_in()		+     ...cmwq backing out	- WORK_PENDING		1 events queued
> kfifo_in()		+     ...cmwq backing out	- WORK_PENDING		1 events queued
> queue_work()		+     ...cmwq backing out	- WORK_PENDING		1 events queued
>    -> FALSE (pending)	+     ...cmwq backing out	- WORK_PENDING		1 events queued
> 			+     ...cmwq backing out	- WORK_PENDING		1 events queued
> 			+     ...cmwq backing out	- WORK_PENDING		1 events queued
> 			| ---- WORKER THREAD EXIT	- !WORK_PENDING		1 events queued
> 			| 		 		- !WORK_PENDING		1 events queued
> kfifo_in()		|     				- !WORK_PENDING		2 events queued
> kfifo_in()		|  				- !WORK_PENDING		2 events queued
> queue_work()		|     				- !WORK_PENDING		2 events queued
>     -> TRUE		| --- WORKER ENTER		- WORK_PENDING		2 events queued
> 			|  				- WORK_PENDING		2 events consumed
> 		
> where effectively the last event queued won't be consumed till the next
> iteration once another event is queued.
> 
> Given the fact that the ISR and the dedicated WQ on an SMP run effectively
> in parallel I do not think unfortunately that we can simply count on the fact
> the worker exit is faster than the kifos_in, enough to close the race window
> opportunity. (even if rare)
> 
> On the other side considering the impact of such scenario, I can imagine that
> it's not simply that we could only have a delayed delivery, but we must consider
> that if the delayed event is effectively the last one ever it would remain
> undelivered forever; this is particularly worrying in a scenario in which such
> last event is particularly important: imagine a system shutdown where a last
> system-power-off remains undelivered.

Agree, another example could be a thermal notification for some critical
trip point.

> 
> As a consequence I think this rare racy condition should be addressed somehow.
> 
> Looking at this scenario, it seems the classic situation in which you want to
> use some sort of completion to avoid missing out on events delivery, BUT in our
> usecase:
> 
> - placing the workers loaned from cmwq into an unbounded wait_for_completion()
>    once the queue is empty seems not the best to use resources (and probably
>    frowned upon)....using a few dedicated kernel threads to simply let them idle
>    waiting most of the time seems equally frowned upon (I could be wrong...))
> - the needed complete() in the ISR would introduce a spinlock_irqsave into the
>    interrupt path (there's already one inside queue_work in fact) so it is not
>    desirable, at least not if used on a regular base (for each event notified)
> 
> So I was thinking to try to reduce sensibly the above race window, more
> than eliminate it completely, by adding an early flag to be checked under
> specific conditions in order to retry the queue_work a few times when the race
> is hit, something like:
> 
> ISR (core N)		|	WQ (core N+1)
> -------------------------------------------------------------------------------
> 			| atomic_set(&exiting, 0);
> 			|
> 			| do {
> 			|	...
> 			| 	if (queue_is_empty)		- WORK_PENDING		0 events queued
> 			+          atomic_set(&exiting, 1)	- WORK_PENDING		0 events queued
> static int cnt=3	|          --> breakout of while	- WORK_PENDING		0 events queued
> kfifo_in()		|	....
> 			| } while (scmi_process_event_payload);
> kfifo_in()		|
> exiting = atomic_read()	|     ...cmwq backing out		- WORK_PENDING		1 events queued
> do {			|     ...cmwq backing out		- WORK_PENDING		1 events queued
>      ret = queue_work() 	|     ...cmwq backing out		- WORK_PENDING		1 events queued
>      if (ret || !exiting)|     ...cmwq backing out		- WORK_PENDING		1 events queued
> 	break;		|     ...cmwq backing out		- WORK_PENDING		1 events queued
>      mdelay(5);		|     ...cmwq backing out		- WORK_PENDING		1 events queued
>      exiting =		|     ...cmwq backing out		- WORK_PENDING		1 events queued
>        atomic_read;	|     ...cmwq backing out		- WORK_PENDING		1 events queued
> } while (--cnt);	|     ...cmwq backing out		- WORK_PENDING		1 events queued
> 			| ---- WORKER EXIT 			- !WORK_PENDING		0 events queued
> 
> like down below between the scissors.
> 
> Not tested or tried....I could be missing something...and the mdelay is horrible (and not
> the cleanest thing you've ever seen probably :D)...I'll have a chat with Sudeep too.

Indeed it looks more complicated. If you like I can join your offline
discuss when Sudeep is back.

Regards,
Lukasz
Cristian Marussi March 23, 2020, 8:28 a.m. UTC | #12
Hi

On 3/18/20 8:26 AM, Lukasz Luba wrote:
> Hi Cristian,
> 
> On 3/16/20 2:46 PM, Cristian Marussi wrote:
>> On Thu, Mar 12, 2020 at 09:43:31PM +0000, Lukasz Luba wrote:
>>>
>>>
>>> On 3/12/20 6:34 PM, Cristian Marussi wrote:
>>>> On 12/03/2020 13:51, Lukasz Luba wrote:
>>>>> Hi Cristian,
>>>>>
>> Hi Lukasz
>>
>>>>> just one comment below...
>> [snip]
>>>>>> +    eh.timestamp = ts;
>>>>>> +    eh.evt_id = evt_id;
>>>>>> +    eh.payld_sz = len;
>>>>>> +    kfifo_in(&r_evt->proto->equeue.kfifo, &eh, sizeof(eh));
>>>>>> +    kfifo_in(&r_evt->proto->equeue.kfifo, buf, len);
>>>>>> +    queue_work(r_evt->proto->equeue.wq,
>>>>>> +           &r_evt->proto->equeue.notify_work);
>>>>>
>>>>> Is it safe to ignore the return value from the queue_work here?
>>>>>
>>>>
[snip]

>> On the other side considering the impact of such scenario, I can imagine that
>> it's not simply that we could only have a delayed delivery, but we must consider
>> that if the delayed event is effectively the last one ever it would remain
>> undelivered forever; this is particularly worrying in a scenario in which such
>> last event is particularly important: imagine a system shutdown where a last
>> system-power-off remains undelivered.
> 
> Agree, another example could be a thermal notification for some critical
> trip point.
> 
>>
>> As a consequence I think this rare racy condition should be addressed somehow.
>>
>> Looking at this scenario, it seems the classic situation in which you want to
>> use some sort of completion to avoid missing out on events delivery, BUT in our
>> usecase:
>>
>> - placing the workers loaned from cmwq into an unbounded wait_for_completion()
>>    once the queue is empty seems not the best to use resources (and probably
>>    frowned upon)....using a few dedicated kernel threads to simply let them idle
>>    waiting most of the time seems equally frowned upon (I could be wrong...))
>> - the needed complete() in the ISR would introduce a spinlock_irqsave into the
>>    interrupt path (there's already one inside queue_work in fact) so it is not
>>    desirable, at least not if used on a regular base (for each event notified)
>>
>> So I was thinking to try to reduce sensibly the above race window, more
>> than eliminate it completely, by adding an early flag to be checked under
>> specific conditions in order to retry the queue_work a few times when the race
>> is hit, something like:
>>
>> ISR (core N)        |    WQ (core N+1)
>> -------------------------------------------------------------------------------
>>             | atomic_set(&exiting, 0);
>>             |
>>             | do {
>>             |    ...
>>             |     if (queue_is_empty)        - WORK_PENDING        0 events queued
>>             +          atomic_set(&exiting, 1)    - WORK_PENDING        0 events queued
>> static int cnt=3    |          --> breakout of while    - WORK_PENDING        0 events queued
>> kfifo_in()        |    ....
>>             | } while (scmi_process_event_payload);
>> kfifo_in()        |
>> exiting = atomic_read()    |     ...cmwq backing out        - WORK_PENDING        1 events queued
>> do {            |     ...cmwq backing out        - WORK_PENDING        1 events queued
>>      ret = queue_work()     |     ...cmwq backing out        - WORK_PENDING        1 events queued
>>      if (ret || !exiting)|     ...cmwq backing out        - WORK_PENDING        1 events queued
>>     break;        |     ...cmwq backing out        - WORK_PENDING        1 events queued
>>      mdelay(5);        |     ...cmwq backing out        - WORK_PENDING        1 events queued
>>      exiting =        |     ...cmwq backing out        - WORK_PENDING        1 events queued
>>        atomic_read;    |     ...cmwq backing out        - WORK_PENDING        1 events queued
>> } while (--cnt);    |     ...cmwq backing out        - WORK_PENDING        1 events queued
>>             | ---- WORKER EXIT             - !WORK_PENDING        0 events queued
>>
>> like down below between the scissors.
>>
>> Not tested or tried....I could be missing something...and the mdelay is horrible (and not
>> the cleanest thing you've ever seen probably :D)...I'll have a chat with Sudeep too.
> 
> Indeed it looks more complicated. If you like I can join your offline
> discuss when Sudeep is back.
> 
Yes this is as of now my main remaining issue to address for v6.
I'll wait for Sudeep general review/feedback and raise this point.

Regards

Cristian

> Regards,
> Lukasz
Cristian Marussi May 20, 2020, 7:09 a.m. UTC | #13
On Mon, Mar 16, 2020 at 02:46:05PM +0000, Cristian Marussi wrote:
> On Thu, Mar 12, 2020 at 09:43:31PM +0000, Lukasz Luba wrote:
> > 
> > 

Hi Lukasz,

I went back looking deeper into the possible race issue you pointed out a
while ago understanding it a bit better down below.

> > On 3/12/20 6:34 PM, Cristian Marussi wrote:
> > > On 12/03/2020 13:51, Lukasz Luba wrote:
> > > > Hi Cristian,
> > > > 
> Hi Lukasz
> 
> > > > just one comment below...
> [snip]
> > > > > +	eh.timestamp = ts;
> > > > > +	eh.evt_id = evt_id;
> > > > > +	eh.payld_sz = len;
> > > > > +	kfifo_in(&r_evt->proto->equeue.kfifo, &eh, sizeof(eh));
> > > > > +	kfifo_in(&r_evt->proto->equeue.kfifo, buf, len);
> > > > > +	queue_work(r_evt->proto->equeue.wq,
> > > > > +		   &r_evt->proto->equeue.notify_work);
> > > > 
> > > > Is it safe to ignore the return value from the queue_work here?
> > > > 
> > > 
> > > In fact yes, we do not want to care: it returns true or false depending on the
> > > fact that the specific work was or not already queued, and we just rely on
> > > this behavior to keep kicking the worker only when needed but never kick
> > > more than one instance of it per-queue (so that there's only one reader
> > > wq and one writer here in the scmi_notify)...explaining better:
> > > 
> > > 1. we push an event (hdr+payld) to the protocol queue if we found that there was
> > > enough space on the queue
> > > 
> > > 2a. if at the time of the kfifo_in( ) the worker was already running
> > > (queue not empty) it will process our new event sooner or later and here
> > > the queue_work will return false, but we do not care in fact ... we
> > > tried to kick it just in case
> > > 
> > > 2b. if instead at the time of the kfifo_in() the queue was empty the worker would
> > > have probably already gone to the sleep and this queue_work() will return true and
> > > so this time it will effectively wake up the worker to process our items
> > > 
> > > The important thing here is that we are sure to wakeup the worker when needed
> > > but we are equally sure we are never causing the scheduling of more than one worker
> > > thread consuming from the same queue (because that would break the one reader/one writer
> > > assumption which let us use the fifo in a lockless manner): this is possible because
> > > queue_work checks if the required work item is already pending and in such a case backs
> > > out returning false and we have one work_item (notify_work) defined per-protocol and
> > > so per-queue.
> > 
> > I see. That's a good assumption: one work_item per protocol and simplify
> > the locking. What if there would be an edge case scenario when the
> > consumer (work_item) has handled the last item (there was NULL from
> > scmi_process_event_header()), while in meantime scmi_notify put into
> > the fifo new event but couldn't kick the queue_work. Would it stay there
> > till the next IRQ which triggers queue_work to consume two events (one
> > potentially a bit old)? Or we can ignore such race situation assuming
> > that cleaning of work item is instant and kfifo_in is slow?
> > 
> 
> In fact, this is a very good point, since between the moment the worker
> determines that the queue is empty and the moment in which the worker
> effectively exits (and it's marked as no more pending by the Kernel cmwq)
> there is a window of opportunity for a race in which the ISR could fill
> the queue with one more event and then fail to kick with queue_work() since
> the work is in fact still nominally marked as pending from the point of view
> of Kernel cmwq, as below:
> 
> ISR (core N)		|	WQ (core N+1)		cmwq flags	       	queued events
> ------------------------------------------------------------------------------------------------
> 			| if (queue_is_empty)		- WORK_PENDING		0 events queued
> 			+     ...			- WORK_PENDING		0 events queued
> 			+ } while (scmi_process_event_payload);
> 			+}// worker function exit 
> kfifo_in()		+     ...cmwq backing out	- WORK_PENDING		1 events queued
> kfifo_in()		+     ...cmwq backing out	- WORK_PENDING		1 events queued
> queue_work()		+     ...cmwq backing out	- WORK_PENDING		1 events queued
>   -> FALSE (pending)	+     ...cmwq backing out	- WORK_PENDING		1 events queued
> 			+     ...cmwq backing out	- WORK_PENDING		1 events queued
> 			+     ...cmwq backing out	- WORK_PENDING		1 events queued
> 			| ---- WORKER THREAD EXIT	- !WORK_PENDING		1 events queued
> 			| 		 		- !WORK_PENDING		1 events queued
> kfifo_in()		|     				- !WORK_PENDING		2 events queued
> kfifo_in()		|  				- !WORK_PENDING		2 events queued
> queue_work()		|     				- !WORK_PENDING		2 events queued
>    -> TRUE		| --- WORKER ENTER		- WORK_PENDING		2 events queued
> 			|  				- WORK_PENDING		2 events consumed
> 		
> where effectively the last event queued won't be consumed till the next
> iteration once another event is queued.
> 

In summary, looking better at Kernel cmwq code, my explanation above about
how the possible race could be exposed by a particular tricky limit condition
and the values assumed by the WORK_STRUCT_PENDING_BIT was ... bullshit :D

In fact there's no race at all because Kernel cmwq takes care to clear the above
PENDING flag BEFORE the user-provided worker-function starts to finally run:
such flag is active only when a work instance is queued pending for execution
but it is cleared just before execution effctively starts.

kernel/workqueue.c:process_one_work()

	set_work_pool_and_clear_pending(work, pool->id);
	....
	worker->current_func(work);

As a consequence in the racy scenario above where the ISR pushes events on the
queues after the worker has already determined the queue to be empty but while
the worker func is still being deactivated in terms of Kernel cmwq internal
handling, it is not a problem since the worker while running is already NO more
marked pending so the queue_work succeeds and a new work will simply be queued
and run once the current instance terminates fully and it is removed from pool.

On the other side in the normal non racy scenario, when the worker is processing
normally a non-empty queue, we'll end-up anyway queueing new items and a new work
from the ISR even if the currently executing one will in fact consume already
naturally the queued items: this will result (it's what I observe in fact) in a
final un-needed quick worker activation/deactivation processing zero items (empty
queue) which is in fact harmless.

Basically the racy condition is taken care by the Kernel cmwq itself, and in fact
there is an extensive explanation also of the barriers employed to properly
realize this in the comments around set_work_pool_and_clear_pending()

I'll add a comment in v8 just to note this behaviour.

Thanks

Cristian

> Given the fact that the ISR and the dedicated WQ on an SMP run effectively
> in parallel I do not think unfortunately that we can simply count on the fact
> the worker exit is faster than the kifos_in, enough to close the race window
> opportunity. (even if rare)
> 
> On the other side considering the impact of such scenario, I can imagine that
> it's not simply that we could only have a delayed delivery, but we must consider
> that if the delayed event is effectively the last one ever it would remain
> undelivered forever; this is particularly worrying in a scenario in which such
> last event is particularly important: imagine a system shutdown where a last
> system-power-off remains undelivered.
> 
> As a consequence I think this rare racy condition should be addressed somehow.
> 
> Looking at this scenario, it seems the classic situation in which you want to
> use some sort of completion to avoid missing out on events delivery, BUT in our
> usecase:
> 
> - placing the workers loaned from cmwq into an unbounded wait_for_completion()
>   once the queue is empty seems not the best to use resources (and probably
>   frowned upon)....using a few dedicated kernel threads to simply let them idle
>   waiting most of the time seems equally frowned upon (I could be wrong...))
> - the needed complete() in the ISR would introduce a spinlock_irqsave into the
>   interrupt path (there's already one inside queue_work in fact) so it is not
>   desirable, at least not if used on a regular base (for each event notified)
> 
> So I was thinking to try to reduce sensibly the above race window, more
> than eliminate it completely, by adding an early flag to be checked under
> specific conditions in order to retry the queue_work a few times when the race
> is hit, something like:
> 
> ISR (core N)		|	WQ (core N+1)
> -------------------------------------------------------------------------------
> 			| atomic_set(&exiting, 0);
> 			|
> 			| do {
> 			|	...
> 			| 	if (queue_is_empty)		- WORK_PENDING		0 events queued
> 			+          atomic_set(&exiting, 1)	- WORK_PENDING		0 events queued
> static int cnt=3	|          --> breakout of while	- WORK_PENDING		0 events queued
> kfifo_in()		|	....
> 			| } while (scmi_process_event_payload);
> kfifo_in()		|
> exiting = atomic_read()	|     ...cmwq backing out		- WORK_PENDING		1 events queued
> do {			|     ...cmwq backing out		- WORK_PENDING		1 events queued
>     ret = queue_work() 	|     ...cmwq backing out		- WORK_PENDING		1 events queued
>     if (ret || !exiting)|     ...cmwq backing out		- WORK_PENDING		1 events queued
> 	break;		|     ...cmwq backing out		- WORK_PENDING		1 events queued
>     mdelay(5);		|     ...cmwq backing out		- WORK_PENDING		1 events queued
>     exiting =		|     ...cmwq backing out		- WORK_PENDING		1 events queued
>       atomic_read;	|     ...cmwq backing out		- WORK_PENDING		1 events queued
> } while (--cnt);	|     ...cmwq backing out		- WORK_PENDING		1 events queued
> 			| ---- WORKER EXIT 			- !WORK_PENDING		0 events queued
> 
> like down below between the scissors.
> 
> Not tested or tried....I could be missing something...and the mdelay is horrible (and not
> the cleanest thing you've ever seen probably :D)...I'll have a chat with Sudeep too.
> 
> > > 
> > > Now probably I wrote too much of an explanation and confuse stuff more ... :D
> > 
> > No, thank you for the detailed explanation. I will continue my review.
> > 
> 
> Thanks
> 
> Regards
> 
> Cristian
> 
> 
> 
> -->8-----------------
> diff --git a/drivers/firmware/arm_scmi/notify.c b/drivers/firmware/arm_scmi/notify.c
> index 9eb6b8b71bac..8719e077358c 100644
> --- a/drivers/firmware/arm_scmi/notify.c
> +++ b/drivers/firmware/arm_scmi/notify.c
> @@ -223,6 +223,7 @@ struct scmi_notify_instance {
>   */
>  struct events_queue {
>  	size_t				sz;
> +	atomic_t			exiting;
>  	struct kfifo			kfifo;
>  	struct work_struct		notify_work;
>  	struct workqueue_struct		*wq;
> @@ -406,11 +407,16 @@ scmi_process_event_header(struct events_queue *eq,
>  
>  	outs = kfifo_out(&eq->kfifo, pd->eh,
>  			 sizeof(struct scmi_event_header));
> -	if (!outs)
> +	if (!outs) {
> +		atomic_set(&eq->exiting, 1);
> +		smp_mb__after_atomic();
>  		return NULL;
> +	}
>  	if (outs != sizeof(struct scmi_event_header)) {
>  		pr_err("SCMI Notifications: corrupted EVT header. Flush.\n");
>  		kfifo_reset_out(&eq->kfifo);
> +		atomic_set(&eq->exiting, 1);
> +		smp_mb__after_atomic();
>  		return NULL;
>  	}
>  
> @@ -446,6 +452,8 @@ scmi_process_event_payload(struct events_queue *eq,
>  	outs = kfifo_out(&eq->kfifo, pd->eh->payld, pd->eh->payld_sz);
>  	if (unlikely(!outs)) {
>  		pr_warn("--- EMPTY !!!!\n");
> +		atomic_set(&eq->exiting, 1);
> +		smp_mb__after_atomic();
>  		return false;
>  	}
>  
> @@ -455,6 +463,8 @@ scmi_process_event_payload(struct events_queue *eq,
>  	if (unlikely(outs != pd->eh->payld_sz)) {
>  		pr_err("SCMI Notifications: corrupted EVT Payload. Flush.\n");
>  		kfifo_reset_out(&eq->kfifo);
> +		atomic_set(&eq->exiting, 1);
> +		smp_mb__after_atomic();
>  		return false;
>  	}
>  
> @@ -526,6 +536,8 @@ static void scmi_events_dispatcher(struct work_struct *work)
>  	mdelay(200);
>  
>  	eq = container_of(work, struct events_queue, notify_work);
> +	atomic_set(&eq->exiting, 0);
> +	smp_mb__after_atomic();
>  	pd = container_of(eq, struct scmi_registered_protocol_events_desc,
>  			  equeue);
>  	/*
> @@ -579,6 +591,8 @@ static void scmi_events_dispatcher(struct work_struct *work)
>  int scmi_notify(const struct scmi_handle *handle, u8 proto_id, u8 evt_id,
>  		const void *buf, size_t len, u64 ts)
>  {
> +	bool exiting;
> +	static u8 cnt = 3;
>  	struct scmi_registered_event *r_evt;
>  	struct scmi_event_header eh;
>  	struct scmi_notify_instance *ni = handle->notify_priv;
> @@ -616,8 +630,20 @@ int scmi_notify(const struct scmi_handle *handle, u8 proto_id, u8 evt_id,
>  	kfifo_in(&r_evt->proto->equeue.kfifo, &eh, sizeof(eh));
>  	mdelay(30);
>  	kfifo_in(&r_evt->proto->equeue.kfifo, buf, len);
> -	queue_work(r_evt->proto->equeue.wq,
> -		   &r_evt->proto->equeue.notify_work);
> +
> +	smp_mb__before_atomic();
> +	exiting = atomic_read(&r_evt->proto->equeue.exiting);
> +	do {
> +		bool ret;
> +
> +		ret = queue_work(r_evt->proto->equeue.wq,
> +				 &r_evt->proto->equeue.notify_work);
> +		if (likely(ret || !exiting))
> +			break;
> +		mdelay(5);
> +		smp_mb__before_atomic();
> +		exiting = atomic_read(&r_evt->proto->equeue.exiting);
> +	} while (--cnt);
>  
>  	return 0;
>  }
> @@ -655,6 +681,7 @@ static int scmi_initialize_events_queue(struct scmi_notify_instance *ni,
>  				       &equeue->kfifo);
>  	if (ret)
>  		return ret;
> +	atomic_set(&equeue->exiting, 0);
>  
>  	INIT_WORK(&equeue->notify_work, scmi_events_dispatcher);
>  	equeue->wq = ni->notify_wq;
> --<8-----------------------
Lukasz Luba May 20, 2020, 10:23 a.m. UTC | #14
Hi Cristian,

On 5/20/20 8:09 AM, Cristian Marussi wrote:
> On Mon, Mar 16, 2020 at 02:46:05PM +0000, Cristian Marussi wrote:
>> On Thu, Mar 12, 2020 at 09:43:31PM +0000, Lukasz Luba wrote:
>>>
>>>
> 
> Hi Lukasz,
> 
> I went back looking deeper into the possible race issue you pointed out a
> while ago understanding it a bit better down below.
> 
>>> On 3/12/20 6:34 PM, Cristian Marussi wrote:
>>>> On 12/03/2020 13:51, Lukasz Luba wrote:
>>>>> Hi Cristian,
>>>>>
>> Hi Lukasz
>>
>>>>> just one comment below...
>> [snip]
>>>>>> +	eh.timestamp = ts;
>>>>>> +	eh.evt_id = evt_id;
>>>>>> +	eh.payld_sz = len;
>>>>>> +	kfifo_in(&r_evt->proto->equeue.kfifo, &eh, sizeof(eh));
>>>>>> +	kfifo_in(&r_evt->proto->equeue.kfifo, buf, len);
>>>>>> +	queue_work(r_evt->proto->equeue.wq,
>>>>>> +		   &r_evt->proto->equeue.notify_work);
>>>>>
>>>>> Is it safe to ignore the return value from the queue_work here?
>>>>>
>>>>
>>>> In fact yes, we do not want to care: it returns true or false depending on the
>>>> fact that the specific work was or not already queued, and we just rely on
>>>> this behavior to keep kicking the worker only when needed but never kick
>>>> more than one instance of it per-queue (so that there's only one reader
>>>> wq and one writer here in the scmi_notify)...explaining better:
>>>>
>>>> 1. we push an event (hdr+payld) to the protocol queue if we found that there was
>>>> enough space on the queue
>>>>
>>>> 2a. if at the time of the kfifo_in( ) the worker was already running
>>>> (queue not empty) it will process our new event sooner or later and here
>>>> the queue_work will return false, but we do not care in fact ... we
>>>> tried to kick it just in case
>>>>
>>>> 2b. if instead at the time of the kfifo_in() the queue was empty the worker would
>>>> have probably already gone to the sleep and this queue_work() will return true and
>>>> so this time it will effectively wake up the worker to process our items
>>>>
>>>> The important thing here is that we are sure to wakeup the worker when needed
>>>> but we are equally sure we are never causing the scheduling of more than one worker
>>>> thread consuming from the same queue (because that would break the one reader/one writer
>>>> assumption which let us use the fifo in a lockless manner): this is possible because
>>>> queue_work checks if the required work item is already pending and in such a case backs
>>>> out returning false and we have one work_item (notify_work) defined per-protocol and
>>>> so per-queue.
>>>
>>> I see. That's a good assumption: one work_item per protocol and simplify
>>> the locking. What if there would be an edge case scenario when the
>>> consumer (work_item) has handled the last item (there was NULL from
>>> scmi_process_event_header()), while in meantime scmi_notify put into
>>> the fifo new event but couldn't kick the queue_work. Would it stay there
>>> till the next IRQ which triggers queue_work to consume two events (one
>>> potentially a bit old)? Or we can ignore such race situation assuming
>>> that cleaning of work item is instant and kfifo_in is slow?
>>>
>>
>> In fact, this is a very good point, since between the moment the worker
>> determines that the queue is empty and the moment in which the worker
>> effectively exits (and it's marked as no more pending by the Kernel cmwq)
>> there is a window of opportunity for a race in which the ISR could fill
>> the queue with one more event and then fail to kick with queue_work() since
>> the work is in fact still nominally marked as pending from the point of view
>> of Kernel cmwq, as below:
>>
>> ISR (core N)		|	WQ (core N+1)		cmwq flags	       	queued events
>> ------------------------------------------------------------------------------------------------
>> 			| if (queue_is_empty)		- WORK_PENDING		0 events queued
>> 			+     ...			- WORK_PENDING		0 events queued
>> 			+ } while (scmi_process_event_payload);
>> 			+}// worker function exit
>> kfifo_in()		+     ...cmwq backing out	- WORK_PENDING		1 events queued
>> kfifo_in()		+     ...cmwq backing out	- WORK_PENDING		1 events queued
>> queue_work()		+     ...cmwq backing out	- WORK_PENDING		1 events queued
>>    -> FALSE (pending)	+     ...cmwq backing out	- WORK_PENDING		1 events queued
>> 			+     ...cmwq backing out	- WORK_PENDING		1 events queued
>> 			+     ...cmwq backing out	- WORK_PENDING		1 events queued
>> 			| ---- WORKER THREAD EXIT	- !WORK_PENDING		1 events queued
>> 			| 		 		- !WORK_PENDING		1 events queued
>> kfifo_in()		|     				- !WORK_PENDING		2 events queued
>> kfifo_in()		|  				- !WORK_PENDING		2 events queued
>> queue_work()		|     				- !WORK_PENDING		2 events queued
>>     -> TRUE		| --- WORKER ENTER		- WORK_PENDING		2 events queued
>> 			|  				- WORK_PENDING		2 events consumed
>> 		
>> where effectively the last event queued won't be consumed till the next
>> iteration once another event is queued.
>>
> 
> In summary, looking better at Kernel cmwq code, my explanation above about
> how the possible race could be exposed by a particular tricky limit condition
> and the values assumed by the WORK_STRUCT_PENDING_BIT was ... bullshit :D
> 
> In fact there's no race at all because Kernel cmwq takes care to clear the above
> PENDING flag BEFORE the user-provided worker-function starts to finally run:
> such flag is active only when a work instance is queued pending for execution
> but it is cleared just before execution effctively starts.
> 
> kernel/workqueue.c:process_one_work()
> 
> 	set_work_pool_and_clear_pending(work, pool->id);
> 	....
> 	worker->current_func(work);
> 
> As a consequence in the racy scenario above where the ISR pushes events on the
> queues after the worker has already determined the queue to be empty but while
> the worker func is still being deactivated in terms of Kernel cmwq internal
> handling, it is not a problem since the worker while running is already NO more
> marked pending so the queue_work succeeds and a new work will simply be queued
> and run once the current instance terminates fully and it is removed from pool.

Sounds good, thanks to for digging into this workqueue code and figuring 
it out.

> 
> On the other side in the normal non racy scenario, when the worker is processing
> normally a non-empty queue, we'll end-up anyway queueing new items and a new work
> from the ISR even if the currently executing one will in fact consume already
> naturally the queued items: this will result (it's what I observe in fact) in a
> final un-needed quick worker activation/deactivation processing zero items (empty
> queue) which is in fact harmless.
> 
> Basically the racy condition is taken care by the Kernel cmwq itself, and in fact
> there is an extensive explanation also of the barriers employed to properly
> realize this in the comments around set_work_pool_and_clear_pending()
> 
> I'll add a comment in v8 just to note this behaviour.

Great research.

Regards,
Lukasz

> 
> Thanks
> 
> Cristian
>
diff mbox series

Patch

diff --git a/drivers/firmware/arm_scmi/notify.c b/drivers/firmware/arm_scmi/notify.c
index d6c08cce3c63..0854d48d5886 100644
--- a/drivers/firmware/arm_scmi/notify.c
+++ b/drivers/firmware/arm_scmi/notify.c
@@ -44,6 +44,27 @@ 
  * as described in the SCMI Protocol specification, while src_id represents an
  * optional, protocol dependent, source identifier (like domain_id, perf_id
  * or sensor_id and so forth).
+ *
+ * Upon reception of a notification message from the platform the SCMI RX ISR
+ * passes the received message payload and some ancillary information (including
+ * an arrival timestamp in nanoseconds) to the core via @scmi_notify() which
+ * pushes the event-data itself on a protocol-dedicated kfifo queue for further
+ * deferred processing as specified in @scmi_events_dispatcher().
+ *
+ * Each protocol has it own dedicated work_struct and worker which, once kicked
+ * by the ISR, takes care to empty its own dedicated queue, deliverying the
+ * queued items into the proper notification-chain: notifications processing can
+ * proceed concurrently on distinct workers only between events belonging to
+ * different protocols while delivery of events within the same protocol is
+ * still strictly sequentially ordered by time of arrival.
+ *
+ * Events' information is then extracted from the SCMI Notification messages and
+ * conveyed, converted into a custom per-event report struct, as the void *data
+ * param to the user callback provided by the registered notifier_block, so that
+ * from the user perspective his callback will look invoked like:
+ *
+ * int user_cb(struct notifier_block *nb, unsigned long event_id, void *report)
+ *
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -64,6 +85,7 @@ 
 #include <linux/scmi_protocol.h>
 #include <linux/slab.h>
 #include <linux/types.h>
+#include <linux/workqueue.h>
 
 #include "notify.h"
 
@@ -143,6 +165,8 @@ 
 #define REVT_NOTIFY_ENABLE(revt, ...)	\
 	((revt)->proto->ops->set_notify_enabled((revt)->proto->ni->handle,     \
 						__VA_ARGS__))
+#define REVT_FILL_REPORT(revt, ...)	\
+	((revt)->proto->ops->fill_custom_report(__VA_ARGS__))
 
 struct scmi_registered_protocol_events_desc;
 
@@ -158,6 +182,7 @@  struct scmi_registered_protocol_events_desc;
  *		 and protocols are allowed to register their supported events
  * @enabled: A flag to indicate events can be enabled and start flowing
  * @init_work: A work item to perform final initializations of pending handlers
+ * @notify_wq: A reference to the allocated Kernel cmwq
  * @pending_mtx: A mutex to protect @pending_events_handlers
  * @registered_protocols: An statically allocated array containing pointers to
  *			  all the registered protocol-level specific information
@@ -173,6 +198,8 @@  struct scmi_notify_instance {
 
 	struct work_struct				init_work;
 
+	struct workqueue_struct				*notify_wq;
+
 	struct mutex					pending_mtx;
 	struct scmi_registered_protocol_events_desc	**registered_protocols;
 	DECLARE_HASHTABLE(pending_events_handlers, 8);
@@ -186,11 +213,15 @@  struct scmi_notify_instance {
  * @sz: Size in bytes of the related kfifo
  * @qbuf: Pre-allocated buffer of @sz bytes to be used by the kfifo
  * @kfifo: A dedicated Kernel kfifo descriptor
+ * @notify_work: A custom work item bound to this queue
+ * @wq: A reference to the associated workqueue
  */
 struct events_queue {
 	size_t				sz;
 	u8				*qbuf;
 	struct kfifo			kfifo;
+	struct work_struct		notify_work;
+	struct workqueue_struct		*wq;
 };
 
 /**
@@ -316,8 +347,249 @@  struct scmi_event_handler {
 
 #define IS_HNDL_PENDING(hndl)	((hndl)->r_evt == NULL)
 
+static struct scmi_event_handler *
+scmi_get_active_handler(struct scmi_notify_instance *ni, u32 evt_key);
+static void scmi_put_active_handler(struct scmi_notify_instance *ni,
+				    struct scmi_event_handler *hndl);
 static void scmi_put_handler_unlocked(struct scmi_notify_instance *ni,
 				      struct scmi_event_handler *hndl);
+
+/**
+ * scmi_lookup_and_call_event_chain  - Lookup the proper chain and call it
+ *
+ * @ni: A reference to the notification instance to use
+ * @evt_key: The key to use to lookup the related notification chain
+ * @report: The customized event-specific report to pass down to the callbacks
+ *	    as their *data parameter.
+ */
+static inline void
+scmi_lookup_and_call_event_chain(struct scmi_notify_instance *ni,
+				 u32 evt_key, void *report)
+{
+	int ret;
+	struct scmi_event_handler *hndl;
+
+	/* Here ensure the event handler cannot vanish while using it */
+	hndl = scmi_get_active_handler(ni, evt_key);
+	if (IS_ERR_OR_NULL(hndl))
+		return;
+
+	ret = blocking_notifier_call_chain(&hndl->chain,
+					   KEY_XTRACT_EVT_ID(evt_key),
+					   report);
+	/* Notifiers are NOT supposed to cut the chain ... */
+	WARN_ON_ONCE(ret & NOTIFY_STOP_MASK);
+
+	scmi_put_active_handler(ni, hndl);
+}
+
+/**
+ * scmi_process_event_header  - Dequeue and process an event header
+ *
+ * Read an event header from the protocol queue into the dedicated scratch
+ * buffer and looks for a matching registered event; in case an anomalously
+ * sized read is detected just flush the queue.
+ *
+ * @eq: The queue to use
+ * @pd: The protocol descriptor to use
+ *
+ * Returns:
+ *  - a reference to the matching registered event when found
+ *  - ERR_PTR(-EINVAL) when NO registered event could be found
+ *  - NULL when the queue is empty
+ */
+static inline struct scmi_registered_event *
+scmi_process_event_header(struct events_queue *eq,
+			  struct scmi_registered_protocol_events_desc *pd)
+{
+	unsigned int outs;
+	struct scmi_registered_event *r_evt;
+
+	outs = kfifo_out(&eq->kfifo, pd->eh,
+			 sizeof(struct scmi_event_header));
+	if (!outs)
+		return NULL;
+	if (outs != sizeof(struct scmi_event_header)) {
+		pr_err("SCMI Notifications: corrupted EVT header. Flush.\n");
+		kfifo_reset_out(&eq->kfifo);
+		return NULL;
+	}
+
+	r_evt = SCMI_GET_REVT_FROM_PD(pd, pd->eh->evt_id);
+	if (!r_evt)
+		r_evt = ERR_PTR(-EINVAL);
+
+	return r_evt;
+}
+
+/**
+ * scmi_process_event_payload  - Dequeue and process an event payload
+ *
+ * Read an event payload from the protocol queue into the dedicated scratch
+ * buffer, fills a custom report and then look for matching event handlers and
+ * call them; skip any unknown event (as marked by scmi_process_event_header())
+ * and in case an anomalously sized read is detected just flush the queue.
+ *
+ * @eq: The queue to use
+ * @pd: The protocol descriptor to use
+ * @r_evt: The registered event descriptor to use
+ *
+ * Return: False when the queue is empty
+ */
+static inline bool
+scmi_process_event_payload(struct events_queue *eq,
+			   struct scmi_registered_protocol_events_desc *pd,
+			   struct scmi_registered_event *r_evt)
+{
+	u32 src_id, key;
+	unsigned int outs;
+	void *report = NULL;
+
+	outs = kfifo_out(&eq->kfifo, pd->eh->payld, pd->eh->payld_sz);
+	if (unlikely(!outs))
+		return false;
+
+	/* Any in-flight event has now been officially processed */
+	pd->in_flight = NULL;
+
+	if (unlikely(outs != pd->eh->payld_sz)) {
+		pr_err("SCMI Notifications: corrupted EVT Payload. Flush.\n");
+		kfifo_reset_out(&eq->kfifo);
+		return false;
+	}
+
+	if (IS_ERR(r_evt)) {
+		pr_warn("SCMI Notifications: SKIP UNKNOWN EVT - proto:%X  evt:%d\n",
+			pd->id, pd->eh->evt_id);
+		return true;
+	}
+
+	report = REVT_FILL_REPORT(r_evt, pd->eh->evt_id, pd->eh->timestamp,
+				  pd->eh->payld, pd->eh->payld_sz,
+				  r_evt->report, &src_id);
+	if (!report) {
+		pr_err("SCMI Notifications: Report not available - proto:%X  evt:%d\n",
+		       pd->id, pd->eh->evt_id);
+		return true;
+	}
+
+	/* At first search for a generic ALL src_ids handler... */
+	key = MAKE_ALL_SRCS_KEY(pd->id, pd->eh->evt_id);
+	scmi_lookup_and_call_event_chain(pd->ni, key, report);
+
+	/* ...then search for any specific src_id */
+	key = MAKE_HASH_KEY(pd->id, pd->eh->evt_id, src_id);
+	scmi_lookup_and_call_event_chain(pd->ni, key, report);
+
+	return true;
+}
+
+/**
+ * scmi_events_dispatcher  - Common worker logic for all work items.
+ *
+ *  1. dequeue one pending RX notification (queued in SCMI RX ISR context)
+ *  2. generate a custom event report from the received event message
+ *  3. lookup for any registered ALL_SRC_IDs handler
+ *     - > call the related notification chain passing in the report
+ *  4. lookup for any registered specific SRC_ID handler
+ *     - > call the related notification chain passing in the report
+ *
+ * Note that:
+ * - a dedicated per-protocol kfifo queue is used: in this way an anomalous
+ *   flood of events cannot saturate other protocols' queues.
+ *
+ * - each per-protocol queue is associated to a distinct work_item, which
+ *   means, in turn, that:
+ *   + all protocols can process their dedicated queues concurrently
+ *     (since notify_wq:max_active != 1)
+ *   + anyway at most one worker instance is allowed to run on the same queue
+ *     concurrently: this ensures that we can have only one concurrent
+ *     reader/writer on the associated kfifo, so that we can use it lock-less
+ *
+ * @work: The work item to use, which is associated to a dedicated events_queue
+ */
+static void scmi_events_dispatcher(struct work_struct *work)
+{
+	struct events_queue *eq;
+	struct scmi_registered_protocol_events_desc *pd;
+	struct scmi_registered_event *r_evt;
+
+	eq = container_of(work, struct events_queue, notify_work);
+	pd = container_of(eq, struct scmi_registered_protocol_events_desc,
+			  equeue);
+	/*
+	 * In order to keep the queue lock-less and the number of memcopies
+	 * to the bare minimum needed, the dispatcher accounts for the
+	 * possibility of per-protocol in-flight events: i.e. an event whose
+	 * reception could end up being split across two subsequent runs of this
+	 * worker, first the header, then the payload.
+	 */
+	do {
+		if (likely(!pd->in_flight)) {
+			r_evt = scmi_process_event_header(eq, pd);
+			if (!r_evt)
+				break;
+			pd->in_flight = r_evt;
+		} else {
+			r_evt = pd->in_flight;
+		}
+	} while (scmi_process_event_payload(eq, pd, r_evt));
+}
+
+/**
+ * scmi_notify  - Queues a notification for further deferred processing
+ *
+ * This is called in interrupt context to queue a received event for
+ * deferred processing.
+ *
+ * @handle: The handle identifying the platform instance from which the
+ *	    dispatched event is generated
+ * @proto_id: Protocol ID
+ * @evt_id: Event ID (msgID)
+ * @buf: Event Message Payload (without the header)
+ * @len: Event Message Payload size
+ * @ts: RX Timestamp in nanoseconds (boottime)
+ *
+ * Return: 0 on Success
+ */
+int scmi_notify(const struct scmi_handle *handle, u8 proto_id, u8 evt_id,
+		const void *buf, size_t len, u64 ts)
+{
+	struct scmi_registered_event *r_evt;
+	struct scmi_event_header eh;
+	struct scmi_notify_instance *ni = handle->notify_priv;
+
+	/* Ensure atomic value is updated */
+	smp_mb__before_atomic();
+	if (unlikely(!atomic_read(&ni->enabled)))
+		return 0;
+
+	r_evt = SCMI_GET_REVT(ni, proto_id, evt_id);
+	if (unlikely(!r_evt))
+		return -EINVAL;
+
+	if (unlikely(len > r_evt->evt->max_payld_sz)) {
+		pr_err("SCMI Notifications: discard badly sized message\n");
+		return -EINVAL;
+	}
+	if (unlikely(kfifo_avail(&r_evt->proto->equeue.kfifo) <
+		     sizeof(eh) + len)) {
+		pr_warn("SCMI Notifications: queue full dropping proto_id:%d  evt_id:%d  ts:%lld\n",
+			proto_id, evt_id, ts);
+		return -ENOMEM;
+	}
+
+	eh.timestamp = ts;
+	eh.evt_id = evt_id;
+	eh.payld_sz = len;
+	kfifo_in(&r_evt->proto->equeue.kfifo, &eh, sizeof(eh));
+	kfifo_in(&r_evt->proto->equeue.kfifo, buf, len);
+	queue_work(r_evt->proto->equeue.wq,
+		   &r_evt->proto->equeue.notify_work);
+
+	return 0;
+}
+
 /**
  * scmi_initialize_events_queue  - Allocate/Initialize a kfifo buffer
  *
@@ -332,12 +604,21 @@  static void scmi_put_handler_unlocked(struct scmi_notify_instance *ni,
 static int scmi_initialize_events_queue(struct scmi_notify_instance *ni,
 					struct events_queue *equeue, size_t sz)
 {
+	int ret = 0;
+
 	equeue->qbuf = devm_kzalloc(ni->handle->dev, sz, GFP_KERNEL);
 	if (!equeue->qbuf)
 		return -ENOMEM;
 	equeue->sz = sz;
 
-	return kfifo_init(&equeue->kfifo, equeue->qbuf, equeue->sz);
+	ret = kfifo_init(&equeue->kfifo, equeue->qbuf, equeue->sz);
+	if (ret)
+		return ret;
+
+	INIT_WORK(&equeue->notify_work, scmi_events_dispatcher);
+	equeue->wq = ni->notify_wq;
+
+	return ret;
 }
 
 /**
@@ -740,6 +1021,38 @@  scmi_get_or_create_handler(struct scmi_notify_instance *ni, u32 evt_key)
 	return __scmi_event_handler_get_ops(ni, evt_key, true);
 }
 
+/**
+ * scmi_get_active_handler  - Helper to get active handlers only
+ *
+ * Search for the desired handler matching the key only in the per-protocol
+ * table of registered handlers: this is called only from the dispatching path
+ * so want to be as quick as possible and do not care about pending.
+ *
+ * @ni: A reference to the notification instance to use
+ * @evt_key: The event key to use
+ *
+ * Return: A properly refcounted active handler
+ */
+static struct scmi_event_handler *
+scmi_get_active_handler(struct scmi_notify_instance *ni, u32 evt_key)
+{
+	struct scmi_registered_event *r_evt;
+	struct scmi_event_handler *hndl = NULL;
+
+	r_evt = SCMI_GET_REVT(ni, KEY_XTRACT_PROTO_ID(evt_key),
+			      KEY_XTRACT_EVT_ID(evt_key));
+	if (likely(r_evt)) {
+		mutex_lock(&r_evt->proto->registered_mtx);
+		hndl = KEY_FIND(r_evt->proto->registered_events_handlers,
+				hndl, evt_key);
+		if (likely(hndl))
+			refcount_inc(&hndl->users);
+		mutex_unlock(&r_evt->proto->registered_mtx);
+	}
+
+	return hndl;
+}
+
 /**
  * __scmi_enable_evt  - Enable/disable events generation
  *
@@ -861,6 +1174,16 @@  static void scmi_put_handler(struct scmi_notify_instance *ni,
 	mutex_unlock(&ni->pending_mtx);
 }
 
+static void scmi_put_active_handler(struct scmi_notify_instance *ni,
+					  struct scmi_event_handler *hndl)
+{
+	struct scmi_registered_event *r_evt = hndl->r_evt;
+
+	mutex_lock(&r_evt->proto->registered_mtx);
+	scmi_put_handler_unlocked(ni, hndl);
+	mutex_unlock(&r_evt->proto->registered_mtx);
+}
+
 /**
  * scmi_event_handler_enable_events  - Enable events associated to an handler
  *
@@ -1087,6 +1410,12 @@  int scmi_notification_init(struct scmi_handle *handle)
 	ni->gid = gid;
 	ni->handle = handle;
 
+	ni->notify_wq = alloc_workqueue("scmi_notify",
+					WQ_UNBOUND | WQ_FREEZABLE | WQ_SYSFS,
+					0);
+	if (!ni->notify_wq)
+		goto err;
+
 	ni->registered_protocols = devm_kcalloc(handle->dev, SCMI_MAX_PROTO,
 						sizeof(char *), GFP_KERNEL);
 	if (!ni->registered_protocols)
@@ -1133,6 +1462,9 @@  void scmi_notification_exit(struct scmi_handle *handle)
 	/* Ensure atomic values are updated */
 	smp_mb__after_atomic();
 
+	/* Destroy while letting pending work complete */
+	destroy_workqueue(ni->notify_wq);
+
 	devres_release_group(ni->handle->dev, ni->gid);
 
 	pr_info("SCMI Notifications Core Shutdown.\n");
diff --git a/drivers/firmware/arm_scmi/notify.h b/drivers/firmware/arm_scmi/notify.h
index f765acda2311..6cd386649d5a 100644
--- a/drivers/firmware/arm_scmi/notify.h
+++ b/drivers/firmware/arm_scmi/notify.h
@@ -51,10 +51,17 @@  struct scmi_event {
  *			using the proper custom protocol commands.
  *			Return true if at least one the required src_id
  *			has been successfully enabled/disabled
+ * @fill_custom_report: fills a custom event report from the provided
+ *			event message payld identifying the event
+ *			specific src_id.
+ *			Return NULL on failure otherwise @report now fully
+ *			populated
  */
 struct scmi_protocol_event_ops {
 	bool (*set_notify_enabled)(const struct scmi_handle *handle,
 				   u8 evt_id, u32 src_id, bool enabled);
+	void *(*fill_custom_report)(u8 evt_id, u64 timestamp, const void *payld,
+				    size_t payld_sz, void *report, u32 *src_id);
 };
 
 int scmi_notification_init(struct scmi_handle *handle);
@@ -65,5 +72,7 @@  int scmi_register_protocol_events(const struct scmi_handle *handle,
 				  const struct scmi_protocol_event_ops *ops,
 				  const struct scmi_event *evt, int num_events,
 				  int num_sources);
+int scmi_notify(const struct scmi_handle *handle, u8 proto_id, u8 evt_id,
+		const void *buf, size_t len, u64 ts);
 
 #endif /* _SCMI_NOTIFY_H */