
[2/7] cxl/mbox: Add background cmd handling machinery

Message ID 20230421092321.12741-3-dave@stgolabs.net
State New, archived
Series cxl: Background cmds and device sanitation

Commit Message

Davidlohr Bueso April 21, 2023, 9:23 a.m. UTC
This adds support for handling background operations, as defined in
the CXL 3.0 spec. Commands that can take too long (over ~2 seconds)
can be run asynchronously in the background by the hardware.

The driver will deal with such commands synchronously, blocking all
other incoming commands for a specified period of time, allowing the
command to be time-sliced so that the caller can send incremental
requests and avoid monopolizing the driver/device. This approach
keeps the code simpler: any loss of sync (timeout) between the
driver and hardware is simply disregarded as an invalid state until
the next successful submission.

On devices where mbox interrupts are supported, this will still use
a poller that wakes up at the specified wait intervals. The irq
handler will simply wake a blocked cmd, which is also safe against a
task that is either waking (timing out) or already awoken. Similarly,
any irq setup error during probing falls back to polling, thus
avoiding unnecessarily erroring out.

Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
---
 drivers/cxl/core/mbox.c |   3 +-
 drivers/cxl/cxl.h       |   7 +++
 drivers/cxl/cxlmem.h    |   5 ++
 drivers/cxl/pci.c       | 104 +++++++++++++++++++++++++++++++++++++++-
 4 files changed, 117 insertions(+), 2 deletions(-)

Comments

Li, Ming4 April 23, 2023, 7:54 a.m. UTC | #1
On 4/21/2023 5:23 PM, Davidlohr Bueso wrote:
> This adds support for handling background operations, as defined in
> the CXL 3.0 spec. Commands that can take too long (over ~2 seconds)
> can run in the background asynchronously (to the hardware).
> 
> The driver will deal with such commands synchronously, blocking all
> other incoming commands for a specified period of time, allowing
> time-slicing the command such that the caller can send incremental
> requests to avoid monopolizing the driver/device. This approach
> makes the code simpler, where any out of sync (timeout) between the
> driver and hardware is just disregarded as an invalid state until
> the next successful submission.
> 
> On devices where mbox interrupts are supported, this will still use
> a poller that will wakeup in the specified wait intervals. The irq
> handler will simply awake a blocked cmd, which is also safe vs a
> task that is either waking (timing out) or already awoken. Similarly
> any irq setup error during the probing falls back to polling, thus
> avoids unnecessarily erroring out.
> 
> Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
> ---

......

> +static bool cxl_mbox_background_complete(struct cxl_dev_state *cxlds)
> +{
> +	u64 reg;
> +
> +	reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
> +	return FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_PCT_MASK, reg) == 100;
> +}

Would using a macro to define '100' be better?
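
For illustration, a minimal sketch of what that could look like (the macro
name below is hypothetical, nothing in the patch defines it):

/* hypothetical name, purely illustrative */
#define CXL_MBOX_BG_CMD_PCT_DONE 100

static bool cxl_mbox_background_complete(struct cxl_dev_state *cxlds)
{
	u64 reg;

	reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
	return FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_PCT_MASK, reg) ==
	       CXL_MBOX_BG_CMD_PCT_DONE;
}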

> +
> +static irqreturn_t cxl_pci_mbox_irq(int irq, void *id)
> +{
> +	struct cxl_dev_state *cxlds = id;
> +
> +	/* spurious or raced with hw? */
> +	if (!cxl_mbox_background_complete(cxlds)) {
> +		struct pci_dev *pdev = to_pci_dev(cxlds->dev);
> +
> +		dev_warn(&pdev->dev,
> +			 "Mailbox background operation IRQ but incomplete\n");
> +		goto done;
> +	}
> +
> +	/* short-circuit the wait in __cxl_pci_mbox_send_cmd() */
> +	wake_up(&mbox_wait);
> +done:
> +	return IRQ_HANDLED;
> +}
> +
>  /**
>   * __cxl_pci_mbox_send_cmd() - Execute a mailbox command
>   * @cxlds: The device state to communicate with.
> @@ -178,7 +206,59 @@ static int __cxl_pci_mbox_send_cmd(struct cxl_dev_state *cxlds,
>  	mbox_cmd->return_code =
>  		FIELD_GET(CXLDEV_MBOX_STATUS_RET_CODE_MASK, status_reg);
>  
> -	if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS) {
> +	/*
> +	 * Handle the background command in a synchronous manner.
> +	 *
> +	 * All other mailbox commands will serialize/queue on the mbox_mutex,
> +	 * which we currently hold. Furthermore this also guarantees that
> +	 * cxl_mbox_background_complete() checks are safe amongst each other,
> +	 * in that no new bg operation can occur in between.
> +	 *
> +	 * Background operations are timesliced in accordance with the nature
> +	 * of the command. In the event of timeout, the mailbox state is
> +	 * indeterminate until the next successful command submission and the
> +	 * driver can get back in sync with the hardware state.
> +	 */
> +	if (mbox_cmd->return_code == CXL_MBOX_CMD_RC_BACKGROUND) {
> +		u64 bg_status_reg;
> +		int i;
> +
> +		dev_dbg(dev, "Mailbox background operation (0x%04x) started\n",
> +			mbox_cmd->opcode);
> +
> +		for (i = 0; i < mbox_cmd->poll_count; i++) {
> +			int ret = wait_event_interruptible_timeout(
> +				mbox_wait, cxl_mbox_background_complete(cxlds),
> +				msecs_to_jiffies(mbox_cmd->poll_interval));
> +			if (ret > 0)
> +				break;
> +
> +			/* interrupted by a signal */
> +			if (ret < 0)
> +				return ret;
> +		}
> +
> +		if (!cxl_mbox_background_complete(cxlds)) {
> +			u64 md_status =
> +				readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
> +
> +			cxl_cmd_err(cxlds->dev, mbox_cmd, md_status,
> +				    "background timeout");
> +			return -ETIMEDOUT;
> +		}
> +
> +		bg_status_reg = readq(cxlds->regs.mbox +
> +				      CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
> +		mbox_cmd->return_code =
> +			FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_RC_MASK,
> +				  bg_status_reg);
> +		dev_dbg(dev,
> +			"Mailbox background operation (0x%04x) completed\n",
> +			mbox_cmd->opcode);
> +	}
> +
> +	if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS &&
> +	    mbox_cmd->return_code != CXL_MBOX_CMD_RC_BACKGROUND) {
>  		dev_dbg(dev, "Mailbox operation had an error: %s\n",
>  			cxl_mbox_cmd_rc2str(mbox_cmd));
>  		return 0; /* completed but caller must check return_code */

Why does this only handle failure cases for non-background commands? Maybe I missed something, but I think we need to do the same thing here for background commands.

Thanks
Ming

> @@ -224,6 +304,7 @@ static int cxl_pci_setup_mailbox(struct cxl_dev_state *cxlds)
>  	const int cap = readl(cxlds->regs.mbox + CXLDEV_MBOX_CAPS_OFFSET);
>  	unsigned long timeout;
>  	u64 md_status;
> +	int rc, irq;
>  
>  	timeout = jiffies + mbox_ready_timeout * HZ;
>  	do {
> @@ -272,6 +353,27 @@ static int cxl_pci_setup_mailbox(struct cxl_dev_state *cxlds)
>  	dev_dbg(cxlds->dev, "Mailbox payload sized %zu",
>  		cxlds->payload_size);
>  
> +	if (cap & CXLDEV_MBOX_CAP_BG_CMD_IRQ) {
> +		struct pci_dev *pdev = to_pci_dev(cxlds->dev);
> +
> +		irq = pci_irq_vector(pdev,
> +			     FIELD_GET(CXLDEV_MBOX_CAP_IRQ_MSGNUM_MASK, cap));
> +		if (irq < 0)
> +			goto mbox_poll;
> +
> +		rc = devm_request_irq(cxlds->dev, irq, cxl_pci_mbox_irq,
> +				      IRQF_SHARED, "mailbox", cxlds);
> +		if (rc)
> +			goto mbox_poll;
> +
> +		writel(CXLDEV_MBOX_CTRL_BG_CMD_IRQ,
> +		       cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);
> +
> +		return 0;
> +	}
> +
> +mbox_poll:
> +	dev_dbg(cxlds->dev, "Mailbox interrupts are unsupported");
>  	return 0;
>  }
>
Davidlohr Bueso April 23, 2023, 8:51 p.m. UTC | #2
On Sun, 23 Apr 2023, Li, Ming wrote:

>On 4/21/2023 5:23 PM, Davidlohr Bueso wrote:
>> +static bool cxl_mbox_background_complete(struct cxl_dev_state *cxlds)
>> +{
>> +	u64 reg;
>> +
>> +	reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
>> +	return FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_PCT_MASK, reg) == 100;
>> +}
>
>should using a MACRO to define '100' be better?

Given that an abstraction is already being provided, this feels like
overkill. Plus pct == 100 is pretty self-descriptive.

>> +
>> +static irqreturn_t cxl_pci_mbox_irq(int irq, void *id)
>> +{
>> +	struct cxl_dev_state *cxlds = id;
>> +
>> +	/* spurious or raced with hw? */
>> +	if (!cxl_mbox_background_complete(cxlds)) {

While at it, this probably wants to be unlikely().

>> +		struct pci_dev *pdev = to_pci_dev(cxlds->dev);
>> +
>> +		dev_warn(&pdev->dev,
>> +			 "Mailbox background operation IRQ but incomplete\n");
>> +		goto done;
>> +	}
>> +
>> +	/* short-circuit the wait in __cxl_pci_mbox_send_cmd() */
>> +	wake_up(&mbox_wait);
>> +done:
>> +	return IRQ_HANDLED;
>> +}
>> +
>>  /**
>>   * __cxl_pci_mbox_send_cmd() - Execute a mailbox command
>>   * @cxlds: The device state to communicate with.
>> @@ -178,7 +206,59 @@ static int __cxl_pci_mbox_send_cmd(struct cxl_dev_state *cxlds,
>>	mbox_cmd->return_code =
>>		FIELD_GET(CXLDEV_MBOX_STATUS_RET_CODE_MASK, status_reg);
>>
>> -	if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS) {
>> +	/*
>> +	 * Handle the background command in a synchronous manner.
>> +	 *
>> +	 * All other mailbox commands will serialize/queue on the mbox_mutex,
>> +	 * which we currently hold. Furthermore this also guarantees that
>> +	 * cxl_mbox_background_complete() checks are safe amongst each other,
>> +	 * in that no new bg operation can occur in between.
>> +	 *
>> +	 * Background operations are timesliced in accordance with the nature
>> +	 * of the command. In the event of timeout, the mailbox state is
>> +	 * indeterminate until the next successful command submission and the
>> +	 * driver can get back in sync with the hardware state.
>> +	 */
>> +	if (mbox_cmd->return_code == CXL_MBOX_CMD_RC_BACKGROUND) {
>> +		u64 bg_status_reg;
>> +		int i;
>> +
>> +		dev_dbg(dev, "Mailbox background operation (0x%04x) started\n",
>> +			mbox_cmd->opcode);
>> +
>> +		for (i = 0; i < mbox_cmd->poll_count; i++) {
>> +			int ret = wait_event_interruptible_timeout(
>> +				mbox_wait, cxl_mbox_background_complete(cxlds),
>> +				msecs_to_jiffies(mbox_cmd->poll_interval));
>> +			if (ret > 0)
>> +				break;
>> +
>> +			/* interrupted by a signal */
>> +			if (ret < 0)
>> +				return ret;
>> +		}
>> +
>> +		if (!cxl_mbox_background_complete(cxlds)) {
>> +			u64 md_status =
>> +				readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
>> +
>> +			cxl_cmd_err(cxlds->dev, mbox_cmd, md_status,
>> +				    "background timeout");
>> +			return -ETIMEDOUT;
>> +		}
>> +
>> +		bg_status_reg = readq(cxlds->regs.mbox +
>> +				      CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
>> +		mbox_cmd->return_code =
>> +			FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_RC_MASK,
>> +				  bg_status_reg);
>> +		dev_dbg(dev,
>> +			"Mailbox background operation (0x%04x) completed\n",
>> +			mbox_cmd->opcode);
>> +	}
>> +
>> +	if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS &&
>> +	    mbox_cmd->return_code != CXL_MBOX_CMD_RC_BACKGROUND) {
>>		dev_dbg(dev, "Mailbox operation had an error: %s\n",
>>			cxl_mbox_cmd_rc2str(mbox_cmd));
>>		return 0; /* completed but caller must check return_code */
>
>why does here only handle failure cases for non-background command? Maybe I missed something, I think that we need to do same thing here for background command.

Good point. Checking for the background rc here is bogus and confusing because
this is a synchronous path and it will never be true. I'll get rid of it; while
harmless, it is semantically wrong. The same check is, however, necessary for
sanitation later in the cxl_internal_send_cmd() layer.
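
A minimal sketch of what that cleanup could look like (illustrative only,
not the posted follow-up):

/*
 * By this point the background path above has overwritten return_code
 * with the value from the Background Command Status register, so
 * RC_BACKGROUND can no longer be observed here and the extra check can
 * simply be dropped:
 */
if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS) {
	dev_dbg(dev, "Mailbox operation had an error: %s\n",
		cxl_mbox_cmd_rc2str(mbox_cmd));
	return 0; /* completed but caller must check return_code */
}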

Thanks,
Davidlohr
Dave Jiang April 28, 2023, 4:21 p.m. UTC | #3
On 4/21/23 2:23 AM, Davidlohr Bueso wrote:
> This adds support for handling background operations, as defined in
> the CXL 3.0 spec. Commands that can take too long (over ~2 seconds)
> can run in the background asynchronously (to the hardware).
> 
> The driver will deal with such commands synchronously, blocking all
> other incoming commands for a specified period of time, allowing
> time-slicing the command such that the caller can send incremental
> requests to avoid monopolizing the driver/device. This approach
> makes the code simpler, where any out of sync (timeout) between the
> driver and hardware is just disregarded as an invalid state until
> the next successful submission.
> 
> On devices where mbox interrupts are supported, this will still use
> a poller that will wakeup in the specified wait intervals. The irq
> handler will simply awake a blocked cmd, which is also safe vs a
> task that is either waking (timing out) or already awoken. Similarly
> any irq setup error during the probing falls back to polling, thus
> avoids unnecessarily erroring out.
> 
> Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
> ---
>   drivers/cxl/core/mbox.c |   3 +-
>   drivers/cxl/cxl.h       |   7 +++
>   drivers/cxl/cxlmem.h    |   5 ++
>   drivers/cxl/pci.c       | 104 +++++++++++++++++++++++++++++++++++++++-
>   4 files changed, 117 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 6198637cb0bb..cde7270c6037 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -180,7 +180,8 @@ int cxl_internal_send_cmd(struct cxl_dev_state *cxlds,
>   	if (rc)
>   		return rc;
>   
> -	if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS)
> +	if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS &&
> +	    mbox_cmd->return_code != CXL_MBOX_CMD_RC_BACKGROUND)
>   		return cxl_mbox_cmd_rc2errno(mbox_cmd);
>   
>   	if (!out_size)
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index 044a92d9813e..72731a896f58 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -176,14 +176,21 @@ static inline int ways_to_eiw(unsigned int ways, u8 *eiw)
>   /* CXL 2.0 8.2.8.4 Mailbox Registers */
>   #define CXLDEV_MBOX_CAPS_OFFSET 0x00
>   #define   CXLDEV_MBOX_CAP_PAYLOAD_SIZE_MASK GENMASK(4, 0)
> +#define   CXLDEV_MBOX_CAP_IRQ_MSGNUM_MASK GENMASK(10, 7)
> +#define   CXLDEV_MBOX_CAP_BG_CMD_IRQ BIT(6)
>   #define CXLDEV_MBOX_CTRL_OFFSET 0x04
>   #define   CXLDEV_MBOX_CTRL_DOORBELL BIT(0)
> +#define   CXLDEV_MBOX_CTRL_BG_CMD_IRQ BIT(2)
>   #define CXLDEV_MBOX_CMD_OFFSET 0x08
>   #define   CXLDEV_MBOX_CMD_COMMAND_OPCODE_MASK GENMASK_ULL(15, 0)
>   #define   CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK GENMASK_ULL(36, 16)
>   #define CXLDEV_MBOX_STATUS_OFFSET 0x10
> +#define   CXLDEV_MBOX_STATUS_BG_CMD BIT(0)
>   #define   CXLDEV_MBOX_STATUS_RET_CODE_MASK GENMASK_ULL(47, 32)
>   #define CXLDEV_MBOX_BG_CMD_STATUS_OFFSET 0x18
> +#define   CXLDEV_MBOX_BG_CMD_COMMAND_OPCODE_MASK GENMASK_ULL(15, 0)
> +#define   CXLDEV_MBOX_BG_CMD_COMMAND_PCT_MASK GENMASK_ULL(22, 16)
> +#define   CXLDEV_MBOX_BG_CMD_COMMAND_RC_MASK GENMASK_ULL(47, 32)
>   #define CXLDEV_MBOX_PAYLOAD_OFFSET 0x20
>   
>   /*
> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index 090acebba4fa..8c3302fc7738 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -108,6 +108,9 @@ static inline struct cxl_ep *cxl_ep_load(struct cxl_port *port,
>    *            variable sized output commands, it tells the exact number of bytes
>    *            written.
>    * @min_out: (input) internal command output payload size validation
> + * @poll_count: (input)  Number of timeouts to attempt.
> + * @poll_interval: (input) Number of ms between mailbox background command
> + *                 polling intervals timeouts.
>    * @return_code: (output) Error code returned from hardware.
>    *
>    * This is the primary mechanism used to send commands to the hardware.
> @@ -123,6 +126,8 @@ struct cxl_mbox_cmd {
>   	size_t size_in;
>   	size_t size_out;
>   	size_t min_out;
> +	int poll_count;
> +	int poll_interval;
>   	u16 return_code;
>   };
>   
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 39b829a29f6c..aa1bb74a52a1 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -51,6 +51,7 @@
>   static unsigned short mbox_ready_timeout = 60;
>   module_param(mbox_ready_timeout, ushort, 0644);
>   MODULE_PARM_DESC(mbox_ready_timeout, "seconds to wait for mailbox ready");
> +static DECLARE_WAIT_QUEUE_HEAD(mbox_wait);

I apologize if I've asked this before and you've already answered. What 
is the reason mbox_wait is a module-global wq instead of a per-device 
wq? Just thinking that when you tear down a device, you may want to wake 
all pending waiters for that device to clean up.

>   
>   static int cxl_pci_mbox_wait_for_doorbell(struct cxl_dev_state *cxlds)
>   {
> @@ -85,6 +86,33 @@ static int cxl_pci_mbox_wait_for_doorbell(struct cxl_dev_state *cxlds)
>   			    status & CXLMDEV_DEV_FATAL ? " fatal" : "",        \
>   			    status & CXLMDEV_FW_HALT ? " firmware-halt" : "")
>   
> +static bool cxl_mbox_background_complete(struct cxl_dev_state *cxlds)
> +{
> +	u64 reg;
> +
> +	reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
> +	return FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_PCT_MASK, reg) == 100;
> +}
> +
> +static irqreturn_t cxl_pci_mbox_irq(int irq, void *id)
> +{
> +	struct cxl_dev_state *cxlds = id;
> +
> +	/* spurious or raced with hw? */
> +	if (!cxl_mbox_background_complete(cxlds)) {
> +		struct pci_dev *pdev = to_pci_dev(cxlds->dev);
> +
> +		dev_warn(&pdev->dev,
> +			 "Mailbox background operation IRQ but incomplete\n");
> +		goto done;
> +	}
> +
> +	/* short-circuit the wait in __cxl_pci_mbox_send_cmd() */
> +	wake_up(&mbox_wait);
> +done:
> +	return IRQ_HANDLED;
> +}
> +
>   /**
>    * __cxl_pci_mbox_send_cmd() - Execute a mailbox command
>    * @cxlds: The device state to communicate with.
> @@ -178,7 +206,59 @@ static int __cxl_pci_mbox_send_cmd(struct cxl_dev_state *cxlds,
>   	mbox_cmd->return_code =
>   		FIELD_GET(CXLDEV_MBOX_STATUS_RET_CODE_MASK, status_reg);
>   
> -	if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS) {
> +	/*
> +	 * Handle the background command in a synchronous manner.
> +	 *
> +	 * All other mailbox commands will serialize/queue on the mbox_mutex,
> +	 * which we currently hold. Furthermore this also guarantees that
> +	 * cxl_mbox_background_complete() checks are safe amongst each other,
> +	 * in that no new bg operation can occur in between.
> +	 *
> +	 * Background operations are timesliced in accordance with the nature
> +	 * of the command. In the event of timeout, the mailbox state is
> +	 * indeterminate until the next successful command submission and the
> +	 * driver can get back in sync with the hardware state.
> +	 */
> +	if (mbox_cmd->return_code == CXL_MBOX_CMD_RC_BACKGROUND) {
> +		u64 bg_status_reg;
> +		int i;
> +
> +		dev_dbg(dev, "Mailbox background operation (0x%04x) started\n",
> +			mbox_cmd->opcode);
> +
> +		for (i = 0; i < mbox_cmd->poll_count; i++) {
> +			int ret = wait_event_interruptible_timeout(
> +				mbox_wait, cxl_mbox_background_complete(cxlds),
> +				msecs_to_jiffies(mbox_cmd->poll_interval));
> +			if (ret > 0)
> +				break;
> +
> +			/* interrupted by a signal */
> +			if (ret < 0)
> +				return ret;
> +		}
> +
> +		if (!cxl_mbox_background_complete(cxlds)) {
> +			u64 md_status =
> +				readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
> +
> +			cxl_cmd_err(cxlds->dev, mbox_cmd, md_status,
> +				    "background timeout");
> +			return -ETIMEDOUT;
> +		}
> +
> +		bg_status_reg = readq(cxlds->regs.mbox +
> +				      CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
> +		mbox_cmd->return_code =
> +			FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_RC_MASK,
> +				  bg_status_reg);
> +		dev_dbg(dev,
> +			"Mailbox background operation (0x%04x) completed\n",
> +			mbox_cmd->opcode);
> +	}
> +
> +	if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS &&
> +	    mbox_cmd->return_code != CXL_MBOX_CMD_RC_BACKGROUND) {
>   		dev_dbg(dev, "Mailbox operation had an error: %s\n",
>   			cxl_mbox_cmd_rc2str(mbox_cmd));
>   		return 0; /* completed but caller must check return_code */
> @@ -224,6 +304,7 @@ static int cxl_pci_setup_mailbox(struct cxl_dev_state *cxlds)
>   	const int cap = readl(cxlds->regs.mbox + CXLDEV_MBOX_CAPS_OFFSET);
>   	unsigned long timeout;
>   	u64 md_status;
> +	int rc, irq;
>   
>   	timeout = jiffies + mbox_ready_timeout * HZ;
>   	do {
> @@ -272,6 +353,27 @@ static int cxl_pci_setup_mailbox(struct cxl_dev_state *cxlds)
>   	dev_dbg(cxlds->dev, "Mailbox payload sized %zu",
>   		cxlds->payload_size);
>   
> +	if (cap & CXLDEV_MBOX_CAP_BG_CMD_IRQ) {
> +		struct pci_dev *pdev = to_pci_dev(cxlds->dev);
> +
> +		irq = pci_irq_vector(pdev,
> +			     FIELD_GET(CXLDEV_MBOX_CAP_IRQ_MSGNUM_MASK, cap));
> +		if (irq < 0)
> +			goto mbox_poll;
> +
> +		rc = devm_request_irq(cxlds->dev, irq, cxl_pci_mbox_irq,
> +				      IRQF_SHARED, "mailbox", cxlds);
> +		if (rc)
> +			goto mbox_poll;
> +
> +		writel(CXLDEV_MBOX_CTRL_BG_CMD_IRQ,
> +		       cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);
> +
> +		return 0;
> +	}
> +
> +mbox_poll:
> +	dev_dbg(cxlds->dev, "Mailbox interrupts are unsupported");
>   	return 0;
>   }
>
Davidlohr Bueso April 28, 2023, 5:18 p.m. UTC | #4
On Fri, 28 Apr 2023, Dave Jiang wrote:

>>+static DECLARE_WAIT_QUEUE_HEAD(mbox_wait);
>
>I apologize if I've asked this before and you've already answered.
>What is the reason the mbox_wait a module global wq instead of a per
>device wq? Just thinking when you tear down a device, you may want to
>wake all pending for that device to clean up.

Yes, I agree that doing the wait per-device is better, and not only
for teardown reasons. By doing it globally, the queue reflects waits
from different devices, but the driver really has no control over
the order in which each device's waiter sits in the queue, so upon
a blind wake_up() it could perfectly well be that the first node in
the waitq is not the node for that device.

This all goes away with per-device, with the note that because of the
mbox_mutex there will only ever be a single waiter, so there is no
concept of a queue really.
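
As an illustration of that per-device direction, a sketch of the irq side
(this assumes a wait_queue_head_t named mbox_wait is added to struct
cxl_dev_state; that member does not exist in this version of the patch):

static irqreturn_t cxl_pci_mbox_irq(int irq, void *id)
{
	struct cxl_dev_state *cxlds = id;

	/* spurious or raced with hw: nothing to do */
	if (!cxl_mbox_background_complete(cxlds))
		return IRQ_HANDLED;

	/* at most one waiter, blocked in __cxl_pci_mbox_send_cmd() */
	wake_up(&cxlds->mbox_wait);
	return IRQ_HANDLED;
}

The poller would then wait on &cxlds->mbox_wait instead of the global queue.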

Thanks,
Davidlohr
Dave Jiang April 28, 2023, 9:04 p.m. UTC | #5
On 4/28/23 10:18 AM, Davidlohr Bueso wrote:
> On Fri, 28 Apr 2023, Dave Jiang wrote:
> 
>>> +static DECLARE_WAIT_QUEUE_HEAD(mbox_wait);
>>
>> I apologize if I've asked this before and you've already answered.
>> What is the reason the mbox_wait a module global wq instead of a per
>> device wq? Just thinking when you tear down a device, you may want to
>> wake all pending for that device to clean up.
> 
> Yes, I agree that doing the wait per-device is better, and not only
> for tear down reasons. By doing it globally, the queue reflects waits
> from different devices, but the driver really has no control about
> the order of the wait of each device is, so upon a blind wake_up(),
> it could perfectly well be that the first node in the waitq is not
> the node for that device.
> 
> This all goes away with per-device, with the note that because of the
> mbox_mutex there never will only ever be a single waiter, so no concept
> of a queue really.

If there's only a single waiter then I think a 'completion' can be used 
instead right?
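
For illustration, a sketch of the completion-based variant being suggested
(it assumes a struct completion named mbox_done in cxl_dev_state, re-armed
with reinit_completion() before the doorbell ring and completed from the
irq handler; all of that is hypothetical, not the posted code):

for (i = 0; i < mbox_cmd->poll_count; i++) {
	long rc = wait_for_completion_interruptible_timeout(
			&cxlds->mbox_done,
			msecs_to_jiffies(mbox_cmd->poll_interval));
	if (rc > 0)	/* completed by the irq handler */
		break;
	if (rc < 0)	/* interrupted by a signal */
		return rc;
	/* rc == 0: interval elapsed, fall back to checking the register */
	if (cxl_mbox_background_complete(cxlds))
		break;
}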

> 
> Thanks,
> Davidlohr
Davidlohr Bueso April 28, 2023, 10:03 p.m. UTC | #6
On Fri, 28 Apr 2023, Dave Jiang wrote:

>On 4/28/23 10:18 AM, Davidlohr Bueso wrote:
>>On Fri, 28 Apr 2023, Dave Jiang wrote:
>>
>>>>+static DECLARE_WAIT_QUEUE_HEAD(mbox_wait);
>>>
>>>I apologize if I've asked this before and you've already answered.
>>>What is the reason the mbox_wait a module global wq instead of a per
>>>device wq? Just thinking when you tear down a device, you may want to
>>>wake all pending for that device to clean up.
>>
>>Yes, I agree that doing the wait per-device is better, and not only
>>for tear down reasons. By doing it globally, the queue reflects waits
>>from different devices, but the driver really has no control about
>>the order of the wait of each device is, so upon a blind wake_up(),
>>it could perfectly well be that the first node in the waitq is not
>>the node for that device.
>>
>>This all goes away with per-device, with the note that because of the
>>mbox_mutex there never will only ever be a single waiter, so no concept
>>of a queue really.
>
>If there's only a single waiter then I think a 'completion' can be
>used instead right?

Well completions still use (simple) wq underneath. What we really
want is rcuwait semantics but there is currently no support for
schedule timeouts. INTERRUPTIBLE sleep was added for new users in
the past, so it could be doable.

Thanks,
Davidlohr
Davidlohr Bueso May 1, 2023, 3:56 p.m. UTC | #7
On Fri, 28 Apr 2023, Davidlohr Bueso wrote:

>On Fri, 28 Apr 2023, Dave Jiang wrote:
>
>>On 4/28/23 10:18 AM, Davidlohr Bueso wrote:
>>>On Fri, 28 Apr 2023, Dave Jiang wrote:
>>>
>>>>>+static DECLARE_WAIT_QUEUE_HEAD(mbox_wait);
>>>>
>>>>I apologize if I've asked this before and you've already answered.
>>>>What is the reason the mbox_wait a module global wq instead of a per
>>>>device wq? Just thinking when you tear down a device, you may want to
>>>>wake all pending for that device to clean up.
>>>
>>>Yes, I agree that doing the wait per-device is better, and not only
>>>for tear down reasons. By doing it globally, the queue reflects waits
>>>from different devices, but the driver really has no control about
>>>the order of the wait of each device is, so upon a blind wake_up(),
>>>it could perfectly well be that the first node in the waitq is not
>>>the node for that device.
>>>
>>>This all goes away with per-device, with the note that because of the
>>>mbox_mutex there never will only ever be a single waiter, so no concept
>>>of a queue really.
>>
>>If there's only a single waiter then I think a 'completion' can be
>>used instead right?
>
>Well completions still use (simple) wq underneath. What we really
>want is rcuwait semantics but there is currently no support for
>schedule timeouts. INTERRUPTIBLE sleep was added for new users in
>the past, so it could be doable.

Something like this, but I guess we could use completions for now and
convert to rcuwait later if we don't want to have to depend on sched
bits for this patch - and I'm sure there are a few other users out there
abusing queued wait for similar semantics that could make use of it.


diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h
index 8052d34da782..9e4759de228b 100644
--- a/include/linux/rcuwait.h
+++ b/include/linux/rcuwait.h
@@ -49,9 +49,9 @@ static inline void prepare_to_rcuwait(struct rcuwait *w)

  extern void finish_rcuwait(struct rcuwait *w);

-#define rcuwait_wait_event(w, condition, state)				\
+#define ___rcuwait_wait_event(w, condition, state, ret, cmd)		\
  ({									\
-	int __ret = 0;							\
+	long __ret = ret;						\
	prepare_to_rcuwait(w);						\
	for (;;) {							\
		/*							\
@@ -67,10 +67,27 @@ extern void finish_rcuwait(struct rcuwait *w);
			break;						\
		}							\
									\
-		schedule();						\
+		cmd;							\
	}								\
	finish_rcuwait(w);						\
	__ret;								\
  })

+#define rcuwait_wait_event(w, condition, state)				\
+	___rcuwait_wait_event(w, condition, state, 0, schedule())
+
+#define __rcuwait_wait_event_timeout(w, condition, state, timeout)	\
+	___rcuwait_wait_event(w, ___wait_cond_timeout(condition),	\
+			      state, timeout,				\
+			      __ret = schedule_timeout(__ret))
+
+#define rcuwait_wait_event_timeout(w, condition, state, timeout)	\
+({									\
+	long __ret = timeout;						\
+	if (!___wait_cond_timeout(condition))				\
+		__ret = __rcuwait_wait_event_timeout(w, condition,	\
+						     state, timeout);	\
+	__ret;								\
+})
+
  #endif /* _LINUX_RCUWAIT_H_ */
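
And a hedged sketch of how the mailbox poll loop could consume such a
primitive (assuming a 'struct rcuwait mbox_wait' member in cxl_dev_state
and an rcuwait_wake_up(&cxlds->mbox_wait) call from the irq handler;
neither is part of the posted patch):

for (i = 0; i < mbox_cmd->poll_count; i++) {
	long rc = rcuwait_wait_event_timeout(&cxlds->mbox_wait,
			cxl_mbox_background_complete(cxlds),
			TASK_INTERRUPTIBLE,
			msecs_to_jiffies(mbox_cmd->poll_interval));
	if (rc > 0)	/* condition became true */
		break;
	if (rc < 0)	/* -EINTR: interrupted by a signal */
		return rc;
	/* rc == 0: timed out, try the next interval */
}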
Jonathan Cameron May 11, 2023, 2:23 p.m. UTC | #8
On Fri, 21 Apr 2023 02:23:16 -0700
Davidlohr Bueso <dave@stgolabs.net> wrote:

> This adds support for handling background operations, as defined in
> the CXL 3.0 spec. Commands that can take too long (over ~2 seconds)
> can run in the background asynchronously (to the hardware).
> 
> The driver will deal with such commands synchronously, blocking all
> other incoming commands for a specified period of time, allowing
> time-slicing the command such that the caller can send incremental
> requests to avoid monopolizing the driver/device. This approach
> makes the code simpler, where any out of sync (timeout) between the
> driver and hardware is just disregarded as an invalid state until
> the next successful submission.
> 
> On devices where mbox interrupts are supported, this will still use
> a poller that will wakeup in the specified wait intervals. The irq
> handler will simply awake a blocked cmd, which is also safe vs a
> task that is either waking (timing out) or already awoken. Similarly
> any irq setup error during the probing falls back to polling, thus
> avoids unnecessarily erroring out.
> 
> Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
> ---
>  drivers/cxl/core/mbox.c |   3 +-
>  drivers/cxl/cxl.h       |   7 +++
>  drivers/cxl/cxlmem.h    |   5 ++
>  drivers/cxl/pci.c       | 104 +++++++++++++++++++++++++++++++++++++++-
>  4 files changed, 117 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 6198637cb0bb..cde7270c6037 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -180,7 +180,8 @@ int cxl_internal_send_cmd(struct cxl_dev_state *cxlds,
>  	if (rc)
>  		return rc;
>  
> -	if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS)
> +	if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS &&
> +	    mbox_cmd->return_code != CXL_MBOX_CMD_RC_BACKGROUND)
>  		return cxl_mbox_cmd_rc2errno(mbox_cmd);
>  
>  	if (!out_size)
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index 044a92d9813e..72731a896f58 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -176,14 +176,21 @@ static inline int ways_to_eiw(unsigned int ways, u8 *eiw)
>  /* CXL 2.0 8.2.8.4 Mailbox Registers */
>  #define CXLDEV_MBOX_CAPS_OFFSET 0x00
>  #define   CXLDEV_MBOX_CAP_PAYLOAD_SIZE_MASK GENMASK(4, 0)
> +#define   CXLDEV_MBOX_CAP_IRQ_MSGNUM_MASK GENMASK(10, 7)
> +#define   CXLDEV_MBOX_CAP_BG_CMD_IRQ BIT(6)

Numeric order of bits probably makes more sense. So move this up one line.

>  #define CXLDEV_MBOX_CTRL_OFFSET 0x04
>  #define   CXLDEV_MBOX_CTRL_DOORBELL BIT(0)
> +#define   CXLDEV_MBOX_CTRL_BG_CMD_IRQ BIT(2)
>  #define CXLDEV_MBOX_CMD_OFFSET 0x08
>  #define   CXLDEV_MBOX_CMD_COMMAND_OPCODE_MASK GENMASK_ULL(15, 0)
>  #define   CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK GENMASK_ULL(36, 16)
>  #define CXLDEV_MBOX_STATUS_OFFSET 0x10
> +#define   CXLDEV_MBOX_STATUS_BG_CMD BIT(0)

Hmm. Oddly, the field is called Background Operation. Still, that is then
described as a command, so I guess this is a reasonable bit of consolidation
of naming.

>  #define   CXLDEV_MBOX_STATUS_RET_CODE_MASK GENMASK_ULL(47, 32)
>  #define CXLDEV_MBOX_BG_CMD_STATUS_OFFSET 0x18
> +#define   CXLDEV_MBOX_BG_CMD_COMMAND_OPCODE_MASK GENMASK_ULL(15, 0)

Obviously there are line-length disadvantages to including the status part
in the field names, and thankfully there isn't a BG_CMD_COMMAND register,
but I still find the absence of 'status' a bit inconsistent / confusing.
The initial instinct is that this is a field called OP_CODE in a register
called BG_CMD_COMMAND.

> +#define   CXLDEV_MBOX_BG_CMD_COMMAND_PCT_MASK GENMASK_ULL(22, 16)
> +#define   CXLDEV_MBOX_BG_CMD_COMMAND_RC_MASK GENMASK_ULL(47, 32)

It might be nice to do 'something' with the Vendor Specific extended
status, even if it is just putting it in a dev_dbg() for anyone who cares?

>  #define CXLDEV_MBOX_PAYLOAD_OFFSET 0x20
>  
>  /*
> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index 090acebba4fa..8c3302fc7738 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -108,6 +108,9 @@ static inline struct cxl_ep *cxl_ep_load(struct cxl_port *port,
>   *            variable sized output commands, it tells the exact number of bytes
>   *            written.
>   * @min_out: (input) internal command output payload size validation
> + * @poll_count: (input)  Number of timeouts to attempt.
> + * @poll_interval: (input) Number of ms between mailbox background command
> + *                 polling intervals timeouts.

name it poll_interval_ms: and the units become obvious everywhere without needing
comments.  Good for the lazy / forgetful reviewer if nothing else...

>   * @return_code: (output) Error code returned from hardware.
>   *
>   * This is the primary mechanism used to send commands to the hardware.
> @@ -123,6 +126,8 @@ struct cxl_mbox_cmd {
>  	size_t size_in;
>  	size_t size_out;
>  	size_t min_out;
> +	int poll_count;
> +	int poll_interval;
>  	u16 return_code;
>  };
>  
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 39b829a29f6c..aa1bb74a52a1 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -51,6 +51,7 @@
>  static unsigned short mbox_ready_timeout = 60;
>  module_param(mbox_ready_timeout, ushort, 0644);
>  MODULE_PARM_DESC(mbox_ready_timeout, "seconds to wait for mailbox ready");
> +static DECLARE_WAIT_QUEUE_HEAD(mbox_wait);

I see in the discussion you are moving to a per-device approach, so I won't
review that bit in this version.

>  
>  static int cxl_pci_mbox_wait_for_doorbell(struct cxl_dev_state *cxlds)
>  {
> @@ -85,6 +86,33 @@ static int cxl_pci_mbox_wait_for_doorbell(struct cxl_dev_state *cxlds)
>  			    status & CXLMDEV_DEV_FATAL ? " fatal" : "",        \
>  			    status & CXLMDEV_FW_HALT ? " firmware-halt" : "")
>  
> +static bool cxl_mbox_background_complete(struct cxl_dev_state *cxlds)
> +{
> +	u64 reg;
> +
> +	reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
> +	return FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_PCT_MASK, reg) == 100;

This is what motivated the comment on including _STATUS_ in the field names.
I briefly thought you had a field from the wrong register.

> +}
> +
> +static irqreturn_t cxl_pci_mbox_irq(int irq, void *id)
> +{
> +	struct cxl_dev_state *cxlds = id;
> +
> +	/* spurious or raced with hw? */

If talking about a race, I'd like a comment that gives more info.
What is the potential hardware race?

I can sort of see that polling might have noticed the completed command and
launched another one, all before the interrupt actually got handled.
If that's what you were thinking, then eat the interrupt without the
scary message.

> +	if (!cxl_mbox_background_complete(cxlds)) {
> +		struct pci_dev *pdev = to_pci_dev(cxlds->dev);
> +
> +		dev_warn(&pdev->dev,
> +			 "Mailbox background operation IRQ but incomplete\n");
> +		goto done;
> +	}
> +
> +	/* short-circuit the wait in __cxl_pci_mbox_send_cmd() */
> +	wake_up(&mbox_wait);
> +done:
> +	return IRQ_HANDLED;
> +}
> +
>  /**
>   * __cxl_pci_mbox_send_cmd() - Execute a mailbox command
>   * @cxlds: The device state to communicate with.
> @@ -178,7 +206,59 @@ static int __cxl_pci_mbox_send_cmd(struct cxl_dev_state *cxlds,
>  	mbox_cmd->return_code =
>  		FIELD_GET(CXLDEV_MBOX_STATUS_RET_CODE_MASK, status_reg);
>  
> -	if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS) {
> +	/*
> +	 * Handle the background command in a synchronous manner.
> +	 *
> +	 * All other mailbox commands will serialize/queue on the mbox_mutex,
> +	 * which we currently hold. Furthermore this also guarantees that
> +	 * cxl_mbox_background_complete() checks are safe amongst each other,
> +	 * in that no new bg operation can occur in between.
> +	 *
> +	 * Background operations are timesliced in accordance with the nature
> +	 * of the command. In the event of timeout, the mailbox state is
> +	 * indeterminate until the next successful command submission and the
> +	 * driver can get back in sync with the hardware state.
> +	 */
> +	if (mbox_cmd->return_code == CXL_MBOX_CMD_RC_BACKGROUND) {
> +		u64 bg_status_reg;
> +		int i;
> +
> +		dev_dbg(dev, "Mailbox background operation (0x%04x) started\n",
> +			mbox_cmd->opcode);
> +
> +		for (i = 0; i < mbox_cmd->poll_count; i++) {
> +			int ret = wait_event_interruptible_timeout(

We already have an rc floating around in here.  Having ret as well with more limited
scope isn't great for readability.  I think you can just use rc here.

> +				mbox_wait, cxl_mbox_background_complete(cxlds),
> +				msecs_to_jiffies(mbox_cmd->poll_interval));
> +			if (ret > 0)
> +				break;
> +
I'd drop this blank line to keep all the handling of ret in one block of code.
It looks a bit too separate to me otherwise.

> +			/* interrupted by a signal */
> +			if (ret < 0)
> +				return ret;
> +		}
> +
> +		if (!cxl_mbox_background_complete(cxlds)) {
> +			u64 md_status =
> +				readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
> +
> +			cxl_cmd_err(cxlds->dev, mbox_cmd, md_status,
> +				    "background timeout");

Why are we interested in the memory device status at this point? A timeout on
a background command isn't really a cxl_cmd_err(), in my head at least.

> +			return -ETIMEDOUT;
> +		}
> +
> +		bg_status_reg = readq(cxlds->regs.mbox +
> +				      CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
> +		mbox_cmd->return_code =
> +			FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_RC_MASK,
> +				  bg_status_reg);
> +		dev_dbg(dev,
> +			"Mailbox background operation (0x%04x) completed\n",
> +			mbox_cmd->opcode);

Here is where I'd like to also log the vendor specific extended status.
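
For illustration, that could look something like the sketch below (it
assumes the vendor specific extended status sits in bits 63:48 of the
Background Command Status register; the mask is not defined by this patch):

#define CXLDEV_MBOX_BG_CMD_COMMAND_VENDOR_MASK GENMASK_ULL(63, 48)

dev_dbg(dev,
	"Mailbox background operation (0x%04x) completed, vendor status: 0x%04llx\n",
	mbox_cmd->opcode,
	FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_VENDOR_MASK, bg_status_reg));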

> +	}
> +
> +	if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS &&
> +	    mbox_cmd->return_code != CXL_MBOX_CMD_RC_BACKGROUND) {

You overrode the original return code with the background command return code. So if you get
here and it's still RC_BACKGROUND I think something went wrong.

>  		dev_dbg(dev, "Mailbox operation had an error: %s\n",
>  			cxl_mbox_cmd_rc2str(mbox_cmd));
>  		return 0; /* completed but caller must check return_code */
> @@ -224,6 +304,7 @@ static int cxl_pci_setup_mailbox(struct cxl_dev_state *cxlds)
>  	const int cap = readl(cxlds->regs.mbox + CXLDEV_MBOX_CAPS_OFFSET);
>  	unsigned long timeout;
>  	u64 md_status;
> +	int rc, irq;
>  
>  	timeout = jiffies + mbox_ready_timeout * HZ;
>  	do {
> @@ -272,6 +353,27 @@ static int cxl_pci_setup_mailbox(struct cxl_dev_state *cxlds)
>  	dev_dbg(cxlds->dev, "Mailbox payload sized %zu",
>  		cxlds->payload_size);
>  
> +	if (cap & CXLDEV_MBOX_CAP_BG_CMD_IRQ) {
> +		struct pci_dev *pdev = to_pci_dev(cxlds->dev);
> +
> +		irq = pci_irq_vector(pdev,
> +			     FIELD_GET(CXLDEV_MBOX_CAP_IRQ_MSGNUM_MASK, cap));
> +		if (irq < 0)
> +			goto mbox_poll;
> +
> +		rc = devm_request_irq(cxlds->dev, irq, cxl_pci_mbox_irq,
> +				      IRQF_SHARED, "mailbox", cxlds);
> +		if (rc)
> +			goto mbox_poll;

Hmm. The old argument of whether to carry on when something unexpected happens.
I'd argue that in this case, and possibly the one above, we should fail hard,
as we really want to know if interrupt allocations are failing, not just
fall back quietly to polling. I'd rather fail to probe the driver in a fashion
that lets us figure out what broke.
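
i.e., roughly the following (a sketch of the fail-hard alternative, not
what this patch does):

rc = devm_request_irq(cxlds->dev, irq, cxl_pci_mbox_irq,
		      IRQF_SHARED, "mailbox", cxlds);
if (rc) {
	dev_err(cxlds->dev, "Failed to request mailbox irq: %d\n", rc);
	return rc;	/* surface the broken irq setup instead of polling */
}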

> +
> +		writel(CXLDEV_MBOX_CTRL_BG_CMD_IRQ,
> +		       cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);
> +
> +		return 0;
> +	}
> +
> +mbox_poll:
> +	dev_dbg(cxlds->dev, "Mailbox interrupts are unsupported");
>  	return 0;
>  }
>
Davidlohr Bueso May 11, 2023, 4:04 p.m. UTC | #9
On Thu, 11 May 2023, Jonathan Cameron wrote:

>> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
>> index 044a92d9813e..72731a896f58 100644
>> --- a/drivers/cxl/cxl.h
>> +++ b/drivers/cxl/cxl.h
>> @@ -176,14 +176,21 @@ static inline int ways_to_eiw(unsigned int ways, u8 *eiw)
>>  /* CXL 2.0 8.2.8.4 Mailbox Registers */
>>  #define CXLDEV_MBOX_CAPS_OFFSET 0x00
>>  #define   CXLDEV_MBOX_CAP_PAYLOAD_SIZE_MASK GENMASK(4, 0)
>> +#define   CXLDEV_MBOX_CAP_IRQ_MSGNUM_MASK GENMASK(10, 7)
>> +#define   CXLDEV_MBOX_CAP_BG_CMD_IRQ BIT(6)
>
>Numeric order of bits probably makes more sense. So move this up one line.

Sure.

>
>>  #define CXLDEV_MBOX_CTRL_OFFSET 0x04
>>  #define   CXLDEV_MBOX_CTRL_DOORBELL BIT(0)
>> +#define   CXLDEV_MBOX_CTRL_BG_CMD_IRQ BIT(2)
>>  #define CXLDEV_MBOX_CMD_OFFSET 0x08
>>  #define   CXLDEV_MBOX_CMD_COMMAND_OPCODE_MASK GENMASK_ULL(15, 0)
>>  #define   CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK GENMASK_ULL(36, 16)
>>  #define CXLDEV_MBOX_STATUS_OFFSET 0x10
>> +#define   CXLDEV_MBOX_STATUS_BG_CMD BIT(0)
>
>Hmm. Oddly field is called Background Operation.  Still that is then
>described as a command so I guess this is a reasonable bit of consolidation
>of naming.

Yes, I've found that the spec loosely mixes both terms.

>
>>  #define   CXLDEV_MBOX_STATUS_RET_CODE_MASK GENMASK_ULL(47, 32)
>>  #define CXLDEV_MBOX_BG_CMD_STATUS_OFFSET 0x18
>> +#define   CXLDEV_MBOX_BG_CMD_COMMAND_OPCODE_MASK GENMASK_ULL(15, 0)
>
>Obviously there are line length disadvantages to includes the status part
>in the field names and there isn't a BG_CMD_COMMAND register thankfully
>but I still find the absence of status a bit inconsistent / confusing.
>initial instinct is this a field called OP_CODE in a register called
>BG_CMD_COMMAND
>
>> +#define   CXLDEV_MBOX_BG_CMD_COMMAND_PCT_MASK GENMASK_ULL(22, 16)
>> +#define   CXLDEV_MBOX_BG_CMD_COMMAND_RC_MASK GENMASK_ULL(47, 32)
>
>It might be nice to to do 'something' with the Vendor Specific extended
>status even if it is just put it in a dev_dbg() for anyone who cares?

I had previously considered something like that, I'll add it in the
next iteration.

>
>>  #define CXLDEV_MBOX_PAYLOAD_OFFSET 0x20
>>
>>  /*
>> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
>> index 090acebba4fa..8c3302fc7738 100644
>> --- a/drivers/cxl/cxlmem.h
>> +++ b/drivers/cxl/cxlmem.h
>> @@ -108,6 +108,9 @@ static inline struct cxl_ep *cxl_ep_load(struct cxl_port *port,
>>   *            variable sized output commands, it tells the exact number of bytes
>>   *            written.
>>   * @min_out: (input) internal command output payload size validation
>> + * @poll_count: (input)  Number of timeouts to attempt.
>> + * @poll_interval: (input) Number of ms between mailbox background command
>> + *                 polling intervals timeouts.
>
>name it poll_interval_ms: and the units become obvious everywhere without needing
>comments.  Good for the lazy / forgetful reviewer if nothing else...

Makes sense.

>
>>   * @return_code: (output) Error code returned from hardware.
>>   *
>>   * This is the primary mechanism used to send commands to the hardware.
>> @@ -123,6 +126,8 @@ struct cxl_mbox_cmd {
>>	size_t size_in;
>>	size_t size_out;
>>	size_t min_out;
>> +	int poll_count;
>> +	int poll_interval;
>>	u16 return_code;
>>  };
>>
>> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
>> index 39b829a29f6c..aa1bb74a52a1 100644
>> --- a/drivers/cxl/pci.c
>> +++ b/drivers/cxl/pci.c
>> @@ -51,6 +51,7 @@
>>  static unsigned short mbox_ready_timeout = 60;
>>  module_param(mbox_ready_timeout, ushort, 0644);
>>  MODULE_PARM_DESC(mbox_ready_timeout, "seconds to wait for mailbox ready");
>> +static DECLARE_WAIT_QUEUE_HEAD(mbox_wait);
>
>I see in discussion you are moving to a per device approach so I won't review
>that bit on this version.

Right, fyi the latest version is here:

https://lore.kernel.org/linux-cxl/gtvozgdx2ak7tekc3heczk5g7gj3cwuoptez6tjmkecader4lo@7t2em7rclcxn/

...
>> +static irqreturn_t cxl_pci_mbox_irq(int irq, void *id)
>> +{
>> +	struct cxl_dev_state *cxlds = id;
>> +
>> +	/* spurious or raced with hw? */
>
>If talking about a race, I'd like a comment that gives more info.
>What is the potential hardware race?
>
>I can sort of see polling might have noticed completed command and
>launched another one, all before the interrupt actually got handled.
>If that's what you were thinking then eat the interrupt without the
>scary message.

Yes, that's the raced-with-hw motivation, and the warning below can provide
insightful debug info. That said, the kernel/irq core considers spurious
irqs a reality, so yeah, we should not be printing any message.

I'll get rid of it.

>
>> +	if (!cxl_mbox_background_complete(cxlds)) {
>> +		struct pci_dev *pdev = to_pci_dev(cxlds->dev);
>> +
>> +		dev_warn(&pdev->dev,
>> +			 "Mailbox background operation IRQ but incomplete\n");
>> +		goto done;
>> +	}
>> +
>> +	/* short-circuit the wait in __cxl_pci_mbox_send_cmd() */
>> +	wake_up(&mbox_wait);
>> +done:
>> +	return IRQ_HANDLED;
>> +}
>> +
>>  /**
>>   * __cxl_pci_mbox_send_cmd() - Execute a mailbox command
>>   * @cxlds: The device state to communicate with.
>> @@ -178,7 +206,59 @@ static int __cxl_pci_mbox_send_cmd(struct cxl_dev_state *cxlds,
>>	mbox_cmd->return_code =
>>		FIELD_GET(CXLDEV_MBOX_STATUS_RET_CODE_MASK, status_reg);
>>
>> -	if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS) {
>> +	/*
>> +	 * Handle the background command in a synchronous manner.
>> +	 *
>> +	 * All other mailbox commands will serialize/queue on the mbox_mutex,
>> +	 * which we currently hold. Furthermore this also guarantees that
>> +	 * cxl_mbox_background_complete() checks are safe amongst each other,
>> +	 * in that no new bg operation can occur in between.
>> +	 *
>> +	 * Background operations are timesliced in accordance with the nature
>> +	 * of the command. In the event of timeout, the mailbox state is
>> +	 * indeterminate until the next successful command submission and the
>> +	 * driver can get back in sync with the hardware state.
>> +	 */
>> +	if (mbox_cmd->return_code == CXL_MBOX_CMD_RC_BACKGROUND) {
>> +		u64 bg_status_reg;
>> +		int i;
>> +
>> +		dev_dbg(dev, "Mailbox background operation (0x%04x) started\n",
>> +			mbox_cmd->opcode);
>> +
>> +		for (i = 0; i < mbox_cmd->poll_count; i++) {
>> +			int ret = wait_event_interruptible_timeout(
>
>We already have an rc floating around in here.  Having ret as well with more limited
>scope isn't great for readability.  I think you can just use rc here.

I was thinking the same, I can rename it 'wait_ret' (it also ends up being a long in
the latest version).

>
>> +				mbox_wait, cxl_mbox_background_complete(cxlds),
>> +				msecs_to_jiffies(mbox_cmd->poll_interval));
>> +			if (ret > 0)
>> +				break;
>> +
>I'd drop this blank line to keep all the handling of ret in one block of code.
>It looks a bit too separate to me otherwise.
>
>> +			/* interrupted by a signal */
>> +			if (ret < 0)
>> +				return ret;
>> +		}
>> +
>> +		if (!cxl_mbox_background_complete(cxlds)) {
>> +			u64 md_status =
>> +				readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
>> +
>> +			cxl_cmd_err(cxlds->dev, mbox_cmd, md_status,
>> +				    "background timeout");
>
>Why are we interested in the memory device status at this point? A timeout on background
>command isn't really a cxl_cmd_err() in my head at least.

Will revisit.

>
>> +			return -ETIMEDOUT;
>> +		}
>> +
>> +		bg_status_reg = readq(cxlds->regs.mbox +
>> +				      CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
>> +		mbox_cmd->return_code =
>> +			FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_RC_MASK,
>> +				  bg_status_reg);
>> +		dev_dbg(dev,
>> +			"Mailbox background operation (0x%04x) completed\n",
>> +			mbox_cmd->opcode);
>
>Here is where I'd like to also log the vendor specific extended status.

ok

>
>> +	}
>> +
>> +	if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS &&
>> +	    mbox_cmd->return_code != CXL_MBOX_CMD_RC_BACKGROUND) {
>
>You overrode the original return code with the background command return code. So if you get
>here and it's still RC_BACKGROUND I think something went wrong.

Yes, Ming had previously brought this up and it is addressed in the latest version.

>
>>		dev_dbg(dev, "Mailbox operation had an error: %s\n",
>>			cxl_mbox_cmd_rc2str(mbox_cmd));
>>		return 0; /* completed but caller must check return_code */
>> @@ -224,6 +304,7 @@ static int cxl_pci_setup_mailbox(struct cxl_dev_state *cxlds)
>>	const int cap = readl(cxlds->regs.mbox + CXLDEV_MBOX_CAPS_OFFSET);
>>	unsigned long timeout;
>>	u64 md_status;
>> +	int rc, irq;
>>
>>	timeout = jiffies + mbox_ready_timeout * HZ;
>>	do {
>> @@ -272,6 +353,27 @@ static int cxl_pci_setup_mailbox(struct cxl_dev_state *cxlds)
>>	dev_dbg(cxlds->dev, "Mailbox payload sized %zu",
>>		cxlds->payload_size);
>>
>> +	if (cap & CXLDEV_MBOX_CAP_BG_CMD_IRQ) {
>> +		struct pci_dev *pdev = to_pci_dev(cxlds->dev);
>> +
>> +		irq = pci_irq_vector(pdev,
>> +			     FIELD_GET(CXLDEV_MBOX_CAP_IRQ_MSGNUM_MASK, cap));
>> +		if (irq < 0)
>> +			goto mbox_poll;
>> +
>> +		rc = devm_request_irq(cxlds->dev, irq, cxl_pci_mbox_irq,
>> +				      IRQF_SHARED, "mailbox", cxlds);
>> +		if (rc)
>> +			goto mbox_poll;
>
>Hmm. The old argument of whether to carry on when something unexpected happens.

Well, yes and no. The reason I am very tolerant of errors here is that the
background cmd polling will be done regardless of the device's interrupt
capability. So I find it way too harsh to just fail the probe altogether
when effectively no harm is done.

>I'd argue in this case at least and possibly the one above we should fail
>hard as we really want to know if interrupt allocations are failing, not just
>fall back quietly to polling.  I'd rather fail to probe the driver in a fashion
>that lets us figure out what broke.
>
>> +
>> +		writel(CXLDEV_MBOX_CTRL_BG_CMD_IRQ,
>> +		       cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);
>> +
>> +		return 0;
>> +	}
>> +
>> +mbox_poll:
>> +	dev_dbg(cxlds->dev, "Mailbox interrupts are unsupported");
>>	return 0;
>>  }
>>

Thanks for reviewing!
Jonathan Cameron May 12, 2023, 5:05 p.m. UTC | #10
> >  
> >>   * @return_code: (output) Error code returned from hardware.
> >>   *
> >>   * This is the primary mechanism used to send commands to the hardware.
> >> @@ -123,6 +126,8 @@ struct cxl_mbox_cmd {
> >>	size_t size_in;
> >>	size_t size_out;
> >>	size_t min_out;
> >> +	int poll_count;
> >> +	int poll_interval;
> >>	u16 return_code;
> >>  };
> >>
> >> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> >> index 39b829a29f6c..aa1bb74a52a1 100644
> >> --- a/drivers/cxl/pci.c
> >> +++ b/drivers/cxl/pci.c
> >> @@ -51,6 +51,7 @@
> >>  static unsigned short mbox_ready_timeout = 60;
> >>  module_param(mbox_ready_timeout, ushort, 0644);
> >>  MODULE_PARM_DESC(mbox_ready_timeout, "seconds to wait for mailbox ready");
> >> +static DECLARE_WAIT_QUEUE_HEAD(mbox_wait);  
> >
> >I see in discussion you are moving to a per device approach so I won't review
> >that bit on this version.  
> 
> Right, fyi the latest vesion is here:
> 

> https://lore.kernel.org/linux-cxl/gtvozgdx2ak7tekc3heczk5g7gj3cwuoptez6tjmkecader4lo@7t2em7rclcxn/
I'll pretend I'll look at that :)
(81 more emails to read on CXL alone... sigh)


> >  
> >>		dev_dbg(dev, "Mailbox operation had an error: %s\n",
> >>			cxl_mbox_cmd_rc2str(mbox_cmd));
> >>		return 0; /* completed but caller must check return_code */
> >> @@ -224,6 +304,7 @@ static int cxl_pci_setup_mailbox(struct cxl_dev_state *cxlds)
> >>	const int cap = readl(cxlds->regs.mbox + CXLDEV_MBOX_CAPS_OFFSET);
> >>	unsigned long timeout;
> >>	u64 md_status;
> >> +	int rc, irq;
> >>
> >>	timeout = jiffies + mbox_ready_timeout * HZ;
> >>	do {
> >> @@ -272,6 +353,27 @@ static int cxl_pci_setup_mailbox(struct cxl_dev_state *cxlds)
> >>	dev_dbg(cxlds->dev, "Mailbox payload sized %zu",
> >>		cxlds->payload_size);
> >>
> >> +	if (cap & CXLDEV_MBOX_CAP_BG_CMD_IRQ) {
> >> +		struct pci_dev *pdev = to_pci_dev(cxlds->dev);
> >> +
> >> +		irq = pci_irq_vector(pdev,
> >> +			     FIELD_GET(CXLDEV_MBOX_CAP_IRQ_MSGNUM_MASK, cap));
> >> +		if (irq < 0)
> >> +			goto mbox_poll;
> >> +
> >> +		rc = devm_request_irq(cxlds->dev, irq, cxl_pci_mbox_irq,
> >> +				      IRQF_SHARED, "mailbox", cxlds);
> >> +		if (rc)
> >> +			goto mbox_poll;  
> >
> >Hmm. The old argument of whether to carry on when something unexpected happens.  
> 
> Well yes and no. The reason I am very tolerant upon errors here is that the
> background cmd polling will be done regardless of the device's interrupt
> capability. So I find it way too harsh to just fail the probe altogether
> when effectively no harm is done.

We'll never teach people to do things right if their broken config works anyway! :)

Patch

diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 6198637cb0bb..cde7270c6037 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -180,7 +180,8 @@  int cxl_internal_send_cmd(struct cxl_dev_state *cxlds,
 	if (rc)
 		return rc;
 
-	if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS)
+	if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS &&
+	    mbox_cmd->return_code != CXL_MBOX_CMD_RC_BACKGROUND)
 		return cxl_mbox_cmd_rc2errno(mbox_cmd);
 
 	if (!out_size)
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 044a92d9813e..72731a896f58 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -176,14 +176,21 @@  static inline int ways_to_eiw(unsigned int ways, u8 *eiw)
 /* CXL 2.0 8.2.8.4 Mailbox Registers */
 #define CXLDEV_MBOX_CAPS_OFFSET 0x00
 #define   CXLDEV_MBOX_CAP_PAYLOAD_SIZE_MASK GENMASK(4, 0)
+#define   CXLDEV_MBOX_CAP_IRQ_MSGNUM_MASK GENMASK(10, 7)
+#define   CXLDEV_MBOX_CAP_BG_CMD_IRQ BIT(6)
 #define CXLDEV_MBOX_CTRL_OFFSET 0x04
 #define   CXLDEV_MBOX_CTRL_DOORBELL BIT(0)
+#define   CXLDEV_MBOX_CTRL_BG_CMD_IRQ BIT(2)
 #define CXLDEV_MBOX_CMD_OFFSET 0x08
 #define   CXLDEV_MBOX_CMD_COMMAND_OPCODE_MASK GENMASK_ULL(15, 0)
 #define   CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK GENMASK_ULL(36, 16)
 #define CXLDEV_MBOX_STATUS_OFFSET 0x10
+#define   CXLDEV_MBOX_STATUS_BG_CMD BIT(0)
 #define   CXLDEV_MBOX_STATUS_RET_CODE_MASK GENMASK_ULL(47, 32)
 #define CXLDEV_MBOX_BG_CMD_STATUS_OFFSET 0x18
+#define   CXLDEV_MBOX_BG_CMD_COMMAND_OPCODE_MASK GENMASK_ULL(15, 0)
+#define   CXLDEV_MBOX_BG_CMD_COMMAND_PCT_MASK GENMASK_ULL(22, 16)
+#define   CXLDEV_MBOX_BG_CMD_COMMAND_RC_MASK GENMASK_ULL(47, 32)
 #define CXLDEV_MBOX_PAYLOAD_OFFSET 0x20
 
 /*
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 090acebba4fa..8c3302fc7738 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -108,6 +108,9 @@  static inline struct cxl_ep *cxl_ep_load(struct cxl_port *port,
  *            variable sized output commands, it tells the exact number of bytes
  *            written.
  * @min_out: (input) internal command output payload size validation
+ * @poll_count: (input)  Number of timeouts to attempt.
+ * @poll_interval: (input) Number of ms between mailbox background command
+ *                 polling intervals timeouts.
  * @return_code: (output) Error code returned from hardware.
  *
  * This is the primary mechanism used to send commands to the hardware.
@@ -123,6 +126,8 @@  struct cxl_mbox_cmd {
 	size_t size_in;
 	size_t size_out;
 	size_t min_out;
+	int poll_count;
+	int poll_interval;
 	u16 return_code;
 };
 
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 39b829a29f6c..aa1bb74a52a1 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -51,6 +51,7 @@ 
 static unsigned short mbox_ready_timeout = 60;
 module_param(mbox_ready_timeout, ushort, 0644);
 MODULE_PARM_DESC(mbox_ready_timeout, "seconds to wait for mailbox ready");
+static DECLARE_WAIT_QUEUE_HEAD(mbox_wait);
 
 static int cxl_pci_mbox_wait_for_doorbell(struct cxl_dev_state *cxlds)
 {
@@ -85,6 +86,33 @@  static int cxl_pci_mbox_wait_for_doorbell(struct cxl_dev_state *cxlds)
 			    status & CXLMDEV_DEV_FATAL ? " fatal" : "",        \
 			    status & CXLMDEV_FW_HALT ? " firmware-halt" : "")
 
+static bool cxl_mbox_background_complete(struct cxl_dev_state *cxlds)
+{
+	u64 reg;
+
+	reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
+	return FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_PCT_MASK, reg) == 100;
+}
+
+static irqreturn_t cxl_pci_mbox_irq(int irq, void *id)
+{
+	struct cxl_dev_state *cxlds = id;
+
+	/* spurious or raced with hw? */
+	if (!cxl_mbox_background_complete(cxlds)) {
+		struct pci_dev *pdev = to_pci_dev(cxlds->dev);
+
+		dev_warn(&pdev->dev,
+			 "Mailbox background operation IRQ but incomplete\n");
+		goto done;
+	}
+
+	/* short-circuit the wait in __cxl_pci_mbox_send_cmd() */
+	wake_up(&mbox_wait);
+done:
+	return IRQ_HANDLED;
+}
+
 /**
  * __cxl_pci_mbox_send_cmd() - Execute a mailbox command
  * @cxlds: The device state to communicate with.
@@ -178,7 +206,59 @@  static int __cxl_pci_mbox_send_cmd(struct cxl_dev_state *cxlds,
 	mbox_cmd->return_code =
 		FIELD_GET(CXLDEV_MBOX_STATUS_RET_CODE_MASK, status_reg);
 
-	if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS) {
+	/*
+	 * Handle the background command in a synchronous manner.
+	 *
+	 * All other mailbox commands will serialize/queue on the mbox_mutex,
+	 * which we currently hold. Furthermore this also guarantees that
+	 * cxl_mbox_background_complete() checks are safe amongst each other,
+	 * in that no new bg operation can occur in between.
+	 *
+	 * Background operations are timesliced in accordance with the nature
+	 * of the command. In the event of timeout, the mailbox state is
+	 * indeterminate until the next successful command submission and the
+	 * driver can get back in sync with the hardware state.
+	 */
+	if (mbox_cmd->return_code == CXL_MBOX_CMD_RC_BACKGROUND) {
+		u64 bg_status_reg;
+		int i;
+
+		dev_dbg(dev, "Mailbox background operation (0x%04x) started\n",
+			mbox_cmd->opcode);
+
+		for (i = 0; i < mbox_cmd->poll_count; i++) {
+			int ret = wait_event_interruptible_timeout(
+				mbox_wait, cxl_mbox_background_complete(cxlds),
+				msecs_to_jiffies(mbox_cmd->poll_interval));
+			if (ret > 0)
+				break;
+
+			/* interrupted by a signal */
+			if (ret < 0)
+				return ret;
+		}
+
+		if (!cxl_mbox_background_complete(cxlds)) {
+			u64 md_status =
+				readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
+
+			cxl_cmd_err(cxlds->dev, mbox_cmd, md_status,
+				    "background timeout");
+			return -ETIMEDOUT;
+		}
+
+		bg_status_reg = readq(cxlds->regs.mbox +
+				      CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
+		mbox_cmd->return_code =
+			FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_RC_MASK,
+				  bg_status_reg);
+		dev_dbg(dev,
+			"Mailbox background operation (0x%04x) completed\n",
+			mbox_cmd->opcode);
+	}
+
+	if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS &&
+	    mbox_cmd->return_code != CXL_MBOX_CMD_RC_BACKGROUND) {
 		dev_dbg(dev, "Mailbox operation had an error: %s\n",
 			cxl_mbox_cmd_rc2str(mbox_cmd));
 		return 0; /* completed but caller must check return_code */
@@ -224,6 +304,7 @@  static int cxl_pci_setup_mailbox(struct cxl_dev_state *cxlds)
 	const int cap = readl(cxlds->regs.mbox + CXLDEV_MBOX_CAPS_OFFSET);
 	unsigned long timeout;
 	u64 md_status;
+	int rc, irq;
 
 	timeout = jiffies + mbox_ready_timeout * HZ;
 	do {
@@ -272,6 +353,27 @@  static int cxl_pci_setup_mailbox(struct cxl_dev_state *cxlds)
 	dev_dbg(cxlds->dev, "Mailbox payload sized %zu",
 		cxlds->payload_size);
 
+	if (cap & CXLDEV_MBOX_CAP_BG_CMD_IRQ) {
+		struct pci_dev *pdev = to_pci_dev(cxlds->dev);
+
+		irq = pci_irq_vector(pdev,
+			     FIELD_GET(CXLDEV_MBOX_CAP_IRQ_MSGNUM_MASK, cap));
+		if (irq < 0)
+			goto mbox_poll;
+
+		rc = devm_request_irq(cxlds->dev, irq, cxl_pci_mbox_irq,
+				      IRQF_SHARED, "mailbox", cxlds);
+		if (rc)
+			goto mbox_poll;
+
+		writel(CXLDEV_MBOX_CTRL_BG_CMD_IRQ,
+		       cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);
+
+		return 0;
+	}
+
+mbox_poll:
+	dev_dbg(cxlds->dev, "Mailbox interrupts are unsupported");
 	return 0;
 }