diff mbox series

[v2,06/10] vhost-scsi: cache log buffer in I/O queue vhost_scsi_cmd

Message ID 20250317235546.4546-7-dongli.zhang@oracle.com (mailing list archive)
State New
Headers show
Series vhost-scsi: log write descriptors for live migration (and three bugfix) | expand

Commit Message

Dongli Zhang March 17, 2025, 11:55 p.m. UTC
The vhost-scsi I/O queue uses vhost_scsi_cmd. Allocate the log buffer
during vhost_scsi_cmd allocation or when VHOST_F_LOG_ALL is set. Free the
log buffer when vhost_scsi_cmd is reclaimed or when VHOST_F_LOG_ALL is
removed.

Fail vhost_scsi_set_endpoint or vhost_scsi_set_features() on allocation
failure.

The cached log buffer will be uses in upcoming patches to log write
descriptors for the I/O queue. The core idea is to cache the log in the
per-command log buffer in the submission path, and use them to log write
descriptors in the completion path.

As a reminder, currently QEMU's vhost-scsi VHOST_SET_FEATURES handler
doesn't process the failure gracefully. Instead, it crashes immediately on
failure from VHOST_SET_FEATURES.

Suggested-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
---
Changed since v1:
  - Don't allocate log buffer during initialization. Allocate during
    VHOST_SET_FEATURES or VHOST_SCSI_SET_ENDPOINT.

 drivers/vhost/scsi.c | 126 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 126 insertions(+)

Comments

Dongli Zhang March 18, 2025, 12:04 a.m. UTC | #1
Hi Mike,

On 3/17/25 4:55 PM, Dongli Zhang wrote:
> The vhost-scsi I/O queue uses vhost_scsi_cmd. Allocate the log buffer
> during vhost_scsi_cmd allocation or when VHOST_F_LOG_ALL is set. Free the
> log buffer when vhost_scsi_cmd is reclaimed or when VHOST_F_LOG_ALL is
> removed.
> 
> Fail vhost_scsi_set_endpoint or vhost_scsi_set_features() on allocation
> failure.
> 
> The cached log buffer will be uses in upcoming patches to log write
> descriptors for the I/O queue. The core idea is to cache the log in the
> per-command log buffer in the submission path, and use them to log write
> descriptors in the completion path.
> 
> As a reminder, currently QEMU's vhost-scsi VHOST_SET_FEATURES handler
> doesn't process the failure gracefully. Instead, it crashes immediately on
> failure from VHOST_SET_FEATURES.
> 

We have discussed the allocation of log buffer at:

https://lore.kernel.org/all/b058d4c6-f8cf-456b-aa60-8a8ccedb277e@oracle.com/

This patchset allocate and free log buffer during
VHOST_SET_FEATURES/VHOST_SCSI_SET_ENDPOINT.

Unfortunately, QEMU's VHOST_SET_FEATURES handler may crash QEMU if there is
error from VHOST_SET_FEATURES (i.e. -ENOMEM).

From user's perspective, it is better to never start VHOST_F_LOG_ALL if
memory isn't enough. However, that requires change at QEMU side.


I have another implementation by combining PATCH 06 and PATCH 07 in this
patchset.

The core idea is to allocate only once for each cmd. In addition, don't
free log buffer unless VHOST_F_LOG_ALL is removed, or during
VHOST_SCSI_SET_ENDPOINT when all commands are destroyed.

It returns SAM_STAT_TASK_SET_FULL to guest when allocation is failed.

-------------------------------

[PATCH v2 6/9] vhost-scsi: log I/O queue write descriptors

Log write descriptors for the I/O queue, leveraging vhost_scsi_get_desc()
and vhost_get_vq_desc() to retrieve the array of write descriptors to
obtain the log buffer.

In addition, introduce a vhost-scsi specific function to log vring
descriptors. In this function, the 'partial' argument is set to false, and
the 'len' argument is set to 0, because vhost-scsi always logs all pages
shared by a vring descriptor. Add WARN_ON_ONCE() since vhost-scsi doesn't
support VIRTIO_F_ACCESS_PLATFORM.

The per-cmd log buffer is allocated on demand in the submission path after
VHOST_F_LOG_ALL is set. Return -ENOMEM on allocation failure, in order to
send SAM_STAT_TASK_SET_FULL to the guest.

It isn't reclaimed in the completion path. Instead, it is reclaimed when
VHOST_F_LOG_ALL is removed, or during VHOST_SCSI_SET_ENDPOINT when all
commands are destroyed.

Store the log buffer during the submission path and log it in the
completion path. Logging is also required in the error handling path of the
submission process.

Suggested-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
---
Changed since v1:
  - Don't allocate log buffer during initialization. Allocate only once for
    each command. Don't free until not used any longer.
  - Re-order if staments in vhost_scsi_log_write().
  - Log after vhost_scsi_send_status() as well.

 drivers/vhost/scsi.c | 108 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 105 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 3875967dee36..6ae4d7de9a5f 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -133,6 +133,11 @@ struct vhost_scsi_cmd {
 	struct se_cmd tvc_se_cmd;
 	/* Sense buffer that will be mapped into outgoing status */
 	unsigned char tvc_sense_buf[TRANSPORT_SENSE_BUFFER];
+	/*
+	 * Dirty write descriptors of this command.
+	 */
+	struct vhost_log *tvc_log;
+	unsigned int tvc_log_num;
 	/* Completed commands list, serviced from vhost worker thread */
 	struct llist_node tvc_completion_list;
 	/* Used to track inflight cmd */
@@ -362,6 +367,24 @@ static int vhost_scsi_check_prot_fabric_only(struct se_portal_group *se_tpg)
 	return tpg->tv_fabric_prot_type;
 }

+static void vhost_scsi_log_write(struct vhost_virtqueue *vq,
+				 struct vhost_log *log,
+				 unsigned int log_num)
+{
+	if (likely(!vhost_has_feature(vq, VHOST_F_LOG_ALL)))
+		return;
+
+	if (likely(!log_num || !log))
+		return;
+
+	/*
+	 * vhost-scsi doesn't support VIRTIO_F_ACCESS_PLATFORM.
+	 * No requirement for vq->iotlb case.
+	 */
+	WARN_ON_ONCE(unlikely(vq->iotlb));
+	vhost_log_write(vq, log, log_num, 0, false, NULL, 0);
+}
+
 static void vhost_scsi_release_cmd_res(struct se_cmd *se_cmd)
 {
 	struct vhost_scsi_cmd *tv_cmd = container_of(se_cmd,
@@ -660,6 +683,9 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
 		} else
 			pr_err("Faulted on virtio_scsi_cmd_resp\n");

+		vhost_scsi_log_write(cmd->tvc_vq, cmd->tvc_log,
+				     cmd->tvc_log_num);
+
 		vhost_scsi_release_cmd_res(se_cmd);
 	}

@@ -676,6 +702,7 @@ vhost_scsi_get_cmd(struct vhost_virtqueue *vq, u64 scsi_tag)
 					struct vhost_scsi_virtqueue, vq);
 	struct vhost_scsi_cmd *cmd;
 	struct scatterlist *sgl, *prot_sgl;
+	struct vhost_log *log;
 	int tag;

 	tag = sbitmap_get(&svq->scsi_tags);
@@ -687,9 +714,11 @@ vhost_scsi_get_cmd(struct vhost_virtqueue *vq, u64 scsi_tag)
 	cmd = &svq->scsi_cmds[tag];
 	sgl = cmd->sgl;
 	prot_sgl = cmd->prot_sgl;
+	log = cmd->tvc_log;
 	memset(cmd, 0, sizeof(*cmd));
 	cmd->sgl = sgl;
 	cmd->prot_sgl = prot_sgl;
+	cmd->tvc_log = log;
 	cmd->tvc_se_cmd.map_tag = tag;
 	cmd->inflight = vhost_scsi_get_inflight(vq);

@@ -1225,6 +1254,8 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 	u8 task_attr;
 	bool t10_pi = vhost_has_feature(vq, VIRTIO_SCSI_F_T10_PI);
 	u8 *cdb;
+	struct vhost_log *vq_log;
+	unsigned int log_num;

 	mutex_lock(&vq->mutex);
 	/*
@@ -1240,8 +1271,11 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)

 	vhost_disable_notify(&vs->dev, vq);

+	vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ?
+		vq->log : NULL;
+
 	do {
-		ret = vhost_scsi_get_desc(vs, vq, &vc, NULL, NULL);
+		ret = vhost_scsi_get_desc(vs, vq, &vc, vq_log, &log_num);
 		if (ret)
 			goto err;

@@ -1390,6 +1424,24 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 			goto err;
 		}

+		if (unlikely(vq_log && log_num)) {
+			if (!cmd->tvc_log)
+				cmd->tvc_log = kmalloc_array(vq->dev->iov_limit,
+							     sizeof(*cmd->tvc_log),
+							     GFP_KERNEL);
+
+			if (likely(cmd->tvc_log)) {
+				memcpy(cmd->tvc_log, vq->log,
+				       sizeof(*cmd->tvc_log) * log_num);
+				cmd->tvc_log_num = log_num;
+			} else {
+				ret = -ENOMEM;
+				vq_err(vq, "Failed to alloc tvc_log\n");
+				vhost_scsi_release_cmd_res(&cmd->tvc_se_cmd);
+				goto err;
+			}
+		}
+
 		pr_debug("vhost_scsi got command opcode: %#02x, lun: %d\n",
 			 cdb[0], lun);
 		pr_debug("cmd: %p exp_data_len: %d, prot_bytes: %d data_direction:"
@@ -1425,11 +1477,14 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 		 */
 		if (ret == -ENXIO)
 			break;
-		else if (ret == -EIO)
+		else if (ret == -EIO) {
 			vhost_scsi_send_bad_target(vs, vq, &vc, TYPE_IO_CMD);
-		else if (ret == -ENOMEM)
+			vhost_scsi_log_write(vq, vq_log, log_num);
+		} else if (ret == -ENOMEM) {
 			vhost_scsi_send_status(vs, vq, &vc,
 					       SAM_STAT_TASK_SET_FULL);
+			vhost_scsi_log_write(vq, vq_log, log_num);
+		}
 	} while (likely(!vhost_exceeds_weight(vq, ++c, 0)));
 out:
 	mutex_unlock(&vq->mutex);
@@ -1760,6 +1815,24 @@ static void vhost_scsi_flush(struct vhost_scsi *vs)
 		wait_for_completion(&vs->old_inflight[i]->comp);
 }

+static void vhost_scsi_destroy_vq_log(struct vhost_virtqueue *vq)
+{
+	struct vhost_scsi_virtqueue *svq = container_of(vq,
+					struct vhost_scsi_virtqueue, vq);
+	struct vhost_scsi_cmd *tv_cmd;
+	unsigned int i;
+
+	if (!svq->scsi_cmds)
+		return;
+
+	for (i = 0; i < svq->max_cmds; i++) {
+		tv_cmd = &svq->scsi_cmds[i];
+		kfree(tv_cmd->tvc_log);
+		tv_cmd->tvc_log = NULL;
+		tv_cmd->tvc_log_num = 0;
+	}
+}
+
 static void vhost_scsi_destroy_vq_cmds(struct vhost_virtqueue *vq)
 {
 	struct vhost_scsi_virtqueue *svq = container_of(vq,
@@ -1779,6 +1852,7 @@ static void vhost_scsi_destroy_vq_cmds(struct vhost_virtqueue *vq)

 	sbitmap_free(&svq->scsi_tags);
 	kfree(svq->upages);
+	vhost_scsi_destroy_vq_log(vq);
 	kfree(svq->scsi_cmds);
 	svq->scsi_cmds = NULL;
 }
@@ -2088,6 +2162,7 @@ vhost_scsi_clear_endpoint(struct vhost_scsi *vs,
 static int vhost_scsi_set_features(struct vhost_scsi *vs, u64 features)
 {
 	struct vhost_virtqueue *vq;
+	bool is_log, was_log;
 	int i;

 	if (features & ~VHOST_SCSI_FEATURES)
@@ -2100,12 +2175,39 @@ static int vhost_scsi_set_features(struct vhost_scsi *vs, u64 features)
 		return -EFAULT;
 	}

+	if (!vs->dev.nvqs)
+		goto out;
+
+	is_log = features & (1 << VHOST_F_LOG_ALL);
+	/*
+	 * All VQs should have same feature.
+	 */
+	was_log = vhost_has_feature(&vs->vqs[0].vq, VHOST_F_LOG_ALL);
+
 	for (i = 0; i < vs->dev.nvqs; i++) {
 		vq = &vs->vqs[i].vq;
 		mutex_lock(&vq->mutex);
 		vq->acked_features = features;
 		mutex_unlock(&vq->mutex);
 	}
+
+	/*
+	 * If VHOST_F_LOG_ALL is removed, free tvc_log after
+	 * vq->acked_features is committed.
+	 */
+	if (!is_log && was_log) {
+		for (i = VHOST_SCSI_VQ_IO; i < vs->dev.nvqs; i++) {
+			if (!vs->vqs[i].scsi_cmds)
+				continue;
+
+			vq = &vs->vqs[i].vq;
+			mutex_lock(&vq->mutex);
+			vhost_scsi_destroy_vq_log(vq);
+			mutex_unlock(&vq->mutex);
+		}
+	}
+
+out:
 	mutex_unlock(&vs->dev.mutex);
 	return 0;
 }
diff mbox series

Patch

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 3875967dee36..1b7211a55562 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -133,6 +133,11 @@  struct vhost_scsi_cmd {
 	struct se_cmd tvc_se_cmd;
 	/* Sense buffer that will be mapped into outgoing status */
 	unsigned char tvc_sense_buf[TRANSPORT_SENSE_BUFFER];
+	/*
+	 * Dirty write descriptors of this command.
+	 */
+	struct vhost_log *tvc_log;
+	unsigned int tvc_log_num;
 	/* Completed commands list, serviced from vhost worker thread */
 	struct llist_node tvc_completion_list;
 	/* Used to track inflight cmd */
@@ -676,6 +681,7 @@  vhost_scsi_get_cmd(struct vhost_virtqueue *vq, u64 scsi_tag)
 					struct vhost_scsi_virtqueue, vq);
 	struct vhost_scsi_cmd *cmd;
 	struct scatterlist *sgl, *prot_sgl;
+	struct vhost_log *log;
 	int tag;
 
 	tag = sbitmap_get(&svq->scsi_tags);
@@ -687,9 +693,11 @@  vhost_scsi_get_cmd(struct vhost_virtqueue *vq, u64 scsi_tag)
 	cmd = &svq->scsi_cmds[tag];
 	sgl = cmd->sgl;
 	prot_sgl = cmd->prot_sgl;
+	log = cmd->tvc_log;
 	memset(cmd, 0, sizeof(*cmd));
 	cmd->sgl = sgl;
 	cmd->prot_sgl = prot_sgl;
+	cmd->tvc_log = log;
 	cmd->tvc_se_cmd.map_tag = tag;
 	cmd->inflight = vhost_scsi_get_inflight(vq);
 
@@ -1760,6 +1768,55 @@  static void vhost_scsi_flush(struct vhost_scsi *vs)
 		wait_for_completion(&vs->old_inflight[i]->comp);
 }
 
+static void vhost_scsi_destroy_vq_log(struct vhost_virtqueue *vq)
+{
+	struct vhost_scsi_virtqueue *svq = container_of(vq,
+					struct vhost_scsi_virtqueue, vq);
+	struct vhost_scsi_cmd *tv_cmd;
+	unsigned int i;
+
+	if (!svq->scsi_cmds)
+		return;
+
+	for (i = 0; i < svq->max_cmds; i++) {
+		tv_cmd = &svq->scsi_cmds[i];
+		kfree(tv_cmd->tvc_log);
+		tv_cmd->tvc_log = NULL;
+		tv_cmd->tvc_log_num = 0;
+	}
+}
+
+static int vhost_scsi_setup_vq_log(struct vhost_virtqueue *vq)
+{
+	struct vhost_scsi_virtqueue *svq = container_of(vq,
+					struct vhost_scsi_virtqueue, vq);
+	struct vhost_scsi_cmd *tv_cmd;
+	unsigned int i;
+
+	if (!svq->scsi_cmds)
+		return 0;
+
+	for (i = 0; i < svq->max_cmds; i++) {
+		tv_cmd = &svq->scsi_cmds[i];
+		WARN_ON_ONCE(unlikely(tv_cmd->tvc_log ||
+				      tv_cmd->tvc_log_num));
+		tv_cmd->tvc_log_num = 0;
+		tv_cmd->tvc_log = kcalloc(vq->dev->iov_limit,
+					  sizeof(struct vhost_log),
+					  GFP_KERNEL);
+		if (!tv_cmd->tvc_log) {
+			pr_err("Unable to allocate tv_cmd->tvc_log\n");
+			goto err;
+		}
+	}
+
+	return 0;
+
+err:
+	vhost_scsi_destroy_vq_log(vq);
+	return -ENOMEM;
+}
+
 static void vhost_scsi_destroy_vq_cmds(struct vhost_virtqueue *vq)
 {
 	struct vhost_scsi_virtqueue *svq = container_of(vq,
@@ -1779,6 +1836,7 @@  static void vhost_scsi_destroy_vq_cmds(struct vhost_virtqueue *vq)
 
 	sbitmap_free(&svq->scsi_tags);
 	kfree(svq->upages);
+	vhost_scsi_destroy_vq_log(vq);
 	kfree(svq->scsi_cmds);
 	svq->scsi_cmds = NULL;
 }
@@ -1834,6 +1892,11 @@  static int vhost_scsi_setup_vq_cmds(struct vhost_virtqueue *vq, int max_cmds)
 			}
 		}
 	}
+
+	if (vhost_has_feature(vq, VHOST_F_LOG_ALL) &&
+	    vhost_scsi_setup_vq_log(vq))
+		goto out;
+
 	return 0;
 out:
 	vhost_scsi_destroy_vq_cmds(vq);
@@ -2088,6 +2151,8 @@  vhost_scsi_clear_endpoint(struct vhost_scsi *vs,
 static int vhost_scsi_set_features(struct vhost_scsi *vs, u64 features)
 {
 	struct vhost_virtqueue *vq;
+	bool is_log, was_log;
+	int ret;
 	int i;
 
 	if (features & ~VHOST_SCSI_FEATURES)
@@ -2100,14 +2165,75 @@  static int vhost_scsi_set_features(struct vhost_scsi *vs, u64 features)
 		return -EFAULT;
 	}
 
+	if (!vs->dev.nvqs)
+		goto out;
+
+	is_log = features & (1 << VHOST_F_LOG_ALL);
+	/*
+	 * All VQs should have same feature.
+	 */
+	was_log = vhost_has_feature(&vs->vqs[0].vq, VHOST_F_LOG_ALL);
+
+	/*
+	 * If VHOST_F_LOG_ALL is going to be added, allocate tvc_log before
+	 * vq->acked_features is committed.
+	 * Return -ENOMEM on allocation failure.
+	 */
+	if (is_log && !was_log) {
+		for (i = VHOST_SCSI_VQ_IO; i < vs->dev.nvqs; i++) {
+			if (!vs->vqs[i].scsi_cmds)
+				continue;
+
+			vq = &vs->vqs[i].vq;
+
+			mutex_lock(&vq->mutex);
+			ret = vhost_scsi_setup_vq_log(vq);
+			mutex_unlock(&vq->mutex);
+
+			if (ret)
+				goto destroy_cmd_log;
+		}
+	}
+
 	for (i = 0; i < vs->dev.nvqs; i++) {
 		vq = &vs->vqs[i].vq;
 		mutex_lock(&vq->mutex);
 		vq->acked_features = features;
 		mutex_unlock(&vq->mutex);
 	}
+
+	/*
+	 * If VHOST_F_LOG_ALL is removed, free tvc_log after
+	 * vq->acked_features is committed.
+	 */
+	if (!is_log && was_log) {
+		for (i = VHOST_SCSI_VQ_IO; i < vs->dev.nvqs; i++) {
+			if (!vs->vqs[i].scsi_cmds)
+				continue;
+
+			vq = &vs->vqs[i].vq;
+			mutex_lock(&vq->mutex);
+			vhost_scsi_destroy_vq_log(vq);
+			mutex_unlock(&vq->mutex);
+		}
+	}
+
+out:
 	mutex_unlock(&vs->dev.mutex);
 	return 0;
+
+destroy_cmd_log:
+	for (i--; i >= VHOST_SCSI_VQ_IO; i--) {
+		if (!vs->vqs[i].scsi_cmds)
+			continue;
+
+		vq = &vs->vqs[i].vq;
+		mutex_lock(&vq->mutex);
+		vhost_scsi_destroy_vq_log(vq);
+		mutex_unlock(&vq->mutex);
+	}
+	mutex_unlock(&vs->dev.mutex);
+	return -ENOMEM;
 }
 
 static int vhost_scsi_open(struct inode *inode, struct file *f)