diff mbox series

[1/3] vhost-scsi: Fix alignment handling with windows

Message ID 20230524233407.41432-2-michael.christie@oracle.com (mailing list archive)
State Changes Requested
Headers show
Series vhost-scsi: Fix IO hangs when using windows | expand

Commit Message

Mike Christie May 24, 2023, 11:34 p.m. UTC
The linux block layer requires bios/requests to have lengths with a 512
byte alignment. Some drivers/layers like dm-crypt and the directi IO code
will test for it and just fail. Other drivers like SCSI just assume the
requirement is met and will end up in infinte retry loops. The problem
for drivers like SCSI is that it uses functions like blk_rq_cur_sectors
and blk_rq_sectors which divide the request's length by 512. If there's
lefovers then it just gets dropped. But other code in the block/scsi
layer may use blk_rq_bytes/blk_rq_cur_bytes and end up thinking there is
still data left and try to retry the cmd. We can then end up getting
stuck in retry loops where part of the block/scsi thinks there is data
left, but other parts think we want to do IOs of zero length.

Linux will always check for alignment, but windows will not. When
vhost-scsi then translates the iovec it gets from a windows guest to a
scatterlist, we can end up with sg items where the sg->length is not
divisible by 512 due to the misaligned offset:

sg[0].offset = 255;
sg[0].length = 3841;
sg...
sg[N].offset = 0;
sg[N].length = 255;

When the lio backends then convert the SG to bios or other iovecs, we
end up sending them with the same misaligned values and can hit the
issues above.

This just has us drop down to allocating a temp page and copying the data
when this happens. Because this adds a check that needs to loop over the
iovec in the main IO path, this patch adds an attribute that can be turned
on for just windows.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
 drivers/vhost/scsi.c | 174 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 151 insertions(+), 23 deletions(-)

Comments

Michael S. Tsirkin May 25, 2023, 7:57 a.m. UTC | #1
On Wed, May 24, 2023 at 06:34:05PM -0500, Mike Christie wrote:
> The linux block layer requires bios/requests to have lengths with a 512
> byte alignment. Some drivers/layers like dm-crypt and the directi IO code
> will test for it and just fail. Other drivers like SCSI just assume the
> requirement is met and will end up in infinte retry loops. The problem
> for drivers like SCSI is that it uses functions like blk_rq_cur_sectors
> and blk_rq_sectors which divide the request's length by 512. If there's
> lefovers then it just gets dropped. But other code in the block/scsi
> layer may use blk_rq_bytes/blk_rq_cur_bytes and end up thinking there is
> still data left and try to retry the cmd. We can then end up getting
> stuck in retry loops where part of the block/scsi thinks there is data
> left, but other parts think we want to do IOs of zero length.
> 
> Linux will always check for alignment, but windows will not. When
> vhost-scsi then translates the iovec it gets from a windows guest to a
> scatterlist, we can end up with sg items where the sg->length is not
> divisible by 512 due to the misaligned offset:
> 
> sg[0].offset = 255;
> sg[0].length = 3841;
> sg...
> sg[N].offset = 0;
> sg[N].length = 255;
> 
> When the lio backends then convert the SG to bios or other iovecs, we
> end up sending them with the same misaligned values and can hit the
> issues above.
> 
> This just has us drop down to allocating a temp page and copying the data
> when this happens. Because this adds a check that needs to loop over the
> iovec in the main IO path, this patch adds an attribute that can be turned
> on for just windows.
> 
> Signed-off-by: Mike Christie <michael.christie@oracle.com>

I am assuming this triggers rarely, yes?

If so I would like to find a way to avoid setting an attribute.

I am guessing the main cost is an extra scan of iov.
vhost_scsi_iov_to_sgl already does a scan, how about changing
it to fail if misaligned?
We can then detect the relevant error code and try with a copy.

WDYT?

> ---
>  drivers/vhost/scsi.c | 174 +++++++++++++++++++++++++++++++++++++------
>  1 file changed, 151 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
> index bb10fa4bb4f6..dbad8fb579eb 100644
> --- a/drivers/vhost/scsi.c
> +++ b/drivers/vhost/scsi.c
> @@ -25,6 +25,8 @@
>  #include <linux/fs.h>
>  #include <linux/vmalloc.h>
>  #include <linux/miscdevice.h>
> +#include <linux/blk_types.h>
> +#include <linux/bio.h>
>  #include <asm/unaligned.h>
>  #include <scsi/scsi_common.h>
>  #include <scsi/scsi_proto.h>
> @@ -75,6 +77,9 @@ struct vhost_scsi_cmd {
>  	u32 tvc_prot_sgl_count;
>  	/* Saved unpacked SCSI LUN for vhost_scsi_target_queue_cmd() */
>  	u32 tvc_lun;
> +	u32 copied_iov:1;
> +	const void *saved_iter_addr;
> +	struct iov_iter saved_iter;
>  	/* Pointer to the SGL formatted memory from virtio-scsi */
>  	struct scatterlist *tvc_sgl;
>  	struct scatterlist *tvc_prot_sgl;
> @@ -107,6 +112,7 @@ struct vhost_scsi_nexus {
>  struct vhost_scsi_tpg {
>  	/* Vhost port target portal group tag for TCM */
>  	u16 tport_tpgt;
> +	bool check_io_alignment;
>  	/* Used to track number of TPG Port/Lun Links wrt to explict I_T Nexus shutdown */
>  	int tv_tpg_port_count;
>  	/* Used for vhost_scsi device reference to tpg_nexus, protected by tv_tpg_mutex */
> @@ -328,8 +334,13 @@ static void vhost_scsi_release_cmd_res(struct se_cmd *se_cmd)
>  	int i;
>  
>  	if (tv_cmd->tvc_sgl_count) {
> -		for (i = 0; i < tv_cmd->tvc_sgl_count; i++)
> -			put_page(sg_page(&tv_cmd->tvc_sgl[i]));
> +		for (i = 0; i < tv_cmd->tvc_sgl_count; i++) {
> +			if (tv_cmd->copied_iov)
> +				__free_page(sg_page(&tv_cmd->tvc_sgl[i]));
> +			else
> +				put_page(sg_page(&tv_cmd->tvc_sgl[i]));
> +		}
> +		kfree(tv_cmd->saved_iter_addr);
>  	}
>  	if (tv_cmd->tvc_prot_sgl_count) {
>  		for (i = 0; i < tv_cmd->tvc_prot_sgl_count; i++)
> @@ -502,6 +513,27 @@ static void vhost_scsi_evt_work(struct vhost_work *work)
>  	mutex_unlock(&vq->mutex);
>  }
>  
> +static int vhost_scsi_copy_sgl_to_iov(struct vhost_scsi_cmd *cmd)
> +{
> +	struct iov_iter *iter = &cmd->saved_iter;
> +	struct scatterlist *sg = cmd->tvc_sgl;
> +	struct page *page;
> +	size_t len;
> +	int i;
> +
> +	for (i = 0; i < cmd->tvc_sgl_count; i++) {
> +		page = sg_page(&sg[i]);
> +		len = sg[i].length;
> +
> +		if (copy_page_to_iter(page, 0, len, iter) != len) {
> +			pr_err("Could not copy data. Error %lu\n", len);
> +			return -1;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>  /* Fill in status and signal that we are done processing this command
>   *
>   * This is scheduled in the vhost work queue so we are called with the owner
> @@ -525,15 +557,20 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
>  
>  		pr_debug("%s tv_cmd %p resid %u status %#02x\n", __func__,
>  			cmd, se_cmd->residual_count, se_cmd->scsi_status);
> -
>  		memset(&v_rsp, 0, sizeof(v_rsp));
> -		v_rsp.resid = cpu_to_vhost32(cmd->tvc_vq, se_cmd->residual_count);
> -		/* TODO is status_qualifier field needed? */
> -		v_rsp.status = se_cmd->scsi_status;
> -		v_rsp.sense_len = cpu_to_vhost32(cmd->tvc_vq,
> -						 se_cmd->scsi_sense_length);
> -		memcpy(v_rsp.sense, cmd->tvc_sense_buf,
> -		       se_cmd->scsi_sense_length);
> +
> +		if (cmd->saved_iter_addr && vhost_scsi_copy_sgl_to_iov(cmd)) {
> +			v_rsp.response = VIRTIO_SCSI_S_BAD_TARGET;
> +		} else {
> +			v_rsp.resid = cpu_to_vhost32(cmd->tvc_vq,
> +						     se_cmd->residual_count);
> +			/* TODO is status_qualifier field needed? */
> +			v_rsp.status = se_cmd->scsi_status;
> +			v_rsp.sense_len = cpu_to_vhost32(cmd->tvc_vq,
> +							 se_cmd->scsi_sense_length);
> +			memcpy(v_rsp.sense, cmd->tvc_sense_buf,
> +			       se_cmd->scsi_sense_length);
> +		}
>  
>  		iov_iter_init(&iov_iter, ITER_DEST, cmd->tvc_resp_iov,
>  			      cmd->tvc_in_iovs, sizeof(v_rsp));
> @@ -682,7 +719,52 @@ vhost_scsi_iov_to_sgl(struct vhost_scsi_cmd *cmd, bool write,
>  }
>  
>  static int
> -vhost_scsi_mapal(struct vhost_scsi_cmd *cmd,
> +vhost_scsi_copy_iov_to_sgl(struct vhost_scsi_cmd *cmd, struct iov_iter *iter,
> +			   struct scatterlist *sg, int sg_count)
> +{
> +	size_t len = iov_iter_count(iter);
> +	unsigned int nbytes = 0;
> +	struct page *page;
> +	int i;
> +
> +	if (cmd->tvc_data_direction == DMA_FROM_DEVICE) {
> +		cmd->saved_iter_addr = dup_iter(&cmd->saved_iter, iter,
> +						GFP_KERNEL);
> +		if (!cmd->saved_iter_addr)
> +			return -ENOMEM;
> +	}
> +
> +	for (i = 0; i < sg_count; i++) {
> +		page = alloc_page(GFP_KERNEL);
> +		if (!page) {
> +			i--;
> +			goto err;
> +		}
> +
> +		nbytes = min_t(unsigned int, PAGE_SIZE, len);
> +		sg_set_page(&sg[i], page, nbytes, 0);
> +
> +		if (cmd->tvc_data_direction == DMA_TO_DEVICE &&
> +		    copy_page_from_iter(page, 0, nbytes, iter) != nbytes)
> +			goto err;
> +
> +		len -= nbytes;
> +	}
> +
> +	cmd->copied_iov = 1;
> +	return 0;
> +
> +err:
> +	pr_err("Could not read %u bytes\n", nbytes);
> +
> +	for (; i >= 0; i--)
> +		__free_page(sg_page(&sg[i]));
> +	kfree(cmd->saved_iter_addr);
> +	return -ENOMEM;
> +}
> +
> +static int
> +vhost_scsi_mapal(struct vhost_scsi_tpg *tpg, struct vhost_scsi_cmd *cmd,
>  		 size_t prot_bytes, struct iov_iter *prot_iter,
>  		 size_t data_bytes, struct iov_iter *data_iter)
>  {
> @@ -703,10 +785,8 @@ vhost_scsi_mapal(struct vhost_scsi_cmd *cmd,
>  		ret = vhost_scsi_iov_to_sgl(cmd, write, prot_iter,
>  					    cmd->tvc_prot_sgl,
>  					    cmd->tvc_prot_sgl_count);
> -		if (ret < 0) {
> -			cmd->tvc_prot_sgl_count = 0;
> -			return ret;
> -		}
> +		if (ret < 0)
> +			goto map_fail;
>  	}
>  	sgl_count = vhost_scsi_calc_sgls(data_iter, data_bytes,
>  					 VHOST_SCSI_PREALLOC_SGLS);
> @@ -717,14 +797,32 @@ vhost_scsi_mapal(struct vhost_scsi_cmd *cmd,
>  	cmd->tvc_sgl_count = sgl_count;
>  	pr_debug("%s data_sg %p data_sgl_count %u\n", __func__,
>  		  cmd->tvc_sgl, cmd->tvc_sgl_count);
> +	/*
> +	 * The block layer requires bios/requests to be a multiple of 512 bytes,
> +	 * but Windows can send us vecs that are misaligned. This can result
> +	 * in bios and later requests with misaligned sizes if we have to break
> +	 * up a cmd into multiple bios.
> +	 *
> +	 * We currently only break up a command into multiple bios if we hit
> +	 * the vec/seg limit, so check if our sgl_count is greater than the max
> +	 * and if a vec in the cmd has a misaligned size.
> +	 */
> +	if (tpg->check_io_alignment &&
> +	    (!iov_iter_is_aligned(data_iter, 0, SECTOR_SIZE - 1) &&
> +	     bio_max_segs(sgl_count) != sgl_count))
> +		ret = vhost_scsi_copy_iov_to_sgl(cmd, data_iter, cmd->tvc_sgl,
> +						 cmd->tvc_sgl_count);
> +	else
> +		ret = vhost_scsi_iov_to_sgl(cmd, write, data_iter,
> +					    cmd->tvc_sgl, cmd->tvc_sgl_count);
> +	if (ret)
> +		goto map_fail;
>  
> -	ret = vhost_scsi_iov_to_sgl(cmd, write, data_iter,
> -				    cmd->tvc_sgl, cmd->tvc_sgl_count);
> -	if (ret < 0) {
> -		cmd->tvc_sgl_count = 0;
> -		return ret;
> -	}
>  	return 0;
> +
> +map_fail:
> +	cmd->tvc_sgl_count = 0;
> +	return ret;
>  }
>  
>  static int vhost_scsi_to_tcm_attr(int attr)
> @@ -1077,7 +1175,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
>  			 " %d\n", cmd, exp_data_len, prot_bytes, data_direction);
>  
>  		if (data_direction != DMA_NONE) {
> -			if (unlikely(vhost_scsi_mapal(cmd, prot_bytes,
> +			if (unlikely(vhost_scsi_mapal(tpg, cmd, prot_bytes,
>  						      &prot_iter, exp_data_len,
>  						      &data_iter))) {
>  				vq_err(vq, "Failed to map iov to sgl\n");
> @@ -2068,11 +2166,41 @@ static ssize_t vhost_scsi_tpg_attrib_fabric_prot_type_show(
>  
>  	return sysfs_emit(page, "%d\n", tpg->tv_fabric_prot_type);
>  }
> -
>  CONFIGFS_ATTR(vhost_scsi_tpg_attrib_, fabric_prot_type);
>  
> +static ssize_t
> +vhost_scsi_tpg_attrib_check_io_alignment_store(struct config_item *item,
> +					       const char *page, size_t count)
> +{
> +	struct se_portal_group *se_tpg = attrib_to_tpg(item);
> +	struct vhost_scsi_tpg *tpg = container_of(se_tpg, struct vhost_scsi_tpg,
> +						  se_tpg);
> +	bool val;
> +	int ret;
> +
> +	ret = kstrtobool(page, &val);
> +	if (ret)
> +		return ret;
> +
> +	tpg->check_io_alignment = val;
> +	return count;
> +}
> +
> +static ssize_t
> +vhost_scsi_tpg_attrib_check_io_alignment_show(struct config_item *item,
> +					      char *page)
> +{
> +	struct se_portal_group *se_tpg = attrib_to_tpg(item);
> +	struct vhost_scsi_tpg *tpg = container_of(se_tpg, struct vhost_scsi_tpg,
> +						  se_tpg);
> +
> +	return sysfs_emit(page, "%d\n", tpg->check_io_alignment);
> +}
> +CONFIGFS_ATTR(vhost_scsi_tpg_attrib_, check_io_alignment);
> +
>  static struct configfs_attribute *vhost_scsi_tpg_attrib_attrs[] = {
>  	&vhost_scsi_tpg_attrib_attr_fabric_prot_type,
> +	&vhost_scsi_tpg_attrib_attr_check_io_alignment,
>  	NULL,
>  };
>  
> -- 
> 2.25.1
Mike Christie May 25, 2023, 3:43 p.m. UTC | #2
On 5/25/23 2:57 AM, Michael S. Tsirkin wrote:
> On Wed, May 24, 2023 at 06:34:05PM -0500, Mike Christie wrote:
>> The linux block layer requires bios/requests to have lengths with a 512
>> byte alignment. Some drivers/layers like dm-crypt and the directi IO code
>> will test for it and just fail. Other drivers like SCSI just assume the
>> requirement is met and will end up in infinte retry loops. The problem
>> for drivers like SCSI is that it uses functions like blk_rq_cur_sectors
>> and blk_rq_sectors which divide the request's length by 512. If there's
>> lefovers then it just gets dropped. But other code in the block/scsi
>> layer may use blk_rq_bytes/blk_rq_cur_bytes and end up thinking there is
>> still data left and try to retry the cmd. We can then end up getting
>> stuck in retry loops where part of the block/scsi thinks there is data
>> left, but other parts think we want to do IOs of zero length.
>>
>> Linux will always check for alignment, but windows will not. When
>> vhost-scsi then translates the iovec it gets from a windows guest to a
>> scatterlist, we can end up with sg items where the sg->length is not
>> divisible by 512 due to the misaligned offset:
>>
>> sg[0].offset = 255;
>> sg[0].length = 3841;
>> sg...
>> sg[N].offset = 0;
>> sg[N].length = 255;
>>
>> When the lio backends then convert the SG to bios or other iovecs, we
>> end up sending them with the same misaligned values and can hit the
>> issues above.
>>
>> This just has us drop down to allocating a temp page and copying the data
>> when this happens. Because this adds a check that needs to loop over the
>> iovec in the main IO path, this patch adds an attribute that can be turned
>> on for just windows.
>>
>> Signed-off-by: Mike Christie <michael.christie@oracle.com>
> 
> I am assuming this triggers rarely, yes?
> 
> If so I would like to find a way to avoid setting an attribute.
> 
> I am guessing the main cost is an extra scan of iov.

The scan and a memory allocation to dup the iter. However, I see
iov_iter_revert and I think that might be what I needed before to
avoid the mem alloc so will try it out.


> vhost_scsi_iov_to_sgl already does a scan, how about changing
> it to fail if misaligned?
> We can then detect the relevant error code and try with a copy.
>
kernel test robot May 25, 2023, 7:05 p.m. UTC | #3
Hi Mike,

kernel test robot noticed the following build warnings:

[auto build test WARNING on mst-vhost/linux-next]
[also build test WARNING on linus/master v6.4-rc3 next-20230525]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Mike-Christie/vhost-scsi-Fix-alignment-handling-with-windows/20230525-073558
base:   https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git linux-next
patch link:    https://lore.kernel.org/r/20230524233407.41432-2-michael.christie%40oracle.com
patch subject: [PATCH 1/3] vhost-scsi: Fix alignment handling with windows
config: hexagon-randconfig-r026-20230524
compiler: clang version 17.0.0 (https://github.com/llvm/llvm-project 4faf3aaf28226a4e950c103a14f6fc1d1fdabb1b)
reproduce (this is a W=1 build):
        mkdir -p ~/bin
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/fe857a6d1a38ba4d85c7d9fa18c989324e1619b6
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Mike-Christie/vhost-scsi-Fix-alignment-handling-with-windows/20230525-073558
        git checkout fe857a6d1a38ba4d85c7d9fa18c989324e1619b6
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang ~/bin/make.cross W=1 O=build_dir ARCH=hexagon olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang ~/bin/make.cross W=1 O=build_dir ARCH=hexagon SHELL=/bin/bash drivers/vhost/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202305260230.QtlFScF3-lkp@intel.com/

All warnings (new ones prefixed by >>):

   In file included from drivers/vhost/scsi.c:28:
   In file included from include/linux/blk_types.h:10:
   In file included from include/linux/bvec.h:10:
   In file included from include/linux/highmem.h:12:
   In file included from include/linux/hardirq.h:11:
   In file included from ./arch/hexagon/include/generated/asm/hardirq.h:1:
   In file included from include/asm-generic/hardirq.h:17:
   In file included from include/linux/irq.h:20:
   In file included from include/linux/io.h:13:
   In file included from arch/hexagon/include/asm/io.h:334:
   include/asm-generic/io.h:547:31: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           val = __raw_readb(PCI_IOBASE + addr);
                             ~~~~~~~~~~ ^
   include/asm-generic/io.h:560:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           val = __le16_to_cpu((__le16 __force)__raw_readw(PCI_IOBASE + addr));
                                                           ~~~~~~~~~~ ^
   include/uapi/linux/byteorder/little_endian.h:37:51: note: expanded from macro '__le16_to_cpu'
   #define __le16_to_cpu(x) ((__force __u16)(__le16)(x))
                                                     ^
   In file included from drivers/vhost/scsi.c:28:
   In file included from include/linux/blk_types.h:10:
   In file included from include/linux/bvec.h:10:
   In file included from include/linux/highmem.h:12:
   In file included from include/linux/hardirq.h:11:
   In file included from ./arch/hexagon/include/generated/asm/hardirq.h:1:
   In file included from include/asm-generic/hardirq.h:17:
   In file included from include/linux/irq.h:20:
   In file included from include/linux/io.h:13:
   In file included from arch/hexagon/include/asm/io.h:334:
   include/asm-generic/io.h:573:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           val = __le32_to_cpu((__le32 __force)__raw_readl(PCI_IOBASE + addr));
                                                           ~~~~~~~~~~ ^
   include/uapi/linux/byteorder/little_endian.h:35:51: note: expanded from macro '__le32_to_cpu'
   #define __le32_to_cpu(x) ((__force __u32)(__le32)(x))
                                                     ^
   In file included from drivers/vhost/scsi.c:28:
   In file included from include/linux/blk_types.h:10:
   In file included from include/linux/bvec.h:10:
   In file included from include/linux/highmem.h:12:
   In file included from include/linux/hardirq.h:11:
   In file included from ./arch/hexagon/include/generated/asm/hardirq.h:1:
   In file included from include/asm-generic/hardirq.h:17:
   In file included from include/linux/irq.h:20:
   In file included from include/linux/io.h:13:
   In file included from arch/hexagon/include/asm/io.h:334:
   include/asm-generic/io.h:584:33: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           __raw_writeb(value, PCI_IOBASE + addr);
                               ~~~~~~~~~~ ^
   include/asm-generic/io.h:594:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           __raw_writew((u16 __force)cpu_to_le16(value), PCI_IOBASE + addr);
                                                         ~~~~~~~~~~ ^
   include/asm-generic/io.h:604:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           __raw_writel((u32 __force)cpu_to_le32(value), PCI_IOBASE + addr);
                                                         ~~~~~~~~~~ ^
>> drivers/vhost/scsi.c:554:47: warning: format specifies type 'unsigned long' but the argument has type 'size_t' (aka 'unsigned int') [-Wformat]
                           pr_err("Could not copy data. Error %lu\n", len);
                                                              ~~~     ^~~
                                                              %zu
   include/linux/printk.h:498:33: note: expanded from macro 'pr_err'
           printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
                                  ~~~     ^~~~~~~~~~~
   include/linux/printk.h:455:60: note: expanded from macro 'printk'
   #define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__)
                                                       ~~~    ^~~~~~~~~~~
   include/linux/printk.h:427:19: note: expanded from macro 'printk_index_wrap'
                   _p_func(_fmt, ##__VA_ARGS__);                           \
                           ~~~~    ^~~~~~~~~~~
   7 warnings generated.


vim +554 drivers/vhost/scsi.c

   540	
   541	static int vhost_scsi_copy_sgl_to_iov(struct vhost_scsi_cmd *cmd)
   542	{
   543		struct iov_iter *iter = &cmd->saved_iter;
   544		struct scatterlist *sg = cmd->tvc_sgl;
   545		struct page *page;
   546		size_t len;
   547		int i;
   548	
   549		for (i = 0; i < cmd->tvc_sgl_count; i++) {
   550			page = sg_page(&sg[i]);
   551			len = sg[i].length;
   552	
   553			if (copy_page_to_iter(page, 0, len, iter) != len) {
 > 554				pr_err("Could not copy data. Error %lu\n", len);
   555				return -1;
   556			}
   557		}
   558	
   559		return 0;
   560	}
   561
diff mbox series

Patch

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index bb10fa4bb4f6..dbad8fb579eb 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -25,6 +25,8 @@ 
 #include <linux/fs.h>
 #include <linux/vmalloc.h>
 #include <linux/miscdevice.h>
+#include <linux/blk_types.h>
+#include <linux/bio.h>
 #include <asm/unaligned.h>
 #include <scsi/scsi_common.h>
 #include <scsi/scsi_proto.h>
@@ -75,6 +77,9 @@  struct vhost_scsi_cmd {
 	u32 tvc_prot_sgl_count;
 	/* Saved unpacked SCSI LUN for vhost_scsi_target_queue_cmd() */
 	u32 tvc_lun;
+	u32 copied_iov:1;
+	const void *saved_iter_addr;
+	struct iov_iter saved_iter;
 	/* Pointer to the SGL formatted memory from virtio-scsi */
 	struct scatterlist *tvc_sgl;
 	struct scatterlist *tvc_prot_sgl;
@@ -107,6 +112,7 @@  struct vhost_scsi_nexus {
 struct vhost_scsi_tpg {
 	/* Vhost port target portal group tag for TCM */
 	u16 tport_tpgt;
+	bool check_io_alignment;
 	/* Used to track number of TPG Port/Lun Links wrt to explict I_T Nexus shutdown */
 	int tv_tpg_port_count;
 	/* Used for vhost_scsi device reference to tpg_nexus, protected by tv_tpg_mutex */
@@ -328,8 +334,13 @@  static void vhost_scsi_release_cmd_res(struct se_cmd *se_cmd)
 	int i;
 
 	if (tv_cmd->tvc_sgl_count) {
-		for (i = 0; i < tv_cmd->tvc_sgl_count; i++)
-			put_page(sg_page(&tv_cmd->tvc_sgl[i]));
+		for (i = 0; i < tv_cmd->tvc_sgl_count; i++) {
+			if (tv_cmd->copied_iov)
+				__free_page(sg_page(&tv_cmd->tvc_sgl[i]));
+			else
+				put_page(sg_page(&tv_cmd->tvc_sgl[i]));
+		}
+		kfree(tv_cmd->saved_iter_addr);
 	}
 	if (tv_cmd->tvc_prot_sgl_count) {
 		for (i = 0; i < tv_cmd->tvc_prot_sgl_count; i++)
@@ -502,6 +513,27 @@  static void vhost_scsi_evt_work(struct vhost_work *work)
 	mutex_unlock(&vq->mutex);
 }
 
+static int vhost_scsi_copy_sgl_to_iov(struct vhost_scsi_cmd *cmd)
+{
+	struct iov_iter *iter = &cmd->saved_iter;
+	struct scatterlist *sg = cmd->tvc_sgl;
+	struct page *page;
+	size_t len;
+	int i;
+
+	for (i = 0; i < cmd->tvc_sgl_count; i++) {
+		page = sg_page(&sg[i]);
+		len = sg[i].length;
+
+		if (copy_page_to_iter(page, 0, len, iter) != len) {
+			pr_err("Could not copy data. Error %lu\n", len);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
 /* Fill in status and signal that we are done processing this command
  *
  * This is scheduled in the vhost work queue so we are called with the owner
@@ -525,15 +557,20 @@  static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
 
 		pr_debug("%s tv_cmd %p resid %u status %#02x\n", __func__,
 			cmd, se_cmd->residual_count, se_cmd->scsi_status);
-
 		memset(&v_rsp, 0, sizeof(v_rsp));
-		v_rsp.resid = cpu_to_vhost32(cmd->tvc_vq, se_cmd->residual_count);
-		/* TODO is status_qualifier field needed? */
-		v_rsp.status = se_cmd->scsi_status;
-		v_rsp.sense_len = cpu_to_vhost32(cmd->tvc_vq,
-						 se_cmd->scsi_sense_length);
-		memcpy(v_rsp.sense, cmd->tvc_sense_buf,
-		       se_cmd->scsi_sense_length);
+
+		if (cmd->saved_iter_addr && vhost_scsi_copy_sgl_to_iov(cmd)) {
+			v_rsp.response = VIRTIO_SCSI_S_BAD_TARGET;
+		} else {
+			v_rsp.resid = cpu_to_vhost32(cmd->tvc_vq,
+						     se_cmd->residual_count);
+			/* TODO is status_qualifier field needed? */
+			v_rsp.status = se_cmd->scsi_status;
+			v_rsp.sense_len = cpu_to_vhost32(cmd->tvc_vq,
+							 se_cmd->scsi_sense_length);
+			memcpy(v_rsp.sense, cmd->tvc_sense_buf,
+			       se_cmd->scsi_sense_length);
+		}
 
 		iov_iter_init(&iov_iter, ITER_DEST, cmd->tvc_resp_iov,
 			      cmd->tvc_in_iovs, sizeof(v_rsp));
@@ -682,7 +719,52 @@  vhost_scsi_iov_to_sgl(struct vhost_scsi_cmd *cmd, bool write,
 }
 
 static int
-vhost_scsi_mapal(struct vhost_scsi_cmd *cmd,
+vhost_scsi_copy_iov_to_sgl(struct vhost_scsi_cmd *cmd, struct iov_iter *iter,
+			   struct scatterlist *sg, int sg_count)
+{
+	size_t len = iov_iter_count(iter);
+	unsigned int nbytes = 0;
+	struct page *page;
+	int i;
+
+	if (cmd->tvc_data_direction == DMA_FROM_DEVICE) {
+		cmd->saved_iter_addr = dup_iter(&cmd->saved_iter, iter,
+						GFP_KERNEL);
+		if (!cmd->saved_iter_addr)
+			return -ENOMEM;
+	}
+
+	for (i = 0; i < sg_count; i++) {
+		page = alloc_page(GFP_KERNEL);
+		if (!page) {
+			i--;
+			goto err;
+		}
+
+		nbytes = min_t(unsigned int, PAGE_SIZE, len);
+		sg_set_page(&sg[i], page, nbytes, 0);
+
+		if (cmd->tvc_data_direction == DMA_TO_DEVICE &&
+		    copy_page_from_iter(page, 0, nbytes, iter) != nbytes)
+			goto err;
+
+		len -= nbytes;
+	}
+
+	cmd->copied_iov = 1;
+	return 0;
+
+err:
+	pr_err("Could not read %u bytes\n", nbytes);
+
+	for (; i >= 0; i--)
+		__free_page(sg_page(&sg[i]));
+	kfree(cmd->saved_iter_addr);
+	return -ENOMEM;
+}
+
+static int
+vhost_scsi_mapal(struct vhost_scsi_tpg *tpg, struct vhost_scsi_cmd *cmd,
 		 size_t prot_bytes, struct iov_iter *prot_iter,
 		 size_t data_bytes, struct iov_iter *data_iter)
 {
@@ -703,10 +785,8 @@  vhost_scsi_mapal(struct vhost_scsi_cmd *cmd,
 		ret = vhost_scsi_iov_to_sgl(cmd, write, prot_iter,
 					    cmd->tvc_prot_sgl,
 					    cmd->tvc_prot_sgl_count);
-		if (ret < 0) {
-			cmd->tvc_prot_sgl_count = 0;
-			return ret;
-		}
+		if (ret < 0)
+			goto map_fail;
 	}
 	sgl_count = vhost_scsi_calc_sgls(data_iter, data_bytes,
 					 VHOST_SCSI_PREALLOC_SGLS);
@@ -717,14 +797,32 @@  vhost_scsi_mapal(struct vhost_scsi_cmd *cmd,
 	cmd->tvc_sgl_count = sgl_count;
 	pr_debug("%s data_sg %p data_sgl_count %u\n", __func__,
 		  cmd->tvc_sgl, cmd->tvc_sgl_count);
+	/*
+	 * The block layer requires bios/requests to be a multiple of 512 bytes,
+	 * but Windows can send us vecs that are misaligned. This can result
+	 * in bios and later requests with misaligned sizes if we have to break
+	 * up a cmd into multiple bios.
+	 *
+	 * We currently only break up a command into multiple bios if we hit
+	 * the vec/seg limit, so check if our sgl_count is greater than the max
+	 * and if a vec in the cmd has a misaligned size.
+	 */
+	if (tpg->check_io_alignment &&
+	    (!iov_iter_is_aligned(data_iter, 0, SECTOR_SIZE - 1) &&
+	     bio_max_segs(sgl_count) != sgl_count))
+		ret = vhost_scsi_copy_iov_to_sgl(cmd, data_iter, cmd->tvc_sgl,
+						 cmd->tvc_sgl_count);
+	else
+		ret = vhost_scsi_iov_to_sgl(cmd, write, data_iter,
+					    cmd->tvc_sgl, cmd->tvc_sgl_count);
+	if (ret)
+		goto map_fail;
 
-	ret = vhost_scsi_iov_to_sgl(cmd, write, data_iter,
-				    cmd->tvc_sgl, cmd->tvc_sgl_count);
-	if (ret < 0) {
-		cmd->tvc_sgl_count = 0;
-		return ret;
-	}
 	return 0;
+
+map_fail:
+	cmd->tvc_sgl_count = 0;
+	return ret;
 }
 
 static int vhost_scsi_to_tcm_attr(int attr)
@@ -1077,7 +1175,7 @@  vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 			 " %d\n", cmd, exp_data_len, prot_bytes, data_direction);
 
 		if (data_direction != DMA_NONE) {
-			if (unlikely(vhost_scsi_mapal(cmd, prot_bytes,
+			if (unlikely(vhost_scsi_mapal(tpg, cmd, prot_bytes,
 						      &prot_iter, exp_data_len,
 						      &data_iter))) {
 				vq_err(vq, "Failed to map iov to sgl\n");
@@ -2068,11 +2166,41 @@  static ssize_t vhost_scsi_tpg_attrib_fabric_prot_type_show(
 
 	return sysfs_emit(page, "%d\n", tpg->tv_fabric_prot_type);
 }
-
 CONFIGFS_ATTR(vhost_scsi_tpg_attrib_, fabric_prot_type);
 
+static ssize_t
+vhost_scsi_tpg_attrib_check_io_alignment_store(struct config_item *item,
+					       const char *page, size_t count)
+{
+	struct se_portal_group *se_tpg = attrib_to_tpg(item);
+	struct vhost_scsi_tpg *tpg = container_of(se_tpg, struct vhost_scsi_tpg,
+						  se_tpg);
+	bool val;
+	int ret;
+
+	ret = kstrtobool(page, &val);
+	if (ret)
+		return ret;
+
+	tpg->check_io_alignment = val;
+	return count;
+}
+
+static ssize_t
+vhost_scsi_tpg_attrib_check_io_alignment_show(struct config_item *item,
+					      char *page)
+{
+	struct se_portal_group *se_tpg = attrib_to_tpg(item);
+	struct vhost_scsi_tpg *tpg = container_of(se_tpg, struct vhost_scsi_tpg,
+						  se_tpg);
+
+	return sysfs_emit(page, "%d\n", tpg->check_io_alignment);
+}
+CONFIGFS_ATTR(vhost_scsi_tpg_attrib_, check_io_alignment);
+
 static struct configfs_attribute *vhost_scsi_tpg_attrib_attrs[] = {
 	&vhost_scsi_tpg_attrib_attr_fabric_prot_type,
+	&vhost_scsi_tpg_attrib_attr_check_io_alignment,
 	NULL,
 };