diff mbox series

[v12,5/9] nvme: add copy offload support

Message ID 20230605121732.28468-6-nj.shetty@samsung.com (mailing list archive)
State New, archived
Headers show
Series [v12,1/9] block: Introduce queue limits for copy-offload support | expand

Commit Message

Nitesh Shetty June 5, 2023, 12:17 p.m. UTC
For devices supporting native copy, the nvme driver receives read and
write requests with BLK_COPY op flags.
For a read request, the nvme driver populates the payload with the source
information.
For a write request, the driver converts it to an nvme copy command using
the source information in the payload and submits it to the device.
The current design only supports a single source range.
This design is courtesy of Mikulas Patocka's token based copy.

Add trace event support for nvme_copy_cmd.
Set the queue's copy limits from the device's copy limits.

Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Javier González <javier.gonz@samsung.com>
Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
---
 drivers/nvme/host/constants.c |   1 +
 drivers/nvme/host/core.c      | 103 +++++++++++++++++++++++++++++++++-
 drivers/nvme/host/fc.c        |   5 ++
 drivers/nvme/host/nvme.h      |   7 +++
 drivers/nvme/host/pci.c       |  27 ++++++++-
 drivers/nvme/host/rdma.c      |   7 +++
 drivers/nvme/host/tcp.c       |  16 ++++++
 drivers/nvme/host/trace.c     |  19 +++++++
 include/linux/nvme.h          |  43 +++++++++++++-
 9 files changed, 220 insertions(+), 8 deletions(-)

Comments

Christoph Hellwig June 5, 2023, 1:43 p.m. UTC | #1
>  		break;
>  	case REQ_OP_READ:
> -		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
> +		if (unlikely(req->cmd_flags & REQ_COPY))
> +			nvme_setup_copy_read(ns, req);
> +		else
> +			ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
>  		break;
>  	case REQ_OP_WRITE:
> -		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
> +		if (unlikely(req->cmd_flags & REQ_COPY))
> +			ret = nvme_setup_copy_write(ns, req, cmd);
> +		else
> +			ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);

Yikes.  Overloading REQ_OP_READ and REQ_OP_WRITE with something entirely
different brings us back to the horrors of the block layer 15 years ago.
Don't do that.  Please add separate REQ_COPY_IN/OUT (or maybe
SEND/RECEIVE or whatever) methods.

> +	/* setting copy limits */
> +	if (blk_queue_flag_test_and_set(QUEUE_FLAG_COPY, q))

I don't understand this comment.

> +struct nvme_copy_token {
> +	char *subsys;
> +	struct nvme_ns *ns;
> +	sector_t src_sector;
> +	sector_t sectors;
> +};

Why do we need a subsys token?  Inter-namespace copy is pretty crazy,
and not really anything we should aim for.  But this whole token design
is pretty odd anyway.  The only thing we'd need is a sequence number /
idr / etc to find an input and output side match up, as long as we
stick to the proper namespace scope.

> +	if (unlikely((req->cmd_flags & REQ_COPY) &&
> +				(req_op(req) == REQ_OP_READ))) {
> +		blk_mq_start_request(req);
> +		return BLK_STS_OK;
> +	}

This really needs to be hidden inside of nvme_setup_cmd.  And given
that other drivers might need similar handling, the best way is probably
to have a new magic BLK_STS_* value for a request that was started but
is not actually sent to hardware.
Nitesh Shetty June 6, 2023, 11:35 a.m. UTC | #2
On 23/06/05 06:43AM, Christoph Hellwig wrote:
>>  		break;
>>  	case REQ_OP_READ:
>> -		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
>> +		if (unlikely(req->cmd_flags & REQ_COPY))
>> +			nvme_setup_copy_read(ns, req);
>> +		else
>> +			ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
>>  		break;
>>  	case REQ_OP_WRITE:
>> -		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
>> +		if (unlikely(req->cmd_flags & REQ_COPY))
>> +			ret = nvme_setup_copy_write(ns, req, cmd);
>> +		else
>> +			ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
>
>Yikes.  Overloading REQ_OP_READ and REQ_OP_WRITE with something entirely
>different brings us back to the horrors of the block layer 15 years ago.
>Don't do that.  Please add separate REQ_COPY_IN/OUT (or maybe
>SEND/RECEIVE or whatever) methods.
>

Downside will be duplicating checks which are present for read, write in
block layer, device-mapper and zoned devices.
But we can do this, shouldn't be an issue.

>> +	/* setting copy limits */
>> +	if (blk_queue_flag_test_and_set(QUEUE_FLAG_COPY, q))
>
>I don't understand this comment.
>

It was a mistake. Comment is misplaced and it should have been
"setting copy flag" instead of "setting copy limits".
Anyway now we feel this comment is redundant, will remove it.
Also, we should have used blk_queue_flag_set to enable copy offload.

>> +struct nvme_copy_token {
>> +	char *subsys;
>> +	struct nvme_ns *ns;
>> +	sector_t src_sector;
>> +	sector_t sectors;
>> +};
>
>Why do we need a subsys token?  Inter-namespace copy is pretty crazy,
>and not really anything we should aim for.  But this whole token design
>is pretty odd anyway.  The only thing we'd need is a sequence number /
>idr / etc to find an input and output side match up, as long as we
>stick to the proper namespace scope.
>

The idea behind subsys is to prevent copy across different subsystem.
For example, copy across nvme subsystem and the scsi subsystem. [1]
At present, we don't support inter-namespace(copy across NVMe namespace),
but after community feedback for previous series we left scope for it.
About idr per namespace, it will be similar to namespace check that
we are doing to prevent copy across namespace.
We went with current structure for token, as it was solving above
issues as well as provides a placeholder for storing source LBA and
number of sectors.
Do you have any suggestions on how we can store the source info if we go
with an idr based approach?

[1] https://lore.kernel.org/all/alpine.LRH.2.02.2202011327350.22481@file01.intranet.prod.int.rdu2.redhat.com/T/#m407f24fb4454d35c3283a5e51fdb04f1600463af

>> +	if (unlikely((req->cmd_flags & REQ_COPY) &&
>> +				(req_op(req) == REQ_OP_READ))) {
>> +		blk_mq_start_request(req);
>> +		return BLK_STS_OK;
>> +	}
>
>This really needs to be hidden inside of nvme_setup_cmd.  And given
>that other drivers might need similar handling, the best way is probably
>to have a new magic BLK_STS_* value for a request that was started but
>is not actually sent to hardware.

Sure we will add new BLK_STS_* for completion and move the snippet.

Thank you,
Nitesh Shetty
Christoph Hellwig June 7, 2023, 7:12 a.m. UTC | #3
On Tue, Jun 06, 2023 at 05:05:35PM +0530, Nitesh Shetty wrote:
> Downside will be duplicating checks which are present for read, write in
> block layer, device-mapper and zoned devices.
> But we can do this, shouldn't be an issue.

Yes.  Please never overload operations, this is just causing problems
everywhere, and that's why I split the operations from the flags a few
years ago.

> The idea behind subsys is to prevent copy across different subsystem.
> For example, copy across nvme subsystem and the scsi subsystem. [1]
> At present, we don't support inter-namespace(copy across NVMe namespace),
> but after community feedback for previous series we left scope for it.

Never leave scope for something that isn't actually added.  That just
creates a giant maintenance nightmare.  Cross-device copies are a giant
nightmare in general, and in the case of NVMe completely unusable
as currently done in the working group.  Messing up something that
is entirely reasonable (local copy) for something like that is a sure
way to never get this series in.
Martin K. Petersen June 8, 2023, 1:36 a.m. UTC | #4
Christoph,

> Yikes. Overloading REQ_OP_READ and REQ_OP_WRITE with something
> entirely different brings us back to the horrors of the block layer 15
> years ago. Don't do that. Please add separate REQ_COPY_IN/OUT (or
> maybe SEND/RECEIVE or whatever) methods.

I agree, I used REQ_COPY_IN and REQ_COPY_OUT in my original series.

>> +	/* setting copy limits */
>> +	if (blk_queue_flag_test_and_set(QUEUE_FLAG_COPY, q))
>
> I don't understand this comment.
>
>> +struct nvme_copy_token {
>> +	char *subsys;
>> +	struct nvme_ns *ns;
>> +	sector_t src_sector;
>> +	sector_t sectors;
>> +};
>
> Why do we need a subsys token? Inter-namespace copy is pretty crazy,
> and not really anything we should aim for. But this whole token design
> is pretty odd anyway. The only thing we'd need is a sequence number /
> idr / etc to find an input and output side match up, as long as we
> stick to the proper namespace scope.

Yeah, I don't think we need to carry this in a token. Doing the sanity
check up front in blkdev_copy_offload() should be fine. For NVMe it's
not currently possible to copy across and for SCSI we'd just make sure
the copy scope is the same for the two block devices before we even
issue the operations.
Nitesh Shetty June 8, 2023, 12:08 p.m. UTC | #5
Hi Christoph and Martin,

On 23/06/07 12:12AM, Christoph Hellwig wrote:
>On Tue, Jun 06, 2023 at 05:05:35PM +0530, Nitesh Shetty wrote:
>> Downside will be duplicating checks which are present for read, write in
>> block layer, device-mapper and zoned devices.
>> But we can do this, shouldn't be an issue.
>
>Yes.  Please never overload operations, this is just causing problems
>everywhere, and that's why I split the operations from the flags a few
>years ago.
>

Sure, we will add REQ_COPY_IN/OUT and send a new version.

>> The idea behind subsys is to prevent copy across different subsystem.
>> For example, copy across nvme subsystem and the scsi subsystem. [1]
>> At present, we don't support inter-namespace(copy across NVMe namespace),
>> but after community feedback for previous series we left scope for it.
>
>Never leave scope for something that isn't actually added.  That just
>creates a giant maintenance nightmare.  Cross-device copies are a giant
>nightmare in general, and in the case of NVMe completely unusable
>as currently done in the working group.  Messing up something that
>is entirely reasonable (local copy) for something like that is a sure
>way to never get this series in.

Sure, we can do away with subsys and realign more on single namespace copy.
We are planning to use token to store source info, such as src sector,
len and namespace. Something like below,

struct nvme_copy_token {
	struct nvme_ns *ns; // to make sure we are copying within same namespace
/* store source info during *IN operation, will be used by *OUT operation */
	sector_t src_sector;
	sector_t sectors;
};
Do you have any better way to handle this in mind ?


Thank you,
Nitesh Shetty
Christoph Hellwig June 9, 2023, 4:24 a.m. UTC | #6
On Thu, Jun 08, 2023 at 05:38:17PM +0530, Nitesh Shetty wrote:
> Sure, we can do away with subsys and realign more on single namespace copy.
> We are planning to use token to store source info, such as src sector,
> len and namespace. Something like below,
> 
> struct nvme_copy_token {
> 	struct nvme_ns *ns; // to make sure we are copying within same namespace
> /* store source info during *IN operation, will be used by *OUT operation */
> 	sector_t src_sector;
> 	sector_t sectors;
> };
> Do you have any better way to handle this in mind ?

In general every time we tried to come up with a request payload that is
not just data passed to the device it has been a nightmare.

So my gut feeling would be that bi_sector and bi_iter.bi_size are the
ranges, with multiple bios being allowed to form the input data, similar
to how we implement discard merging.

The interesting part is how we'd match up these bios.  One idea would
be that since copy by definition doesn't need integrity data we just
add a copy_id that unions it, and use a simple per-gendisk copy ID
allocator, but I'm not entirely sure how well that interacts with
stacking drivers.
Nitesh Shetty July 10, 2023, 6:14 a.m. UTC | #7
On 23/06/08 09:24PM, Christoph Hellwig wrote:
>On Thu, Jun 08, 2023 at 05:38:17PM +0530, Nitesh Shetty wrote:
>> Sure, we can do away with subsys and realign more on single namespace copy.
>> We are planning to use token to store source info, such as src sector,
>> len and namespace. Something like below,
>>
>> struct nvme_copy_token {
>> 	struct nvme_ns *ns; // to make sure we are copying within same namespace
>> /* store source info during *IN operation, will be used by *OUT operation */
>> 	sector_t src_sector;
>> 	sector_t sectors;
>> };
>> Do you have any better way to handle this in mind ?
>
>In general every time we tried to come up with a request payload that is
>not just data passed to the device it has been a nightmare.
>
>So my gut feeling would be that bi_sector and bi_iter.bi_size are the
>ranges, with multiple bios being allowed to form the input data, similar
>to how we implement discard merging.
>
>The interesting part is how we'd match up these bios.  One idea would
>be that since copy by definition doesn't need integrity data we just
>add a copy_id that unions it, and use a simple per-gendisk copy ID
>allocator, but I'm not entirely sure how well that interacts with
>stacking drivers.

V13[1] implements that route. Please see if that matches with what you had
in mind?

[1] https://lore.kernel.org/linux-nvme/20230627183629.26571-1-nj.shetty@samsung.com/

Thank you, 
Nitesh Shetty
diff mbox series

Patch

diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c
index 5e4f8848dce0..311ad67e9cf3 100644
--- a/drivers/nvme/host/constants.c
+++ b/drivers/nvme/host/constants.c
@@ -19,6 +19,7 @@  static const char * const nvme_ops[] = {
 	[nvme_cmd_resv_report] = "Reservation Report",
 	[nvme_cmd_resv_acquire] = "Reservation Acquire",
 	[nvme_cmd_resv_release] = "Reservation Release",
+	[nvme_cmd_copy] = "Copy Offload",
 	[nvme_cmd_zone_mgmt_send] = "Zone Management Send",
 	[nvme_cmd_zone_mgmt_recv] = "Zone Management Receive",
 	[nvme_cmd_zone_append] = "Zone Append",
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 1715a508496c..ce1fec07dda6 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -763,6 +763,77 @@  static inline void nvme_setup_flush(struct nvme_ns *ns,
 	cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
 }
 
+static inline void nvme_setup_copy_read(struct nvme_ns *ns, struct request *req)
+{
+	struct bio *bio = req->bio;
+	struct nvme_copy_token *token = bvec_kmap_local(&bio->bi_io_vec[0]);
+
+	token->subsys = "nvme";
+	token->ns = ns;
+	token->src_sector = bio->bi_iter.bi_sector;
+	token->sectors = bio->bi_iter.bi_size >> 9;
+}
+
+static inline blk_status_t nvme_setup_copy_write(struct nvme_ns *ns,
+	       struct request *req, struct nvme_command *cmnd)
+{
+	struct nvme_copy_range *range = NULL;
+	struct bio *bio = req->bio;
+	struct nvme_copy_token *token = bvec_kmap_local(&bio->bi_io_vec[0]);
+	sector_t src_sector, dst_sector, n_sectors;
+	u64 src_lba, dst_lba, n_lba;
+	unsigned short nr_range = 1;
+	u16 control = 0;
+
+	if (unlikely(memcmp(token->subsys, "nvme", 4)))
+		return BLK_STS_NOTSUPP;
+	if (unlikely(token->ns != ns))
+		return BLK_STS_NOTSUPP;
+
+	src_sector = token->src_sector;
+	dst_sector = bio->bi_iter.bi_sector;
+	n_sectors = token->sectors;
+	if (WARN_ON(n_sectors != bio->bi_iter.bi_size >> 9))
+		return BLK_STS_NOTSUPP;
+
+	src_lba = nvme_sect_to_lba(ns, src_sector);
+	dst_lba = nvme_sect_to_lba(ns, dst_sector);
+	n_lba = nvme_sect_to_lba(ns, n_sectors);
+
+	if (WARN_ON(!n_lba))
+		return BLK_STS_NOTSUPP;
+
+	if (req->cmd_flags & REQ_FUA)
+		control |= NVME_RW_FUA;
+
+	if (req->cmd_flags & REQ_FAILFAST_DEV)
+		control |= NVME_RW_LR;
+
+	memset(cmnd, 0, sizeof(*cmnd));
+	cmnd->copy.opcode = nvme_cmd_copy;
+	cmnd->copy.nsid = cpu_to_le32(ns->head->ns_id);
+	cmnd->copy.sdlba = cpu_to_le64(dst_lba);
+
+	range = kmalloc_array(nr_range, sizeof(*range),
+			GFP_ATOMIC | __GFP_NOWARN);
+	if (!range)
+		return BLK_STS_RESOURCE;
+
+	range[0].slba = cpu_to_le64(src_lba);
+	range[0].nlb = cpu_to_le16(n_lba - 1);
+
+	cmnd->copy.nr_range = 0;
+
+	req->special_vec.bv_page = virt_to_page(range);
+	req->special_vec.bv_offset = offset_in_page(range);
+	req->special_vec.bv_len = sizeof(*range) * nr_range;
+	req->rq_flags |= RQF_SPECIAL_PAYLOAD;
+
+	cmnd->copy.control = cpu_to_le16(control);
+
+	return BLK_STS_OK;
+}
+
 static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
 		struct nvme_command *cmnd)
 {
@@ -997,10 +1068,16 @@  blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req)
 		ret = nvme_setup_discard(ns, req, cmd);
 		break;
 	case REQ_OP_READ:
-		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
+		if (unlikely(req->cmd_flags & REQ_COPY))
+			nvme_setup_copy_read(ns, req);
+		else
+			ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
 		break;
 	case REQ_OP_WRITE:
-		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
+		if (unlikely(req->cmd_flags & REQ_COPY))
+			ret = nvme_setup_copy_write(ns, req, cmd);
+		else
+			ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
 		break;
 	case REQ_OP_ZONE_APPEND:
 		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
@@ -1742,6 +1819,26 @@  static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
 		blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
 }
 
+static void nvme_config_copy(struct gendisk *disk, struct nvme_ns *ns,
+				       struct nvme_id_ns *id)
+{
+	struct nvme_ctrl *ctrl = ns->ctrl;
+	struct request_queue *q = disk->queue;
+
+	if (!(ctrl->oncs & NVME_CTRL_ONCS_COPY)) {
+		blk_queue_max_copy_sectors_hw(q, 0);
+		blk_queue_flag_clear(QUEUE_FLAG_COPY, q);
+		return;
+	}
+
+	/* setting copy limits */
+	if (blk_queue_flag_test_and_set(QUEUE_FLAG_COPY, q))
+		return;
+
+	blk_queue_max_copy_sectors_hw(q,
+		nvme_lba_to_sect(ns, le16_to_cpu(id->mssrl)));
+}
+
 static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
 {
 	return uuid_equal(&a->uuid, &b->uuid) &&
@@ -1941,6 +2038,7 @@  static void nvme_update_disk_info(struct gendisk *disk,
 	set_capacity_and_notify(disk, capacity);
 
 	nvme_config_discard(disk, ns);
+	nvme_config_copy(disk, ns, id);
 	blk_queue_max_write_zeroes_sectors(disk->queue,
 					   ns->ctrl->max_zeroes_sectors);
 }
@@ -5244,6 +5342,7 @@  static inline void _nvme_check_size(void)
 	BUILD_BUG_ON(sizeof(struct nvme_download_firmware) != 64);
 	BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
 	BUILD_BUG_ON(sizeof(struct nvme_dsm_cmd) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_copy_command) != 64);
 	BUILD_BUG_ON(sizeof(struct nvme_write_zeroes_cmd) != 64);
 	BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
 	BUILD_BUG_ON(sizeof(struct nvme_get_log_page_command) != 64);
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 691f2df574ce..702965f5047f 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2807,6 +2807,11 @@  nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (ret)
 		return ret;
 
+	if (unlikely((rq->cmd_flags & REQ_COPY) &&
+				(req_op(rq) == REQ_OP_READ))) {
+		blk_mq_end_request(rq, BLK_STS_OK);
+		return BLK_STS_OK;
+	}
 	/*
 	 * nvme core doesn't quite treat the rq opaquely. Commands such
 	 * as WRITE ZEROES will return a non-zero rq payload_bytes yet
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 8f07aee68d59..a10bb80faa9c 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -508,6 +508,13 @@  struct nvme_ns {
 
 };
 
+struct nvme_copy_token {
+	char *subsys;
+	struct nvme_ns *ns;
+	sector_t src_sector;
+	sector_t sectors;
+};
+
 /* NVMe ns supports metadata actions by the controller (generate/strip) */
 static inline bool nvme_ns_has_pi(struct nvme_ns *ns)
 {
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 492f319ebdf3..c231565b69cf 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -495,16 +495,19 @@  static inline void nvme_sq_copy_cmd(struct nvme_queue *nvmeq,
 		nvmeq->sq_tail = 0;
 }
 
-static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
+static inline void nvme_commit_sq_db(struct nvme_queue *nvmeq)
 {
-	struct nvme_queue *nvmeq = hctx->driver_data;
-
 	spin_lock(&nvmeq->sq_lock);
 	if (nvmeq->sq_tail != nvmeq->last_sq_tail)
 		nvme_write_sq_db(nvmeq, true);
 	spin_unlock(&nvmeq->sq_lock);
 }
 
+static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
+{
+	nvme_commit_sq_db(hctx->driver_data);
+}
+
 static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
 				     int nseg)
 {
@@ -848,6 +851,12 @@  static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
 	if (ret)
 		return ret;
 
+	if (unlikely((req->cmd_flags & REQ_COPY) &&
+				(req_op(req) == REQ_OP_READ))) {
+		blk_mq_start_request(req);
+		return BLK_STS_OK;
+	}
+
 	if (blk_rq_nr_phys_segments(req)) {
 		ret = nvme_map_data(dev, req, &iod->cmd);
 		if (ret)
@@ -894,6 +903,18 @@  static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	ret = nvme_prep_rq(dev, req);
 	if (unlikely(ret))
 		return ret;
+	if (unlikely((req->cmd_flags & REQ_COPY) &&
+				(req_op(req) == REQ_OP_READ))) {
+		blk_mq_set_request_complete(req);
+		blk_mq_end_request(req, BLK_STS_OK);
+		/* Commit the sq if copy read was the last req in the list,
+		 * as copy read deoesn't update sq db
+		 */
+		if (bd->last)
+			nvme_commit_sq_db(nvmeq);
+		return ret;
+	}
+
 	spin_lock(&nvmeq->sq_lock);
 	nvme_sq_copy_cmd(nvmeq, &iod->cmd);
 	nvme_write_sq_db(nvmeq, bd->last);
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 0eb79696fb73..be1d20ac8bb0 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -2038,6 +2038,13 @@  static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 	nvme_start_request(rq);
 
+	if (unlikely((rq->cmd_flags & REQ_COPY) &&
+				(req_op(rq) == REQ_OP_READ))) {
+		blk_mq_end_request(rq, BLK_STS_OK);
+		ret = BLK_STS_OK;
+		goto unmap_qe;
+	}
+
 	if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
 	    queue->pi_support &&
 	    (c->common.opcode == nvme_cmd_write ||
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index bf0230442d57..5ba1bb35c557 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2373,6 +2373,11 @@  static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
 	if (ret)
 		return ret;
 
+	if (unlikely((rq->cmd_flags & REQ_COPY) &&
+				(req_op(rq) == REQ_OP_READ))) {
+		return BLK_STS_OK;
+	}
+
 	req->state = NVME_TCP_SEND_CMD_PDU;
 	req->status = cpu_to_le16(NVME_SC_SUCCESS);
 	req->offset = 0;
@@ -2441,6 +2446,17 @@  static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 	nvme_start_request(rq);
 
+	if (unlikely((rq->cmd_flags & REQ_COPY) &&
+				(req_op(rq) == REQ_OP_READ))) {
+		blk_mq_set_request_complete(rq);
+		blk_mq_end_request(rq, BLK_STS_OK);
+		/* if copy read is the last req queue tcp reqs */
+		if (bd->last && nvme_tcp_queue_more(queue))
+			queue_work_on(queue->io_cpu, nvme_tcp_wq,
+					&queue->io_work);
+		return ret;
+	}
+
 	nvme_tcp_queue_request(req, true, bd->last);
 
 	return BLK_STS_OK;
diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c
index 1c36fcedea20..da4a7494e5a7 100644
--- a/drivers/nvme/host/trace.c
+++ b/drivers/nvme/host/trace.c
@@ -150,6 +150,23 @@  static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10)
 	return ret;
 }
 
+static const char *nvme_trace_copy(struct trace_seq *p, u8 *cdw10)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+	u64 slba = get_unaligned_le64(cdw10);
+	u8 nr_range = get_unaligned_le16(cdw10 + 8);
+	u16 control = get_unaligned_le16(cdw10 + 10);
+	u32 dsmgmt = get_unaligned_le32(cdw10 + 12);
+	u32 reftag = get_unaligned_le32(cdw10 +  16);
+
+	trace_seq_printf(p,
+			 "slba=%llu, nr_range=%u, ctrl=0x%x, dsmgmt=%u, reftag=%u",
+			 slba, nr_range, control, dsmgmt, reftag);
+	trace_seq_putc(p, 0);
+
+	return ret;
+}
+
 static const char *nvme_trace_dsm(struct trace_seq *p, u8 *cdw10)
 {
 	const char *ret = trace_seq_buffer_ptr(p);
@@ -243,6 +260,8 @@  const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p,
 		return nvme_trace_zone_mgmt_send(p, cdw10);
 	case nvme_cmd_zone_mgmt_recv:
 		return nvme_trace_zone_mgmt_recv(p, cdw10);
+	case nvme_cmd_copy:
+		return nvme_trace_copy(p, cdw10);
 	default:
 		return nvme_trace_common(p, cdw10);
 	}
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 182b6d614eb1..bbd877111b57 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -337,7 +337,7 @@  struct nvme_id_ctrl {
 	__u8			nvscc;
 	__u8			nwpc;
 	__le16			acwu;
-	__u8			rsvd534[2];
+	__le16			ocfs;
 	__le32			sgls;
 	__le32			mnan;
 	__u8			rsvd544[224];
@@ -365,6 +365,7 @@  enum {
 	NVME_CTRL_ONCS_WRITE_ZEROES		= 1 << 3,
 	NVME_CTRL_ONCS_RESERVATIONS		= 1 << 5,
 	NVME_CTRL_ONCS_TIMESTAMP		= 1 << 6,
+	NVME_CTRL_ONCS_COPY			= 1 << 8,
 	NVME_CTRL_VWC_PRESENT			= 1 << 0,
 	NVME_CTRL_OACS_SEC_SUPP                 = 1 << 0,
 	NVME_CTRL_OACS_NS_MNGT_SUPP		= 1 << 3,
@@ -414,7 +415,10 @@  struct nvme_id_ns {
 	__le16			npdg;
 	__le16			npda;
 	__le16			nows;
-	__u8			rsvd74[18];
+	__le16			mssrl;
+	__le32			mcl;
+	__u8			msrc;
+	__u8			rsvd91[11];
 	__le32			anagrpid;
 	__u8			rsvd96[3];
 	__u8			nsattr;
@@ -831,6 +835,7 @@  enum nvme_opcode {
 	nvme_cmd_resv_report	= 0x0e,
 	nvme_cmd_resv_acquire	= 0x11,
 	nvme_cmd_resv_release	= 0x15,
+	nvme_cmd_copy		= 0x19,
 	nvme_cmd_zone_mgmt_send	= 0x79,
 	nvme_cmd_zone_mgmt_recv	= 0x7a,
 	nvme_cmd_zone_append	= 0x7d,
@@ -854,7 +859,8 @@  enum nvme_opcode {
 		nvme_opcode_name(nvme_cmd_resv_release),	\
 		nvme_opcode_name(nvme_cmd_zone_mgmt_send),	\
 		nvme_opcode_name(nvme_cmd_zone_mgmt_recv),	\
-		nvme_opcode_name(nvme_cmd_zone_append))
+		nvme_opcode_name(nvme_cmd_zone_append),		\
+		nvme_opcode_name(nvme_cmd_copy))
 
 
 
@@ -1031,6 +1037,36 @@  struct nvme_dsm_range {
 	__le64			slba;
 };
 
+struct nvme_copy_command {
+	__u8                    opcode;
+	__u8                    flags;
+	__u16                   command_id;
+	__le32                  nsid;
+	__u64                   rsvd2;
+	__le64                  metadata;
+	union nvme_data_ptr     dptr;
+	__le64                  sdlba;
+	__u8			nr_range;
+	__u8			rsvd12;
+	__le16                  control;
+	__le16                  rsvd13;
+	__le16			dspec;
+	__le32                  ilbrt;
+	__le16                  lbat;
+	__le16                  lbatm;
+};
+
+struct nvme_copy_range {
+	__le64			rsvd0;
+	__le64			slba;
+	__le16			nlb;
+	__le16			rsvd18;
+	__le32			rsvd20;
+	__le32			eilbrt;
+	__le16			elbat;
+	__le16			elbatm;
+};
+
 struct nvme_write_zeroes_cmd {
 	__u8			opcode;
 	__u8			flags;
@@ -1792,6 +1828,7 @@  struct nvme_command {
 		struct nvme_download_firmware dlfw;
 		struct nvme_format_cmd format;
 		struct nvme_dsm_cmd dsm;
+		struct nvme_copy_command copy;
 		struct nvme_write_zeroes_cmd write_zeroes;
 		struct nvme_zone_mgmt_send_cmd zms;
 		struct nvme_zone_mgmt_recv_cmd zmr;