diff mbox series

[v3,15/15] nvme: Ensure atomic writes will be executed atomically

Message ID 20240124113841.31824-16-john.g.garry@oracle.com (mailing list archive)
State New, archived
Headers show
Series block atomic writes | expand

Commit Message

John Garry Jan. 24, 2024, 11:38 a.m. UTC
From: Alan Adamson <alan.adamson@oracle.com>

There is no dedicated NVMe atomic write command (which may error for a
command which exceeds the controller atomic write limits).

As an insurance policy against the block layer sending requests which
cannot be executed atomically, add a check in the queue path.

Signed-off-by: Alan Adamson <alan.adamson@oracle.com>
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 drivers/nvme/host/core.c | 35 +++++++++++++++++++++++++++++++++--
 drivers/nvme/host/nvme.h |  2 ++
 2 files changed, 35 insertions(+), 2 deletions(-)

Comments

Keith Busch Jan. 25, 2024, 12:52 a.m. UTC | #1
On Wed, Jan 24, 2024 at 11:38:41AM +0000, John Garry wrote:
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 5045c84f2516..6a34a5d92088 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -911,6 +911,32 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
>  	if (req->cmd_flags & REQ_RAHEAD)
>  		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
>  
> +	/*
> +	 * Ensure that nothing has been sent which cannot be executed
> +	 * atomically.
> +	 */
> +	if (req->cmd_flags & REQ_ATOMIC) {
> +		struct nvme_ns_head *head = ns->head;
> +		u32 boundary_bytes = head->atomic_boundary;
> +
> +		if (blk_rq_bytes(req) > ns->head->atomic_max)
> +			return BLK_STS_IOERR;
> +
> +		if (boundary_bytes) {
> +			u32 mask = boundary_bytes - 1, imask = ~mask;
> +			u32 start = blk_rq_pos(req) << SECTOR_SHIFT;
> +			u32 end = start + blk_rq_bytes(req);
> +
> +			if (blk_rq_bytes(req) > boundary_bytes)
> +				return BLK_STS_IOERR;
> +
> +			if (((start & imask) != (end & imask)) &&
> +			    (end & mask)) {
> +				return BLK_STS_IOERR;
> +			}
> +		}
> +	}

Aren't these new fields, atomic_max and atomic_boundary, duplicates of
the equivalent queue limits? Let's just use the queue limits instead.

And couldn't we generically validate the constraints are not violated in
submit_bio_noacct() instead of doing that in the low level driver? The
driver assumes all other requests are already sanity checked, so I don't
think we should change the responsibility for that just for this flag.
John Garry Jan. 25, 2024, 11:28 a.m. UTC | #2
On 25/01/2024 00:52, Keith Busch wrote:
> On Wed, Jan 24, 2024 at 11:38:41AM +0000, John Garry wrote:
>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>> index 5045c84f2516..6a34a5d92088 100644
>> --- a/drivers/nvme/host/core.c
>> +++ b/drivers/nvme/host/core.c
>> @@ -911,6 +911,32 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
>>   	if (req->cmd_flags & REQ_RAHEAD)
>>   		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
>>   
>> +	/*
>> +	 * Ensure that nothing has been sent which cannot be executed
>> +	 * atomically.
>> +	 */
>> +	if (req->cmd_flags & REQ_ATOMIC) {
>> +		struct nvme_ns_head *head = ns->head;
>> +		u32 boundary_bytes = head->atomic_boundary;
>> +
>> +		if (blk_rq_bytes(req) > ns->head->atomic_max)
>> +			return BLK_STS_IOERR;
>> +
>> +		if (boundary_bytes) {
>> +			u32 mask = boundary_bytes - 1, imask = ~mask;
>> +			u32 start = blk_rq_pos(req) << SECTOR_SHIFT;
>> +			u32 end = start + blk_rq_bytes(req);
>> +
>> +			if (blk_rq_bytes(req) > boundary_bytes)
>> +				return BLK_STS_IOERR;
>> +
>> +			if (((start & imask) != (end & imask)) &&
>> +			    (end & mask)) {
>> +				return BLK_STS_IOERR;
>> +			}
>> +		}
>> +	}
> Aren't these new fields, atomic_max and atomic_boundary, duplicates of
> the equivalent queue limits? Let's just use the queue limits instead.

I was making a copy of these limits in the driver out of an abundance of 
caution. I suppose the atomic_max and atomic_boundary values won't be 
modified in the block layer, so I could use them instead.

> 
> And couldn't we generically validate the constraints are not violated in
> submit_bio_noacct() instead of doing that in the low level driver? The
> driver assumes all other requests are already sanity checked, so I don't
> think we should change the responsibility for that just for this flag.

As a safety mechanism, we want to ensure the complete stack is not 
giving us out-of-limits atomic writes.

We have limits checks in XFS iomap and fops.c, but we would also want to 
ensure that the the block layer is not doing anything it shouldn't be 
doing after submit_bio_noacct(), like merging atomic write BIOs which 
straddle a boundary or exceed atomic_max (if there were any merging).

The SCSI standard already has provision for error'ing an atomic write 
command which exceeds the target atomic write capabilities, while NVMe 
doesn't.

BTW, Christoph did mention that he would like to see this:
https://lore.kernel.org/linux-nvme/20231109153603.GA2188@lst.de/

Thanks,
John
Chaitanya Kulkarni Jan. 26, 2024, 3:50 a.m. UTC | #3
On 1/24/2024 4:52 PM, Keith Busch wrote:
> On Wed, Jan 24, 2024 at 11:38:41AM +0000, John Garry wrote:
>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>> index 5045c84f2516..6a34a5d92088 100644
>> --- a/drivers/nvme/host/core.c
>> +++ b/drivers/nvme/host/core.c
>> @@ -911,6 +911,32 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
>>   	if (req->cmd_flags & REQ_RAHEAD)
>>   		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
>>   
>> +	/*
>> +	 * Ensure that nothing has been sent which cannot be executed
>> +	 * atomically.
>> +	 */
>> +	if (req->cmd_flags & REQ_ATOMIC) {
>> +		struct nvme_ns_head *head = ns->head;
>> +		u32 boundary_bytes = head->atomic_boundary;
>> +
>> +		if (blk_rq_bytes(req) > ns->head->atomic_max)
>> +			return BLK_STS_IOERR;
>> +
>> +		if (boundary_bytes) {
>> +			u32 mask = boundary_bytes - 1, imask = ~mask;
>> +			u32 start = blk_rq_pos(req) << SECTOR_SHIFT;
>> +			u32 end = start + blk_rq_bytes(req);
>> +
>> +			if (blk_rq_bytes(req) > boundary_bytes)
>> +				return BLK_STS_IOERR;
>> +
>> +			if (((start & imask) != (end & imask)) &&
>> +			    (end & mask)) {
>> +				return BLK_STS_IOERR;
>> +			}
>> +		}
>> +	}
> 
> Aren't these new fields, atomic_max and atomic_boundary, duplicates of
> the equivalent queue limits? Let's just use the queue limits instead.
> 
> And couldn't we generically validate the constraints are not violated in
> submit_bio_noacct() instead of doing that in the low level driver? The
> driver assumes all other requests are already sanity checked, so I don't
> think we should change the responsibility for that just for this flag.
> 

does it makes sense to move about code to the helper ? perhaps inline ?

-ck
Christoph Hellwig Jan. 29, 2024, 6:20 a.m. UTC | #4
On Thu, Jan 25, 2024 at 11:28:22AM +0000, John Garry wrote:
> We have limits checks in XFS iomap and fops.c, but we would also want to 
> ensure that the the block layer is not doing anything it shouldn't be doing 
> after submit_bio_noacct(), like merging atomic write BIOs which straddle a 
> boundary or exceed atomic_max (if there were any merging).
>
> The SCSI standard already has provision for error'ing an atomic write 
> command which exceeds the target atomic write capabilities, while NVMe 
> doesn't.

Can you get Oracle to propose this for NVMe?  It always helps if these
suggestions come from a large buyer of NVMe equipment.

> BTW, Christoph did mention that he would like to see this:
> https://lore.kernel.org/linux-nvme/20231109153603.GA2188@lst.de/

I can probably live with a sufficiently low-level block layer check.
John Garry Jan. 29, 2024, 9:36 a.m. UTC | #5
On 29/01/2024 06:20, Christoph Hellwig wrote:
> On Thu, Jan 25, 2024 at 11:28:22AM +0000, John Garry wrote:
>> We have limits checks in XFS iomap and fops.c, but we would also want to
>> ensure that the the block layer is not doing anything it shouldn't be doing
>> after submit_bio_noacct(), like merging atomic write BIOs which straddle a
>> boundary or exceed atomic_max (if there were any merging).
>>
>> The SCSI standard already has provision for error'ing an atomic write
>> command which exceeds the target atomic write capabilities, while NVMe
>> doesn't.
> 
> Can you get Oracle to propose this for NVMe?  It always helps if these
> suggestions come from a large buyer of NVMe equipment.

I'll let Martin comment on that.

> 
>> BTW, Christoph did mention that he would like to see this:
>> https://lore.kernel.org/linux-nvme/20231109153603.GA2188@lst.de/
> 
> I can probably live with a sufficiently low-level block layer check.

That would probably be in blk_mq_dispatch_rq_list() for block drivers 
with .queue_rq set, but I would need to check for a good place for 
->queue_rqs . I can't imagine that we just want to inefficiently iter 
all rqs at the ->queue_rqs call point.

However considering the nature of this change, it is not a good sign 
that we/I need to check... I'd be more inclined to leave as is.

Thanks,
John
Christoph Hellwig Jan. 29, 2024, 2:39 p.m. UTC | #6
On Mon, Jan 29, 2024 at 09:36:38AM +0000, John Garry wrote:
> That would probably be in blk_mq_dispatch_rq_list() for block drivers with 
> .queue_rq set, but I would need to check for a good place for ->queue_rqs . 
> I can't imagine that we just want to inefficiently iter all rqs at the 
> ->queue_rqs call point.
>
> However considering the nature of this change, it is not a good sign that 
> we/I need to check... I'd be more inclined to leave as is.

Heh.  At least early on having the checks in one place in nvme makes
me feel easier for sure.  If we can easily use the block limits for
the checks we shouldn't have to keep duplicate values in nvme, though.
Christoph Hellwig Feb. 13, 2024, 6:42 a.m. UTC | #7
If we don't end up doing the checks in the block layer:

> +	/*
> +	 * Ensure that nothing has been sent which cannot be executed
> +	 * atomically.
> +	 */
> +	if (req->cmd_flags & REQ_ATOMIC) {
> +		struct nvme_ns_head *head = ns->head;
> +		u32 boundary_bytes = head->atomic_boundary;

... please split the checks into a helper.  And merge them into the
previous patch.
John Garry Feb. 13, 2024, 2:07 p.m. UTC | #8
On 13/02/2024 06:42, Christoph Hellwig wrote:
> If we don't end up doing the checks in the block layer:
> 
>> +	/*
>> +	 * Ensure that nothing has been sent which cannot be executed
>> +	 * atomically.
>> +	 */
>> +	if (req->cmd_flags & REQ_ATOMIC) {
>> +		struct nvme_ns_head *head = ns->head;
>> +		u32 boundary_bytes = head->atomic_boundary;
> 
> ... please split the checks into a helper. 

ok

> And merge them into the
> previous patch.

Fine, if you prefer that.

>
diff mbox series

Patch

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 5045c84f2516..6a34a5d92088 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -911,6 +911,32 @@  static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
 	if (req->cmd_flags & REQ_RAHEAD)
 		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
 
+	/*
+	 * Ensure that nothing has been sent which cannot be executed
+	 * atomically.
+	 */
+	if (req->cmd_flags & REQ_ATOMIC) {
+		struct nvme_ns_head *head = ns->head;
+		u32 boundary_bytes = head->atomic_boundary;
+
+		if (blk_rq_bytes(req) > ns->head->atomic_max)
+			return BLK_STS_IOERR;
+
+		if (boundary_bytes) {
+			u32 mask = boundary_bytes - 1, imask = ~mask;
+			u32 start = blk_rq_pos(req) << SECTOR_SHIFT;
+			u32 end = start + blk_rq_bytes(req);
+
+			if (blk_rq_bytes(req) > boundary_bytes)
+				return BLK_STS_IOERR;
+
+			if (((start & imask) != (end & imask)) &&
+			    (end & mask)) {
+				return BLK_STS_IOERR;
+			}
+		}
+	}
+
 	cmnd->rw.opcode = op;
 	cmnd->rw.flags = 0;
 	cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
@@ -1912,7 +1938,8 @@  static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
 }
 
 static void nvme_update_atomic_write_disk_info(struct gendisk *disk,
-		struct nvme_ctrl *ctrl, struct nvme_id_ns *id, u32 bs, u32 atomic_bs)
+		struct nvme_ctrl *ctrl, struct nvme_ns_head *head,
+		struct nvme_id_ns *id, u32 bs, u32 atomic_bs)
 {
 	unsigned int unit_min = 0, unit_max = 0, boundary = 0, max_bytes = 0;
 	struct request_queue *q = disk->queue;
@@ -1943,6 +1970,9 @@  static void nvme_update_atomic_write_disk_info(struct gendisk *disk,
 		unit_max = rounddown_pow_of_two(atomic_bs);
 	}
 
+	head->atomic_max = max_bytes;
+	head->atomic_boundary = boundary;
+
 	blk_queue_atomic_write_max_bytes(q, max_bytes);
 	blk_queue_atomic_write_unit_min_sectors(q, unit_min >> SECTOR_SHIFT);
 	blk_queue_atomic_write_unit_max_sectors(q, unit_max >> SECTOR_SHIFT);
@@ -1980,7 +2010,8 @@  static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk,
 		else
 			atomic_bs = (1 + ctrl->subsys->awupf) * bs;
 
-		nvme_update_atomic_write_disk_info(disk, ctrl, id, bs, atomic_bs);
+		nvme_update_atomic_write_disk_info(disk, ctrl, head, id,
+						   bs, atomic_bs);
 	}
 
 	if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 030c80818240..938060c85e6f 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -476,6 +476,8 @@  struct nvme_ns_head {
 	struct device		cdev_device;
 
 	struct gendisk		*disk;
+	u32 atomic_max;
+	u32 atomic_boundary;
 #ifdef CONFIG_NVME_MULTIPATH
 	struct bio_list		requeue_list;
 	spinlock_t		requeue_lock;