diff mbox series

[21/21] nvme: Support atomic writes

Message ID 20230929102726.2985188-22-john.g.garry@oracle.com (mailing list archive)
State New, archived
Headers show
Series block atomic writes | expand

Commit Message

John Garry Sept. 29, 2023, 10:27 a.m. UTC
From: Alan Adamson <alan.adamson@oracle.com>

Support reading atomic write registers to fill in request_queue
properties.

Use following method to calculate limits:
atomic_write_max_bytes = flp2(NAWUPF ?: AWUPF)
atomic_write_unit_min = logical_block_size
atomic_write_unit_max = flp2(NAWUPF ?: AWUPF)
atomic_write_boundary = NABSPF

Signed-off-by: Alan Adamson <alan.adamson@oracle.com>
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 drivers/nvme/host/core.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

Comments

Pankaj Raghav Oct. 4, 2023, 11:39 a.m. UTC | #1
> +++ b/drivers/nvme/host/core.c
> @@ -1926,6 +1926,35 @@ static void nvme_update_disk_info(struct gendisk *disk,
>  	blk_queue_io_min(disk->queue, phys_bs);
>  	blk_queue_io_opt(disk->queue, io_opt);
>  
> +	atomic_bs = rounddown_pow_of_two(atomic_bs);
> +	if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) {
> +		if (id->nabo) {
> +			dev_err(ns->ctrl->device, "Support atomic NABO=%x\n",
> +				id->nabo);
> +		} else {
> +			u32 boundary = 0;
> +
> +			if (le16_to_cpu(id->nabspf))
> +				boundary = (le16_to_cpu(id->nabspf) + 1) * bs;
> +
> +			if (is_power_of_2(boundary) || !boundary) {
> +				blk_queue_atomic_write_max_bytes(disk->queue, atomic_bs);
> +				blk_queue_atomic_write_unit_min_sectors(disk->queue, 1);
> +				blk_queue_atomic_write_unit_max_sectors(disk->queue,
> +									atomic_bs / bs);
blk_queue_atomic_write_unit_[min| max]_sectors expects sectors (512 bytes unit)
as input but no conversion is done here from device logical block size
to SECTORs.
> +				blk_queue_atomic_write_boundary_bytes(disk->queue, boundary);
> +			} else {
> +				dev_err(ns->ctrl->device, "Unsupported atomic boundary=0x%x\n",
> +					boundary);
> +			}
>
John Garry Oct. 5, 2023, 10:24 a.m. UTC | #2
On 04/10/2023 12:39, Pankaj Raghav wrote:
>> +++ b/drivers/nvme/host/core.c
>> @@ -1926,6 +1926,35 @@ static void nvme_update_disk_info(struct gendisk *disk,
>>   	blk_queue_io_min(disk->queue, phys_bs);
>>   	blk_queue_io_opt(disk->queue, io_opt);
>>   
>> +	atomic_bs = rounddown_pow_of_two(atomic_bs);
>> +	if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) {
>> +		if (id->nabo) {
>> +			dev_err(ns->ctrl->device, "Support atomic NABO=%x\n",
>> +				id->nabo);
>> +		} else {
>> +			u32 boundary = 0;
>> +
>> +			if (le16_to_cpu(id->nabspf))
>> +				boundary = (le16_to_cpu(id->nabspf) + 1) * bs;
>> +
>> +			if (is_power_of_2(boundary) || !boundary) {

note to self/Alan: boundary just needs to be multiple of atomic write 
unit max, and not necessarily a power-of-2

>> +				blk_queue_atomic_write_max_bytes(disk->queue, atomic_bs);
>> +				blk_queue_atomic_write_unit_min_sectors(disk->queue, 1);
>> +				blk_queue_atomic_write_unit_max_sectors(disk->queue,
>> +									atomic_bs / bs);
> blk_queue_atomic_write_unit_[min| max]_sectors expects sectors (512 bytes unit)
> as input but no conversion is done here from device logical block size
> to SECTORs.

Yeah, you are right. I think that we can just use:

blk_queue_atomic_write_unit_max_sectors(disk->queue,
atomic_bs >> SECTOR_SHIFT);

Thanks,
John

>> +				blk_queue_atomic_write_boundary_bytes(disk->queue, boundary);
>> +			} else {
>> +				dev_err(ns->ctrl->device, "Unsupported atomic boundary=0x%x\n",
>> +					boundary);
>> +			}
>>
Pankaj Raghav Oct. 5, 2023, 1:32 p.m. UTC | #3
>>> +                blk_queue_atomic_write_max_bytes(disk->queue, atomic_bs);
>>> +                blk_queue_atomic_write_unit_min_sectors(disk->queue, 1);
>>> +                blk_queue_atomic_write_unit_max_sectors(disk->queue,
>>> +                                    atomic_bs / bs);
>> blk_queue_atomic_write_unit_[min| max]_sectors expects sectors (512 bytes unit)
>> as input but no conversion is done here from device logical block size
>> to SECTORs.
> 
> Yeah, you are right. I think that we can just use:
> 
> blk_queue_atomic_write_unit_max_sectors(disk->queue,
> atomic_bs >> SECTOR_SHIFT);
> 

Makes sense.
I still don't grok the difference between max_bytes and unit_max_sectors here.
(Maybe NVMe spec does not differentiate it?)

I assume min_sectors should be as follows instead of setting it to 1 (512 bytes)?

blk_queue_atomic_write_unit_min_sectors(disk->queue, bs >> SECTORS_SHIFT);


> Thanks,
> John
> 
>>> +                blk_queue_atomic_write_boundary_bytes(disk->queue, boundary);
>>> +            } else {
>>> +                dev_err(ns->ctrl->device, "Unsupported atomic boundary=0x%x\n",
>>> +                    boundary);
>>> +            }
>>>
>
John Garry Oct. 5, 2023, 3:05 p.m. UTC | #4
On 05/10/2023 14:32, Pankaj Raghav wrote:
>>> te_unit_[min| max]_sectors expects sectors (512 bytes unit)
>>> as input but no conversion is done here from device logical block size
>>> to SECTORs.
>> Yeah, you are right. I think that we can just use:
>>
>> blk_queue_atomic_write_unit_max_sectors(disk->queue,
>> atomic_bs >> SECTOR_SHIFT);
>>
> Makes sense.
> I still don't grok the difference between max_bytes and unit_max_sectors here.
> (Maybe NVMe spec does not differentiate it?)

I think that max_bytes does not need to be a power-of-2 and could be 
relaxed.

Having said that, max_bytes comes into play for merging of bios - so if 
we are in a scenario with no merging, then may a well leave 
atomic_write_max_bytes == atomic_write_unit_max.

But let us check this proposal to relax.

> 
> I assume min_sectors should be as follows instead of setting it to 1 (512 bytes)?
> 
> blk_queue_atomic_write_unit_min_sectors(disk->queue, bs >> SECTORS_SHIFT);

Yeah, right, we want unit_min to be the logical block size.

Thanks,
John
Christoph Hellwig Nov. 9, 2023, 3:36 p.m. UTC | #5
> +			if (le16_to_cpu(id->nabspf))
> +				boundary = (le16_to_cpu(id->nabspf) + 1) * bs;
> +
> +			if (is_power_of_2(boundary) || !boundary) {
> +				blk_queue_atomic_write_max_bytes(disk->queue, atomic_bs);
> +				blk_queue_atomic_write_unit_min_sectors(disk->queue, 1);
> +				blk_queue_atomic_write_unit_max_sectors(disk->queue,
> +									atomic_bs / bs);
> +				blk_queue_atomic_write_boundary_bytes(disk->queue, boundary);
> +			} else {
> +				dev_err(ns->ctrl->device, "Unsupported atomic boundary=0x%x\n",
> +					boundary);
> +			}

Please figure out a way to split the atomic configuration into a
helper and avoid all those crazy long lines,  preferable also avoid
the double calls to the block helpers as well while you're at it.

Also I really want a check in the NVMe I/O path that any request
with the atomic flag set actually adhers to the limits to at least
partially paper over the annoying lack of a separate write atomic
command in nvme.
Matthew Wilcox (Oracle) Nov. 9, 2023, 3:42 p.m. UTC | #6
On Thu, Nov 09, 2023 at 04:36:03PM +0100, Christoph Hellwig wrote:
> Also I really want a check in the NVMe I/O path that any request
> with the atomic flag set actually adhers to the limits to at least
> partially paper over the annoying lack of a separate write atomic
> command in nvme.

That wasn't the model we had in mind.  In our thinking, it was fine to
send a write that crossed the atomic write limit, but the drive wouldn't
guarantee that it was atomic except at the atomic write boundary.
Eg with an AWUN of 16kB, you could send five 16kB writes, combine them
into a single 80kB write, and if the power failed midway through, the
drive would guarantee that it had written 0, 16kB, 32kB, 48kB, 64kB or
all 80kB.  Not necessarily in order; it might have written bytes 16-32kB,
64-80kB and not the other three.
Christoph Hellwig Nov. 9, 2023, 3:46 p.m. UTC | #7
On Thu, Nov 09, 2023 at 03:42:40PM +0000, Matthew Wilcox wrote:
> That wasn't the model we had in mind.  In our thinking, it was fine to
> send a write that crossed the atomic write limit, but the drive wouldn't
> guarantee that it was atomic except at the atomic write boundary.
> Eg with an AWUN of 16kB, you could send five 16kB writes, combine them
> into a single 80kB write, and if the power failed midway through, the
> drive would guarantee that it had written 0, 16kB, 32kB, 48kB, 64kB or
> all 80kB.  Not necessarily in order; it might have written bytes 16-32kB,
> 64-80kB and not the other three.

I can see some use for that, but I'm really worried that debugging
problems in the I/O merging and splitting will be absolute hell.
John Garry Nov. 9, 2023, 7:08 p.m. UTC | #8
On 09/11/2023 15:46, Christoph Hellwig wrote:
> On Thu, Nov 09, 2023 at 03:42:40PM +0000, Matthew Wilcox wrote:
>> That wasn't the model we had in mind.  In our thinking, it was fine to
>> send a write that crossed the atomic write limit, but the drive wouldn't
>> guarantee that it was atomic except at the atomic write boundary.
>> Eg with an AWUN of 16kB, you could send five 16kB writes, combine them
>> into a single 80kB write, and if the power failed midway through, the
>> drive would guarantee that it had written 0, 16kB, 32kB, 48kB, 64kB or
>> all 80kB.  Not necessarily in order; it might have written bytes 16-32kB,
>> 64-80kB and not the other three.

I didn't think that there are any atomic write guarantees at all if we 
ever exceed AWUN or AWUPF or cross the atomic write boundary (if any).

> I can see some use for that, but I'm really worried that debugging
> problems in the I/O merging and splitting will be absolute hell.

Even if bios were merged for NVMe the total request length still should 
not exceed AWUPF. However a check can be added to ensure this for a 
submitted atomic write request.

As for splitting, it is not permitted for atomic writes and only a 
single bio is permitted to be created per write. Are more integrity 
checks required?

Thanks,
John
Christoph Hellwig Nov. 10, 2023, 6:29 a.m. UTC | #9
On Thu, Nov 09, 2023 at 07:08:40PM +0000, John Garry wrote:
>>> send a write that crossed the atomic write limit, but the drive wouldn't
>>> guarantee that it was atomic except at the atomic write boundary.
>>> Eg with an AWUN of 16kB, you could send five 16kB writes, combine them
>>> into a single 80kB write, and if the power failed midway through, the
>>> drive would guarantee that it had written 0, 16kB, 32kB, 48kB, 64kB or
>>> all 80kB.  Not necessarily in order; it might have written bytes 16-32kB,
>>> 64-80kB and not the other three.
>
> I didn't think that there are any atomic write guarantees at all if we ever 
> exceed AWUN or AWUPF or cross the atomic write boundary (if any).

You're quoting a few mails before me, but I agree.

>> I can see some use for that, but I'm really worried that debugging
>> problems in the I/O merging and splitting will be absolute hell.
>
> Even if bios were merged for NVMe the total request length still should not 
> exceed AWUPF. However a check can be added to ensure this for a submitted 
> atomic write request.

Yes.

> As for splitting, it is not permitted for atomic writes and only a single 
> bio is permitted to be created per write. Are more integrity checks 
> required?

I'm more worried about the problem where we accidentally add a split.
The whole bio merge/split path is convoluted and we had plenty of
bugs in the past by not looking at all the correct flags or opcodes.
John Garry Nov. 10, 2023, 8:44 a.m. UTC | #10
On 10/11/2023 06:29, Christoph Hellwig wrote:
> Yes.
> 
>> As for splitting, it is not permitted for atomic writes and only a single
>> bio is permitted to be created per write. Are more integrity checks
>> required?
> I'm more worried about the problem where we accidentally add a split.
> The whole bio merge/split path is convoluted and we had plenty of
> bugs in the past by not looking at all the correct flags or opcodes.

Yes, this is always a concern.

Some thoughts on things which could be done:
- For no merging, ensure request length is a power-of-2 when enqueuing 
to block driver. This is simple but not watertight.
- Create a per-bio checksum when the bio is created for the atomic write 
and ensure integrity when queuing to the block driver
- a new block layer datapath which ensures no merging or splitting, but 
this seems a bit OTT

BTW, on topic of splitting, that NVMe virt boundary is a pain and I hope 
that we could ignore/avoid it for atomic writes.

Thanks,
John
diff mbox series

Patch

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 21783aa2ee8e..aa0daacf4d7c 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1926,6 +1926,35 @@  static void nvme_update_disk_info(struct gendisk *disk,
 	blk_queue_io_min(disk->queue, phys_bs);
 	blk_queue_io_opt(disk->queue, io_opt);
 
+	atomic_bs = rounddown_pow_of_two(atomic_bs);
+	if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) {
+		if (id->nabo) {
+			dev_err(ns->ctrl->device, "Support atomic NABO=%x\n",
+				id->nabo);
+		} else {
+			u32 boundary = 0;
+
+			if (le16_to_cpu(id->nabspf))
+				boundary = (le16_to_cpu(id->nabspf) + 1) * bs;
+
+			if (is_power_of_2(boundary) || !boundary) {
+				blk_queue_atomic_write_max_bytes(disk->queue, atomic_bs);
+				blk_queue_atomic_write_unit_min_sectors(disk->queue, 1);
+				blk_queue_atomic_write_unit_max_sectors(disk->queue,
+									atomic_bs / bs);
+				blk_queue_atomic_write_boundary_bytes(disk->queue, boundary);
+			} else {
+				dev_err(ns->ctrl->device, "Unsupported atomic boundary=0x%x\n",
+					boundary);
+			}
+		}
+	} else if (ns->ctrl->subsys->awupf) {
+		blk_queue_atomic_write_max_bytes(disk->queue, atomic_bs);
+		blk_queue_atomic_write_unit_min_sectors(disk->queue, 1);
+		blk_queue_atomic_write_unit_max_sectors(disk->queue, atomic_bs / bs);
+		blk_queue_atomic_write_boundary_bytes(disk->queue, 0);
+	}
+
 	/*
 	 * Register a metadata profile for PI, or the plain non-integrity NVMe
 	 * metadata masquerading as Type 0 if supported, otherwise reject block