Message ID | 20240124113841.31824-16-john.g.garry@oracle.com (mailing list archive) |
---|---|
State | New, archived |
Series | block atomic writes |
On Wed, Jan 24, 2024 at 11:38:41AM +0000, John Garry wrote:
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 5045c84f2516..6a34a5d92088 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -911,6 +911,32 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
>  	if (req->cmd_flags & REQ_RAHEAD)
>  		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
>
> +	/*
> +	 * Ensure that nothing has been sent which cannot be executed
> +	 * atomically.
> +	 */
> +	if (req->cmd_flags & REQ_ATOMIC) {
> +		struct nvme_ns_head *head = ns->head;
> +		u32 boundary_bytes = head->atomic_boundary;
> +
> +		if (blk_rq_bytes(req) > ns->head->atomic_max)
> +			return BLK_STS_IOERR;
> +
> +		if (boundary_bytes) {
> +			u32 mask = boundary_bytes - 1, imask = ~mask;
> +			u32 start = blk_rq_pos(req) << SECTOR_SHIFT;
> +			u32 end = start + blk_rq_bytes(req);
> +
> +			if (blk_rq_bytes(req) > boundary_bytes)
> +				return BLK_STS_IOERR;
> +
> +			if (((start & imask) != (end & imask)) &&
> +			    (end & mask)) {
> +				return BLK_STS_IOERR;
> +			}
> +		}
> +	}

Aren't these new fields, atomic_max and atomic_boundary, duplicates of
the equivalent queue limits? Let's just use the queue limits instead.

And couldn't we generically validate the constraints are not violated in
submit_bio_noacct() instead of doing that in the low level driver? The
driver assumes all other requests are already sanity checked, so I don't
think we should change the responsibility for that just for this flag.
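As an aside for readers: the boundary test in the hunk quoted above works on a power-of-two boundary window, and rejects a write whose byte range crosses a window edge, while ending exactly on an edge is allowed. Below is a small standalone sketch of that arithmetic with made-up example values (an illustration only, not the kernel code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool straddles_boundary(uint64_t start, uint32_t len, uint32_t boundary)
{
	uint64_t mask, imask, end;

	if (!boundary)
		return false;	/* no boundary advertised: nothing to check */

	mask = boundary - 1;	/* boundary is a power of two */
	imask = ~mask;
	end = start + len;

	/*
	 * The write starts and ends in different boundary windows, and
	 * does not end exactly on a window edge.
	 */
	return ((start & imask) != (end & imask)) && (end & mask);
}

int main(void)
{
	/* 8 KiB write at byte offset 28 KiB, 32 KiB boundary: crosses 32 KiB */
	printf("%d\n", straddles_boundary(28 * 1024, 8 * 1024, 32 * 1024));

	/* 8 KiB write at byte offset 24 KiB: ends exactly on the boundary */
	printf("%d\n", straddles_boundary(24 * 1024, 8 * 1024, 32 * 1024));

	return 0;
}

Compiled and run, this prints 1 for the first case (28 KiB + 8 KiB crosses the 32 KiB line) and 0 for the second (the write ends exactly on the boundary, which the quoted check permits).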
On 25/01/2024 00:52, Keith Busch wrote:
> On Wed, Jan 24, 2024 at 11:38:41AM +0000, John Garry wrote:
>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>> index 5045c84f2516..6a34a5d92088 100644
>> --- a/drivers/nvme/host/core.c
>> +++ b/drivers/nvme/host/core.c
>> @@ -911,6 +911,32 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
>>  	if (req->cmd_flags & REQ_RAHEAD)
>>  		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
>>
>> +	/*
>> +	 * Ensure that nothing has been sent which cannot be executed
>> +	 * atomically.
>> +	 */
>> +	if (req->cmd_flags & REQ_ATOMIC) {
>> +		struct nvme_ns_head *head = ns->head;
>> +		u32 boundary_bytes = head->atomic_boundary;
>> +
>> +		if (blk_rq_bytes(req) > ns->head->atomic_max)
>> +			return BLK_STS_IOERR;
>> +
>> +		if (boundary_bytes) {
>> +			u32 mask = boundary_bytes - 1, imask = ~mask;
>> +			u32 start = blk_rq_pos(req) << SECTOR_SHIFT;
>> +			u32 end = start + blk_rq_bytes(req);
>> +
>> +			if (blk_rq_bytes(req) > boundary_bytes)
>> +				return BLK_STS_IOERR;
>> +
>> +			if (((start & imask) != (end & imask)) &&
>> +			    (end & mask)) {
>> +				return BLK_STS_IOERR;
>> +			}
>> +		}
>> +	}
>
> Aren't these new fields, atomic_max and atomic_boundary, duplicates of
> the equivalent queue limits? Let's just use the queue limits instead.

I was making a copy of these limits in the driver out of an abundance of
caution. I suppose the atomic_max and atomic_boundary values won't be
modified in the block layer, so I could use the queue limits instead.

>
> And couldn't we generically validate the constraints are not violated in
> submit_bio_noacct() instead of doing that in the low level driver? The
> driver assumes all other requests are already sanity checked, so I don't
> think we should change the responsibility for that just for this flag.

As a safety mechanism, we want to ensure the complete stack is not giving
us out-of-limits atomic writes.

We have limits checks in XFS iomap and fops.c, but we would also want to
ensure that the block layer is not doing anything it shouldn't be doing
after submit_bio_noacct(), like merging atomic write BIOs which straddle a
boundary or exceed atomic_max (if there were any merging).

The SCSI standard already has provision for error'ing an atomic write
command which exceeds the target atomic write capabilities, while NVMe
doesn't.

BTW, Christoph did mention that he would like to see this:
https://lore.kernel.org/linux-nvme/20231109153603.GA2188@lst.de/

Thanks,
John
On 1/24/2024 4:52 PM, Keith Busch wrote:
> On Wed, Jan 24, 2024 at 11:38:41AM +0000, John Garry wrote:
>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>> index 5045c84f2516..6a34a5d92088 100644
>> --- a/drivers/nvme/host/core.c
>> +++ b/drivers/nvme/host/core.c
>> @@ -911,6 +911,32 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
>>  	if (req->cmd_flags & REQ_RAHEAD)
>>  		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
>>
>> +	/*
>> +	 * Ensure that nothing has been sent which cannot be executed
>> +	 * atomically.
>> +	 */
>> +	if (req->cmd_flags & REQ_ATOMIC) {
>> +		struct nvme_ns_head *head = ns->head;
>> +		u32 boundary_bytes = head->atomic_boundary;
>> +
>> +		if (blk_rq_bytes(req) > ns->head->atomic_max)
>> +			return BLK_STS_IOERR;
>> +
>> +		if (boundary_bytes) {
>> +			u32 mask = boundary_bytes - 1, imask = ~mask;
>> +			u32 start = blk_rq_pos(req) << SECTOR_SHIFT;
>> +			u32 end = start + blk_rq_bytes(req);
>> +
>> +			if (blk_rq_bytes(req) > boundary_bytes)
>> +				return BLK_STS_IOERR;
>> +
>> +			if (((start & imask) != (end & imask)) &&
>> +			    (end & mask)) {
>> +				return BLK_STS_IOERR;
>> +			}
>> +		}
>> +	}
>
> Aren't these new fields, atomic_max and atomic_boundary, duplicates of
> the equivalent queue limits? Let's just use the queue limits instead.
>
> And couldn't we generically validate the constraints are not violated in
> submit_bio_noacct() instead of doing that in the low level driver? The
> driver assumes all other requests are already sanity checked, so I don't
> think we should change the responsibility for that just for this flag.
>

Does it make sense to move the above code to a helper? Perhaps inline?

-ck
On Thu, Jan 25, 2024 at 11:28:22AM +0000, John Garry wrote:
> We have limits checks in XFS iomap and fops.c, but we would also want to
> ensure that the block layer is not doing anything it shouldn't be doing
> after submit_bio_noacct(), like merging atomic write BIOs which straddle a
> boundary or exceed atomic_max (if there were any merging).
>
> The SCSI standard already has provision for error'ing an atomic write
> command which exceeds the target atomic write capabilities, while NVMe
> doesn't.

Can you get Oracle to propose this for NVMe? It always helps if these
suggestions come from a large buyer of NVMe equipment.

> BTW, Christoph did mention that he would like to see this:
> https://lore.kernel.org/linux-nvme/20231109153603.GA2188@lst.de/

I can probably live with a sufficiently low-level block layer check.
On 29/01/2024 06:20, Christoph Hellwig wrote:
> On Thu, Jan 25, 2024 at 11:28:22AM +0000, John Garry wrote:
>> We have limits checks in XFS iomap and fops.c, but we would also want to
>> ensure that the block layer is not doing anything it shouldn't be doing
>> after submit_bio_noacct(), like merging atomic write BIOs which straddle a
>> boundary or exceed atomic_max (if there were any merging).
>>
>> The SCSI standard already has provision for error'ing an atomic write
>> command which exceeds the target atomic write capabilities, while NVMe
>> doesn't.
>
> Can you get Oracle to propose this for NVMe? It always helps if these
> suggestions come from a large buyer of NVMe equipment.

I'll let Martin comment on that.

>
>> BTW, Christoph did mention that he would like to see this:
>> https://lore.kernel.org/linux-nvme/20231109153603.GA2188@lst.de/
>
> I can probably live with a sufficiently low-level block layer check.

That would probably be in blk_mq_dispatch_rq_list() for block drivers with
.queue_rq set, but I would need to check for a good place for ->queue_rqs.
I can't imagine that we just want to inefficiently iterate over all rqs at
the ->queue_rqs call point.

However, considering the nature of this change, it is not a good sign that
we/I need to check... I'd be more inclined to leave as is.

Thanks,
John
On Mon, Jan 29, 2024 at 09:36:38AM +0000, John Garry wrote:
> That would probably be in blk_mq_dispatch_rq_list() for block drivers with
> .queue_rq set, but I would need to check for a good place for ->queue_rqs.
> I can't imagine that we just want to inefficiently iterate over all rqs at
> the ->queue_rqs call point.
>
> However, considering the nature of this change, it is not a good sign that
> we/I need to check... I'd be more inclined to leave as is.

Heh. At least early on, having the checks in one place in nvme makes me
feel easier for sure. If we can easily use the block limits for the checks,
we shouldn't have to keep duplicate values in nvme, though.
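If the duplicated fields do go away in favour of the block limits, the driver-side check could read back the values it already registered with the queue. Below is a rough sketch of just the size checks, assuming the limits are exposed as atomic_write_max_bytes and atomic_write_boundary_bytes in struct queue_limits — both field names are inferred from the setters in the quoted patch and should be treated as placeholders, not confirmed API:

/*
 * Illustrative only: validate an atomic write against the queue limits
 * rather than against copies cached in nvme_ns_head.  The limits field
 * names used here are placeholders.
 */
static bool nvme_atomic_write_within_limits(struct request *req)
{
	const struct queue_limits *lim = &req->q->limits;
	u32 boundary_bytes = lim->atomic_write_boundary_bytes;

	if (blk_rq_bytes(req) > lim->atomic_write_max_bytes)
		return false;

	/* a request larger than the boundary window can never fit in one */
	if (boundary_bytes && blk_rq_bytes(req) > boundary_bytes)
		return false;

	return true;
}

The boundary-straddle test would stay exactly as in the posted hunk; only the source of the two limit values changes.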
If we don't end up doing the checks in the block layer:

> +	/*
> +	 * Ensure that nothing has been sent which cannot be executed
> +	 * atomically.
> +	 */
> +	if (req->cmd_flags & REQ_ATOMIC) {
> +		struct nvme_ns_head *head = ns->head;
> +		u32 boundary_bytes = head->atomic_boundary;

... please split the checks into a helper. And merge them into the
previous patch.
On 13/02/2024 06:42, Christoph Hellwig wrote:
> If we don't end up doing the checks in the block layer:
>
>> +	/*
>> +	 * Ensure that nothing has been sent which cannot be executed
>> +	 * atomically.
>> +	 */
>> +	if (req->cmd_flags & REQ_ATOMIC) {
>> +		struct nvme_ns_head *head = ns->head;
>> +		u32 boundary_bytes = head->atomic_boundary;
>
> ... please split the checks into a helper.

ok

> And merge them into the
> previous patch.

Fine, if you prefer that.
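For illustration, splitting the posted checks out as requested could look roughly like the sketch below. It keeps the head->atomic_max and head->atomic_boundary fields from the quoted patch; the helper name is made up here, and the offset arithmetic is widened to u64 so that byte offsets of large LBAs are not truncated (the posted hunk uses u32):

/*
 * Sketch only, not the final series: reject atomic writes that exceed
 * the advertised limits or straddle an atomic write boundary.
 */
static bool nvme_valid_atomic_write(struct nvme_ns *ns, struct request *req)
{
	struct nvme_ns_head *head = ns->head;
	u32 boundary_bytes = head->atomic_boundary;

	if (blk_rq_bytes(req) > head->atomic_max)
		return false;

	if (boundary_bytes) {
		u64 mask = boundary_bytes - 1, imask = ~mask;
		u64 start = blk_rq_pos(req) << SECTOR_SHIFT;
		u64 end = start + blk_rq_bytes(req);

		/* a write longer than the boundary window must straddle it */
		if (blk_rq_bytes(req) > boundary_bytes)
			return false;

		/* reject writes whose byte range crosses a boundary */
		if ((start & imask) != (end & imask) && (end & mask))
			return false;
	}

	return true;
}

nvme_setup_rw() would then reduce to something like "if ((req->cmd_flags & REQ_ATOMIC) && !nvme_valid_atomic_write(ns, req)) return BLK_STS_IOERR;", and, per the comment above, both pieces would move into the previous patch that sets up the limits.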
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 5045c84f2516..6a34a5d92088 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -911,6 +911,32 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
 	if (req->cmd_flags & REQ_RAHEAD)
 		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
 
+	/*
+	 * Ensure that nothing has been sent which cannot be executed
+	 * atomically.
+	 */
+	if (req->cmd_flags & REQ_ATOMIC) {
+		struct nvme_ns_head *head = ns->head;
+		u32 boundary_bytes = head->atomic_boundary;
+
+		if (blk_rq_bytes(req) > ns->head->atomic_max)
+			return BLK_STS_IOERR;
+
+		if (boundary_bytes) {
+			u32 mask = boundary_bytes - 1, imask = ~mask;
+			u32 start = blk_rq_pos(req) << SECTOR_SHIFT;
+			u32 end = start + blk_rq_bytes(req);
+
+			if (blk_rq_bytes(req) > boundary_bytes)
+				return BLK_STS_IOERR;
+
+			if (((start & imask) != (end & imask)) &&
+			    (end & mask)) {
+				return BLK_STS_IOERR;
+			}
+		}
+	}
+
 	cmnd->rw.opcode = op;
 	cmnd->rw.flags = 0;
 	cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
@@ -1912,7 +1938,8 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
 }
 
 static void nvme_update_atomic_write_disk_info(struct gendisk *disk,
-		struct nvme_ctrl *ctrl, struct nvme_id_ns *id, u32 bs, u32 atomic_bs)
+		struct nvme_ctrl *ctrl, struct nvme_ns_head *head,
+		struct nvme_id_ns *id, u32 bs, u32 atomic_bs)
 {
 	unsigned int unit_min = 0, unit_max = 0, boundary = 0, max_bytes = 0;
 	struct request_queue *q = disk->queue;
@@ -1943,6 +1970,9 @@ static void nvme_update_atomic_write_disk_info(struct gendisk *disk,
 		unit_max = rounddown_pow_of_two(atomic_bs);
 	}
 
+	head->atomic_max = max_bytes;
+	head->atomic_boundary = boundary;
+
 	blk_queue_atomic_write_max_bytes(q, max_bytes);
 	blk_queue_atomic_write_unit_min_sectors(q, unit_min >> SECTOR_SHIFT);
 	blk_queue_atomic_write_unit_max_sectors(q, unit_max >> SECTOR_SHIFT);
@@ -1980,7 +2010,8 @@ static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk,
 		else
 			atomic_bs = (1 + ctrl->subsys->awupf) * bs;
 
-		nvme_update_atomic_write_disk_info(disk, ctrl, id, bs, atomic_bs);
+		nvme_update_atomic_write_disk_info(disk, ctrl, head, id,
+				bs, atomic_bs);
 	}
 
 	if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 030c80818240..938060c85e6f 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -476,6 +476,8 @@ struct nvme_ns_head {
 	struct device		cdev_device;
 
 	struct gendisk		*disk;
+	u32			atomic_max;
+	u32			atomic_boundary;
 #ifdef CONFIG_NVME_MULTIPATH
 	struct bio_list		requeue_list;
 	spinlock_t		requeue_lock;