mbox series

[PATCHv10,0/9] write hints with nvme fdp, scsi streams

Message ID 20241029151922.459139-1-kbusch@meta.com (mailing list archive)
Headers show
Series write hints with nvme fdp, scsi streams | expand

Message

Keith Busch Oct. 29, 2024, 3:19 p.m. UTC
From: Keith Busch <kbusch@kernel.org>

Changes from v9:

  Document the partition hint mask

  Use bitmap_alloc API

  Fixup bitmap memory leak

  Return invalid value if user requests an invalid write hint

  Added and exported a block device feature flag for indicating generic
  placement hint support

  Added statx write hint max field

  Added BUILD_BUG_ON check for new io_uring SQE fields.

  Added reviews

Kanchan Joshi (2):
  io_uring: enable per-io hinting capability
  nvme: enable FDP support

Keith Busch (7):
  block: use generic u16 for write hints
  block: introduce max_write_hints queue limit
  statx: add write hint information
  block: allow ability to limit partition write hints
  block, fs: add write hint to kiocb
  block: export placement hint feature
  scsi: set permanent stream count in block limits

 Documentation/ABI/stable/sysfs-block | 13 +++++
 block/bdev.c                         | 18 ++++++
 block/blk-settings.c                 |  5 ++
 block/blk-sysfs.c                    |  6 ++
 block/fops.c                         | 31 +++++++++-
 block/partitions/core.c              | 44 ++++++++++++++-
 drivers/nvme/host/core.c             | 84 ++++++++++++++++++++++++++++
 drivers/nvme/host/nvme.h             |  5 ++
 drivers/scsi/sd.c                    |  2 +
 fs/stat.c                            |  1 +
 include/linux/blk-mq.h               |  3 +-
 include/linux/blk_types.h            |  4 +-
 include/linux/blkdev.h               | 15 +++++
 include/linux/fs.h                   |  1 +
 include/linux/nvme.h                 | 19 +++++++
 include/linux/stat.h                 |  1 +
 include/uapi/linux/io_uring.h        |  4 ++
 include/uapi/linux/stat.h            |  3 +-
 io_uring/io_uring.c                  |  2 +
 io_uring/rw.c                        |  3 +-
 20 files changed, 253 insertions(+), 11 deletions(-)

Comments

Christoph Hellwig Oct. 29, 2024, 3:24 p.m. UTC | #1
On Tue, Oct 29, 2024 at 08:19:13AM -0700, Keith Busch wrote:
>   Return invalid value if user requests an invalid write hint
> 
>   Added and exported a block device feature flag for indicating generic
>   placement hint support

But it still talks of write hints everywhere and conflates the write
streams with the temperature hints, which are completely different
beasts.
Chaitanya Kulkarni Oct. 30, 2024, 12:24 a.m. UTC | #2
On 10/29/24 08:19, Keith Busch wrote:
> From: Kanchan Joshi <joshi.k@samsung.com>
>
> Flexible Data Placement (FDP), as ratified in TP 4146a, allows the host
> to control the placement of logical blocks so as to reduce the SSD WAF.
> Userspace can send the write hint information using io_uring or fcntl.
>
> Fetch the placement-identifiers if the device supports FDP. The incoming
> write-hint is mapped to a placement-identifier, which in turn is set in
> the DSPEC field of the write command.
>
> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
> Signed-off-by: Hui Qi <hui81.qi@samsung.com>
> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> Reviewed-by: Hannes Reinecke <hare@suse.de>
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>   drivers/nvme/host/core.c | 84 ++++++++++++++++++++++++++++++++++++++++
>   drivers/nvme/host/nvme.h |  5 +++
>   include/linux/nvme.h     | 19 +++++++++
>   3 files changed, 108 insertions(+)
>
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 3de7555a7de74..bd7b89912ddb9 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -44,6 +44,20 @@ struct nvme_ns_info {
>   	bool is_removed;
>   };
>   
> +struct nvme_fdp_ruh_status_desc {
> +	u16 pid;
> +	u16 ruhid;
> +	u32 earutr;
> +	u64 ruamw;
> +	u8  rsvd16[16];
> +};
> +
> +struct nvme_fdp_ruh_status {
> +	u8  rsvd0[14];
> +	__le16 nruhsd;
> +	struct nvme_fdp_ruh_status_desc ruhsd[];
> +};
> +
>   unsigned int admin_timeout = 60;
>   module_param(admin_timeout, uint, 0644);
>   MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
> @@ -657,6 +671,7 @@ static void nvme_free_ns_head(struct kref *ref)
>   	ida_free(&head->subsys->ns_ida, head->instance);
>   	cleanup_srcu_struct(&head->srcu);
>   	nvme_put_subsystem(head->subsys);
> +	kfree(head->plids);
>   	kfree(head);
>   }
>   
> @@ -974,6 +989,13 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
>   	if (req->cmd_flags & REQ_RAHEAD)
>   		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
>   
> +	if (req->write_hint && ns->head->nr_plids) {
> +		u16 hint = max(req->write_hint, ns->head->nr_plids);
> +
> +		dsmgmt |= ns->head->plids[hint - 1] << 16;
> +		control |= NVME_RW_DTYPE_DPLCMT;
> +	}
> +
>   	if (req->cmd_flags & REQ_ATOMIC && !nvme_valid_atomic_write(req))
>   		return BLK_STS_INVAL;
>   
> @@ -2105,6 +2127,52 @@ static int nvme_update_ns_info_generic(struct nvme_ns *ns,
>   	return ret;
>   }
>   
> +static int nvme_fetch_fdp_plids(struct nvme_ns *ns, u32 nsid)
> +{
> +	struct nvme_fdp_ruh_status_desc *ruhsd;
> +	struct nvme_ns_head *head = ns->head;
> +	struct nvme_fdp_ruh_status *ruhs;
> +	struct nvme_command c = {};
> +	int size, ret, i;
> +
> +	if (head->plids)
> +		return 0;
> +
> +	size = struct_size(ruhs, ruhsd, NVME_MAX_PLIDS);
> +	ruhs = kzalloc(size, GFP_KERNEL);
> +	if (!ruhs)
> +		return -ENOMEM;
> +
> +	c.imr.opcode = nvme_cmd_io_mgmt_recv;
> +	c.imr.nsid = cpu_to_le32(nsid);
> +	c.imr.mo = 0x1;

Can we please add a comment where values are hardcoded?

> +	c.imr.numd =  cpu_to_le32((size >> 2) - 1);
> +
> +	ret = nvme_submit_sync_cmd(ns->queue, &c, ruhs, size);
> +	if (ret)
> +		goto out;
> +
> +	i = le16_to_cpu(ruhs->nruhsd);

Instead of i, why can't we use a local variable nr_plids?



> +	if (!i)
> +		goto out;
> +
> +	ns->head->nr_plids = min_t(u16, i, NVME_MAX_PLIDS);
> +	head->plids = kcalloc(ns->head->nr_plids, sizeof(head->plids),
> +			      GFP_KERNEL);
> +	if (!head->plids) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	for (i = 0; i < ns->head->nr_plids; i++) {
> +		ruhsd = &ruhs->ruhsd[i];
> +		head->plids[i] = le16_to_cpu(ruhsd->pid);
> +	}
> +out:
> +	kfree(ruhs);
> +	return ret;
> +}
> +
>   static int nvme_update_ns_info_block(struct nvme_ns *ns,
>   		struct nvme_ns_info *info)
>   {
> @@ -2141,6 +2209,19 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
>   			goto out;
>   	}
>   
> +	if (ns->ctrl->ctratt & NVME_CTRL_ATTR_FDPS) {
> +		ret = nvme_fetch_fdp_plids(ns, info->nsid);
> +		if (ret)
> +			dev_warn(ns->ctrl->device,
> +				"FDP failure status:0x%x\n", ret);
> +		if (ret < 0)
> +			goto out;
> +	} else {
> +		ns->head->nr_plids = 0;
> +		kfree(ns->head->plids);
> +		ns->head->plids = NULL;
> +	}
> +
>   	blk_mq_freeze_queue(ns->disk->queue);
>   	ns->head->lba_shift = id->lbaf[lbaf].ds;
>   	ns->head->nuse = le64_to_cpu(id->nuse);
> @@ -2171,6 +2252,9 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
>   	if (!nvme_init_integrity(ns->head, &lim, info))
>   		capacity = 0;
>   
> +	lim.max_write_hints = ns->head->nr_plids;
> +	if (lim.max_write_hints)
> +		lim.features |= BLK_FEAT_PLACEMENT_HINTS;
>   	ret = queue_limits_commit_update(ns->disk->queue, &lim);
>   	if (ret) {
>   		blk_mq_unfreeze_queue(ns->disk->queue);
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 093cb423f536b..cec8e5d96377b 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -454,6 +454,8 @@ struct nvme_ns_ids {
>   	u8	csi;
>   };
>   
> +#define NVME_MAX_PLIDS   (NVME_CTRL_PAGE_SIZE / sizeof(16))

Does this calculate how many plids can fit into the ctrl page size?

Sorry, but I didn't understand sizeof(16) here, since plids are u16.

nvme_ns_head -> u16 *plids; should this be sizeof(u16)? -ck