[14/15] nvme: enable FDP support

Message ID 20241119121632.1225556-15-hch@lst.de (mailing list archive)
State New
Series [01/15] fs: add write stream information to statx

Commit Message

Christoph Hellwig Nov. 19, 2024, 12:16 p.m. UTC
Wire up the block level write streams to the NVMe Flexible Data Placement
(FDP) feature as ratified in TP 4146a.

Based on code from Kanchan Joshi <joshi.k@samsung.com>,
Hui Qi <hui81.qi@samsung.com>, Nitesh Shetty <nj.shetty@samsung.com> and
Keith Busch <kbusch@kernel.org>, but a lot of it has been rewritten to
fit the block layer write stream infrastructure.
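
The wiring has two halves: discovery, which reads the FDP configuration
and reclaim unit handles and turns them into queue limits
(max_write_streams, write_stream_granularity), and the I/O path, which
maps a bio's write stream to a placement identifier.  A minimal sketch
of the I/O path half as implemented in nvme_setup_rw() below:
bi_write_stream is 1-based, with 0 meaning no stream; the placement ID
goes into bits 31:16 of command dword 13 and the directive type is set
in the control word:

	if (op == nvme_cmd_write && ns->head->nr_plids) {
		u16 write_stream = req->bio->bi_write_stream;

		if (WARN_ON_ONCE(write_stream > ns->head->nr_plids))
			return BLK_STS_INVAL;
		if (write_stream) {
			/* plids[] holds the placement IDs fetched at scan time */
			dsmgmt |= ns->head->plids[write_stream - 1] << 16;
			control |= NVME_RW_DTYPE_DPLCMT;
		}
	}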

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 132 +++++++++++++++++++++++++++++++++++++++
 drivers/nvme/host/nvme.h |   4 ++
 2 files changed, 136 insertions(+)

Comments

Keith Busch Nov. 19, 2024, 6:17 p.m. UTC | #1
On Tue, Nov 19, 2024 at 01:16:28PM +0100, Christoph Hellwig wrote:
> +static int nvme_read_fdp_config(struct nvme_ns *ns, struct nvme_ns_info *info)
> +{
> +	struct nvme_fdp_config result;
> +	struct nvme_fdp_config_log *log;
> +	struct nvme_fdp_config_desc *configs;
> +	size_t log_size;
> +	int error;
> +
> +	error = nvme_get_features(ns->ctrl, NVME_FEAT_FDP, info->endgid, NULL,
> +			0, &result);
> +	if (error)
> +		return error;
> +
> +	if (!(result.flags & FDPCFG_FDPE)) {
> +		dev_warn(ns->ctrl->device, "FDP not enabled in current config\n");
> +		return -EINVAL;
> +	}
> +
> +	log_size = sizeof(*log) + (result.fdpcidx + 1) * sizeof(*configs);
> +	log = kmalloc(log_size, GFP_KERNEL);
> +	if (!log)
> +		return -ENOMEM;
> +
> +	error = nvme_get_log_lsi(ns->ctrl, info->nsid, NVME_LOG_FDP_CONFIGS,
> +			0, 0, log, log_size, 0, info->endgid);
> +	if (error) {
> +		dev_warn(ns->ctrl->device,
> +			"failed to read FDP config log: 0x%x\n", error);
> +		goto out_free_log;
> +	}
> +
> +	if (le32_to_cpu(log->size) < log_size) {
> +		dev_warn(ns->ctrl->device, "FDP log too small: %d vs %zd\n",
> +				le32_to_cpu(log->size), log_size);
> +		error = -EINVAL;
> +		goto out_free_log;
> +	}
> +
> +	configs = (struct nvme_fdp_config_desc *)(log + 1);
> +	if (le32_to_cpu(configs[result.fdpcidx].nrg) > 1) {
> +		dev_warn(ns->ctrl->device, "FDP NRG > 1 not supported\n");

Why not support multiple reclaim groups?

> +		error = -EINVAL;
> +		goto out_free_log;
> +	}

> +	ns->head->runs = le64_to_cpu(configs[result.fdpcidx].runs);

The config descriptors are variable length, so you can't just index into
the array. You have to walk each descriptor's size field to find the
next one's offset. Something like:

	struct nvme_fdp_config_desc *configs;
	void *l;
	int i;

	...

	/* skip the first fdpcidx descriptors to land on the selected one */
	l = log + 1;
	for (i = 0; i < result.fdpcidx; i++) {
		configs = l;
		l += le16_to_cpu(configs->size);
	}
	configs = l;
	ns->head->runs = le64_to_cpu(configs->runs);
Christoph Hellwig Nov. 19, 2024, 6:24 p.m. UTC | #2
On Tue, Nov 19, 2024 at 11:17:36AM -0700, Keith Busch wrote:
> > +	if (le32_to_cpu(configs[result.fdpcidx].nrg) > 1) {
> > +		dev_warn(ns->ctrl->device, "FDP NRG > 1 not supported\n");
> 
> Why not support multiple reclaim groups?

Can you come up with a sane API for that?  And can you find devices in
the wild that actually support it?

> > +	ns->head->runs = le64_to_cpu(configs[result.fdpcidx].runs);
> 
> The config descriptors are variable length, so you can't just index into
> the array. You have to walk each descriptor's size field to find the
> next one's offset. Something like:

Indeed.  The current code only works when the first config is selected.
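
For reference, the lookup with that walk folded in could look like this
(just a sketch, not the reworked patch; the kmalloc() size above would
also need to cover the real, variable descriptor sizes):

	struct nvme_fdp_config_desc *configs;
	void *l = log + 1;
	int i;

	/* descriptors are variable length: walk to the selected config */
	for (i = 0; i < result.fdpcidx; i++) {
		configs = l;
		l += le16_to_cpu(configs->size);
	}
	configs = l;

	if (le32_to_cpu(configs->nrg) > 1) {
		dev_warn(ns->ctrl->device, "FDP NRG > 1 not supported\n");
		error = -EINVAL;
		goto out_free_log;
	}
	ns->head->runs = le64_to_cpu(configs->runs);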
Keith Busch Nov. 19, 2024, 10:49 p.m. UTC | #3
On Tue, Nov 19, 2024 at 07:24:27PM +0100, Christoph Hellwig wrote:
> On Tue, Nov 19, 2024 at 11:17:36AM -0700, Keith Busch wrote:
> > > +	if (le32_to_cpu(configs[result.fdpcidx].nrg) > 1) {
> > > +		dev_warn(ns->ctrl->device, "FDP NRG > 1 not supported\n");
> > 
> > Why not support multiple reclaim groups?
> 
> Can you come up with a sane API for that? 

Haven't really thought about it. If it's there, it's probably useful for
RUs that are not "Persistently Isolated". But let's not worry about it
now; we can just say you don't get to use write streams for these.

> And can you find devices in
> the wild that actually support it?

I haven't come across any, no.

A bigger concern is the return codes for pretty much all the errors
here. They'll prevent the namespace from being visible, but I think you
just want to set the limits to disable write streams instead. Otherwise
it'd be a regression, since namespaces configured this way are currently
usable.
Christoph Hellwig Nov. 20, 2024, 6:03 a.m. UTC | #4
On Tue, Nov 19, 2024 at 03:49:14PM -0700, Keith Busch wrote:
> A bigger concern is the return codes for pretty much all the errors
> here. They'll prevent the namespace from being visible, but I think you
> just want to set the limits to disable write streams instead. Otherwise
> it'd be a regression, since namespaces configured this way are currently
> usable.

True, we should probably just log an error and continue here.  I'll
update it for the next version.
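
Concretely, "log an error and continue" could look something like this
at the call site in nvme_update_ns_info_block() (a sketch; the warning
text is illustrative):

	if (ns->ctrl->ctratt & NVME_CTRL_ATTR_FDPS) {
		ret = nvme_read_fdp_config(ns, info);
		if (!ret)
			ret = nvme_fetch_fdp_plids(ns, info->nsid);
		if (ret) {
			/*
			 * Log and continue: the namespace stays visible and
			 * usable, just without write streams.
			 */
			dev_warn(ns->ctrl->device,
				"FDP setup failed, disabling write streams\n");
			ns->head->nr_plids = 0;
			ret = 0;
		}
	}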

Patch

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index b61225201b47..543bbe7de063 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -673,6 +673,7 @@  static void nvme_free_ns_head(struct kref *ref)
 	ida_free(&head->subsys->ns_ida, head->instance);
 	cleanup_srcu_struct(&head->srcu);
 	nvme_put_subsystem(head->subsys);
+	kfree(head->plids);
 	kfree(head);
 }
 
@@ -990,6 +991,17 @@  static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
 	if (req->cmd_flags & REQ_RAHEAD)
 		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
 
+	if (op == nvme_cmd_write && ns->head->nr_plids) {
+		u16 write_stream = req->bio->bi_write_stream;
+
+		if (WARN_ON_ONCE(write_stream > ns->head->nr_plids))
+			return BLK_STS_INVAL;
+		if (write_stream) {
+			dsmgmt |= ns->head->plids[write_stream - 1] << 16;
+			control |= NVME_RW_DTYPE_DPLCMT;
+		}
+	}
+
 	if (req->cmd_flags & REQ_ATOMIC && !nvme_valid_atomic_write(req))
 		return BLK_STS_INVAL;
 
@@ -2142,6 +2154,108 @@  static int nvme_update_ns_info_generic(struct nvme_ns *ns,
 	return ret;
 }
 
+static int nvme_read_fdp_config(struct nvme_ns *ns, struct nvme_ns_info *info)
+{
+	struct nvme_fdp_config result;
+	struct nvme_fdp_config_log *log;
+	struct nvme_fdp_config_desc *configs;
+	size_t log_size;
+	int error;
+
+	error = nvme_get_features(ns->ctrl, NVME_FEAT_FDP, info->endgid, NULL,
+			0, &result);
+	if (error)
+		return error;
+
+	if (!(result.flags & FDPCFG_FDPE)) {
+		dev_warn(ns->ctrl->device, "FDP not enabled in current config\n");
+		return -EINVAL;
+	}
+
+	log_size = sizeof(*log) + (result.fdpcidx + 1) * sizeof(*configs);
+	log = kmalloc(log_size, GFP_KERNEL);
+	if (!log)
+		return -ENOMEM;
+
+	error = nvme_get_log_lsi(ns->ctrl, info->nsid, NVME_LOG_FDP_CONFIGS,
+			0, 0, log, log_size, 0, info->endgid);
+	if (error) {
+		dev_warn(ns->ctrl->device,
+			"failed to read FDP config log: 0x%x\n", error);
+		goto out_free_log;
+	}
+
+	if (le32_to_cpu(log->size) < log_size) {
+		dev_warn(ns->ctrl->device, "FDP log too small: %d vs %zd\n",
+				le32_to_cpu(log->size), log_size);
+		error = -EINVAL;
+		goto out_free_log;
+	}
+
+	configs = (struct nvme_fdp_config_desc *)(log + 1);
+	if (le32_to_cpu(configs[result.fdpcidx].nrg) > 1) {
+		dev_warn(ns->ctrl->device, "FDP NRG > 1 not supported\n");
+		error = -EINVAL;
+		goto out_free_log;
+	}
+	ns->head->runs = le64_to_cpu(configs[result.fdpcidx].runs);
+
+out_free_log:
+	kfree(log);
+	return error;
+}
+
+static int nvme_fetch_fdp_plids(struct nvme_ns *ns, u32 nsid)
+{
+	struct nvme_ns_head *head = ns->head;
+	struct nvme_fdp_ruh_status *ruhs;
+	const unsigned int max_nr_plids = S8_MAX - 1;
+	size_t size = struct_size(ruhs, ruhsd, max_nr_plids);
+	struct nvme_command c = {
+		.imr.opcode	= nvme_cmd_io_mgmt_recv,
+		.imr.nsid	= cpu_to_le32(nsid),
+		.imr.mo		= NVME_IO_MGMT_RECV_MO_RUHS,
+		.imr.numd	= cpu_to_le32(nvme_bytes_to_numd(size)),
+	};
+	int ret, i;
+
+	ruhs = kzalloc(size, GFP_KERNEL);
+	if (!ruhs)
+		return -ENOMEM;
+
+	ret = nvme_submit_sync_cmd(ns->queue, &c, ruhs, size);
+	if (ret) {
+		dev_warn(ns->ctrl->device,
+			"failed to read FDP reclaim unit handles: 0x%x\n", ret);
+		goto out;
+	}
+
+	ns->head->nr_plids = le16_to_cpu(ruhs->nruhsd);
+	if (!ns->head->nr_plids)
+		goto out;
+
+	if (ns->head->nr_plids > max_nr_plids) {
+		dev_info(ns->ctrl->device,
+			"capping max write streams from %d to %d\n",
+			ns->head->nr_plids, max_nr_plids);
+		ns->head->nr_plids = max_nr_plids;
+	}
+
+	head->plids = kcalloc(ns->head->nr_plids, sizeof(*head->plids),
+			      GFP_KERNEL);
+	if (!head->plids) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < ns->head->nr_plids; i++)
+		head->plids[i] = le16_to_cpu(ruhs->ruhsd[i].pid);
+
+out:
+	kfree(ruhs);
+	return ret;
+}
+
 static int nvme_update_ns_info_block(struct nvme_ns *ns,
 		struct nvme_ns_info *info)
 {
@@ -2178,6 +2292,18 @@  static int nvme_update_ns_info_block(struct nvme_ns *ns,
 			goto out;
 	}
 
+	if (!(ns->ctrl->ctratt & NVME_CTRL_ATTR_FDPS)) {
+		ns->head->nr_plids = 0;
+		kfree(ns->head->plids);
+		ns->head->plids = NULL;
+	} else if (!ns->head->plids) {
+		ret = nvme_read_fdp_config(ns, info);
+		if (!ret)
+			ret = nvme_fetch_fdp_plids(ns, info->nsid);
+		if (ret < 0)
+			goto out;
+	}
+
 	blk_mq_freeze_queue(ns->disk->queue);
 	ns->head->lba_shift = id->lbaf[lbaf].ds;
 	ns->head->nuse = le64_to_cpu(id->nuse);
@@ -2211,6 +2337,10 @@  static int nvme_update_ns_info_block(struct nvme_ns *ns,
 	if (!nvme_init_integrity(ns->head, &lim, info))
 		capacity = 0;
 
+	lim.max_write_streams = ns->head->nr_plids;
+	if (lim.max_write_streams)
+		lim.write_stream_granularity = ns->head->runs;
+
 	ret = queue_limits_commit_update(ns->disk->queue, &lim);
 	if (ret) {
 		blk_mq_unfreeze_queue(ns->disk->queue);
@@ -2313,6 +2443,8 @@  static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
 			ns->head->disk->flags |= GENHD_FL_HIDDEN;
 		else
 			nvme_init_integrity(ns->head, &lim, info);
+		lim.max_write_streams = ns_lim->max_write_streams;
+		lim.write_stream_granularity = ns_lim->write_stream_granularity;
 		ret = queue_limits_commit_update(ns->head->disk->queue, &lim);
 
 		set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk));
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 8cea8416b0d2..f10aa0cb6df5 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -493,6 +493,10 @@  struct nvme_ns_head {
 	struct gendisk		*disk;
 
 	u16			endgid;
+	u16                     nr_plids;
+	u16			*plids;
+	u64			runs;
+
 #ifdef CONFIG_NVME_MULTIPATH
 	struct bio_list		requeue_list;
 	spinlock_t		requeue_lock;