diff mbox series

[PATCHv10,4/9] block: allow ability to limit partition write hints

Message ID 20241029151922.459139-5-kbusch@meta.com (mailing list archive)
State New
Headers show
Series write hints with nvme fdp, scsi streams | expand

Commit Message

Keith Busch Oct. 29, 2024, 3:19 p.m. UTC
From: Keith Busch <kbusch@kernel.org>

When multiple partitions are used, you may want to enforce different
subsets of the available write hints for each partition. Provide a
bitmap attribute of the available write hints, and allow an admin to
write a different mask to set the partition's allowed write hints.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 Documentation/ABI/stable/sysfs-block |  6 ++++
 block/bdev.c                         | 13 ++++++++
 block/partitions/core.c              | 44 ++++++++++++++++++++++++++--
 include/linux/blk_types.h            |  1 +
 4 files changed, 62 insertions(+), 2 deletions(-)

Comments

Christoph Hellwig Oct. 29, 2024, 3:23 p.m. UTC | #1
On Tue, Oct 29, 2024 at 08:19:17AM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> When multiple partitions are used, you may want to enforce different
> subsets of the available write hints for each partition. Provide a
> bitmap attribute of the available write hints, and allow an admin to
> write a different mask to set the partition's allowed write hints.

This still offers all hints to all partitions by default, which still
breaks all use cases where you have actual users of the stream separation
on multiple paritions.

Please assign either all resources to the first partition and none to
the others (probably the easiest and useful for the most common use
case) or split it evenly.

Note that the bdev_max_streams value also needs to be adjusted to only
return the number of streams actually available to a partition.
Bart Van Assche Oct. 29, 2024, 5:25 p.m. UTC | #2
On 10/29/24 8:19 AM, Keith Busch wrote:
> +static ssize_t part_write_hint_mask_store(struct device *dev,
> +					  struct device_attribute *attr,
> +					  const char *buf, size_t count)
> +{
> +	struct block_device *bdev = dev_to_bdev(dev);
> +	unsigned short max_write_hints = bdev_max_write_hints(bdev);
> +	unsigned long *new_mask;
> +
> +	if (!max_write_hints)
> +		return count;
> +
> +	new_mask = bitmap_alloc(max_write_hints, GFP_KERNEL);
> +	if (!new_mask)
> +		return -ENOMEM;
> +
> +	bitmap_parse(buf, count, new_mask, max_write_hints);
> +	bitmap_copy(bdev->write_hint_mask, new_mask, max_write_hints);
> +	bitmap_free(new_mask);
> +
> +	return count;
> +}

bitmap_copy() is not atomic. Shouldn't the bitmap_copy() call be
serialized against the code that tests bits in bdev->write_hint_mask?

Thanks,

Bart.
Christoph Hellwig Oct. 30, 2024, 4:46 a.m. UTC | #3
On Tue, Oct 29, 2024 at 10:25:11AM -0700, Bart Van Assche wrote:
>> +}
>
> bitmap_copy() is not atomic. Shouldn't the bitmap_copy() call be
> serialized against the code that tests bits in bdev->write_hint_mask?

It needs something.  I actually pointed that out last round, but forgot
about it again this time :)
diff mbox series

Patch

diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index f2db2cabb8e75..8ab9f030e61d1 100644
--- a/Documentation/ABI/stable/sysfs-block
+++ b/Documentation/ABI/stable/sysfs-block
@@ -187,6 +187,12 @@  Description:
 		partition is offset from the internal allocation unit's
 		natural alignment.
 
+What:		/sys/block/<disk>/<partition>/write_hint_mask
+Date:		October 2024
+Contact:	linux-block@vger.kernel.org
+Description:
+		The mask of allowed write hints. You can limit which hints the
+		block layer will use by writing a new mask.
 
 What:		/sys/block/<disk>/<partition>/stat
 Date:		February 2008
diff --git a/block/bdev.c b/block/bdev.c
index 9a59f0c882170..0c5e87b111aed 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -415,6 +415,7 @@  void __init bdev_cache_init(void)
 struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
 {
 	struct block_device *bdev;
+	unsigned short write_hint;
 	struct inode *inode;
 
 	inode = new_inode(blockdev_superblock);
@@ -440,6 +441,18 @@  struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
 		return NULL;
 	}
 	bdev->bd_disk = disk;
+
+	write_hint = bdev_max_write_hints(bdev);
+	if (write_hint) {
+		bdev->write_hint_mask = bitmap_alloc(write_hint, GFP_KERNEL);
+		if (!bdev->write_hint_mask) {
+			free_percpu(bdev->bd_stats);
+			iput(inode);
+			return NULL;
+		}
+		bitmap_set(bdev->write_hint_mask, 0, write_hint);
+	}
+
 	return bdev;
 }
 
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 815ed33caa1b8..24e1a79972f75 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -203,6 +203,40 @@  static ssize_t part_discard_alignment_show(struct device *dev,
 	return sprintf(buf, "%u\n", bdev_discard_alignment(dev_to_bdev(dev)));
 }
 
+static ssize_t part_write_hint_mask_show(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf)
+{
+	struct block_device *bdev = dev_to_bdev(dev);
+	unsigned short max_write_hints = bdev_max_write_hints(bdev);
+
+	if (!max_write_hints)
+		return sprintf(buf, "0");
+	return sprintf(buf, "%*pb\n", max_write_hints, bdev->write_hint_mask);
+}
+
+static ssize_t part_write_hint_mask_store(struct device *dev,
+					  struct device_attribute *attr,
+					  const char *buf, size_t count)
+{
+	struct block_device *bdev = dev_to_bdev(dev);
+	unsigned short max_write_hints = bdev_max_write_hints(bdev);
+	unsigned long *new_mask;
+
+	if (!max_write_hints)
+		return count;
+
+	new_mask = bitmap_alloc(max_write_hints, GFP_KERNEL);
+	if (!new_mask)
+		return -ENOMEM;
+
+	bitmap_parse(buf, count, new_mask, max_write_hints);
+	bitmap_copy(bdev->write_hint_mask, new_mask, max_write_hints);
+	bitmap_free(new_mask);
+
+	return count;
+}
+
 static DEVICE_ATTR(partition, 0444, part_partition_show, NULL);
 static DEVICE_ATTR(start, 0444, part_start_show, NULL);
 static DEVICE_ATTR(size, 0444, part_size_show, NULL);
@@ -211,6 +245,8 @@  static DEVICE_ATTR(alignment_offset, 0444, part_alignment_offset_show, NULL);
 static DEVICE_ATTR(discard_alignment, 0444, part_discard_alignment_show, NULL);
 static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
 static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
+static DEVICE_ATTR(write_hint_mask, 0644, part_write_hint_mask_show,
+		   part_write_hint_mask_store);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 static struct device_attribute dev_attr_fail =
 	__ATTR(make-it-fail, 0644, part_fail_show, part_fail_store);
@@ -225,6 +261,7 @@  static struct attribute *part_attrs[] = {
 	&dev_attr_discard_alignment.attr,
 	&dev_attr_stat.attr,
 	&dev_attr_inflight.attr,
+	&dev_attr_write_hint_mask.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	&dev_attr_fail.attr,
 #endif
@@ -245,8 +282,11 @@  static const struct attribute_group *part_attr_groups[] = {
 
 static void part_release(struct device *dev)
 {
-	put_disk(dev_to_bdev(dev)->bd_disk);
-	bdev_drop(dev_to_bdev(dev));
+	struct block_device *part = dev_to_bdev(dev);
+
+	bitmap_free(part->write_hint_mask);
+	put_disk(part->bd_disk);
+	bdev_drop(part);
 }
 
 static int part_uevent(const struct device *dev, struct kobj_uevent_env *env)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 6737795220e18..af430e543f7f7 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -73,6 +73,7 @@  struct block_device {
 #ifdef CONFIG_SECURITY
 	void			*bd_security;
 #endif
+	unsigned long		*write_hint_mask;
 	/*
 	 * keep this out-of-line as it's both big and not needed in the fast
 	 * path