[03/15] block: add an API to atomically update queue limits

Message ID 20240122173645.1686078-4-hch@lst.de (mailing list archive)
State New, archived
Series [01/15] block: move max_{open,active}_zones to struct queue_limits

Commit Message

Christoph Hellwig Jan. 22, 2024, 5:36 p.m. UTC
Add a new queue_limits_{start,commit}_update pair of functions that
allows taking an atomic snapshot of the queue limits, updating it, and
committing it if it passes validity checking.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-core.c       |   1 +
 block/blk-settings.c   | 140 +++++++++++++++++++++++++++++++++++++++++
 block/blk.h            |   1 +
 include/linux/blkdev.h |  23 +++++++
 4 files changed, 165 insertions(+)
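
A minimal sketch of the intended calling convention (hypothetical driver
code; "q" is the driver's request_queue and the limit value is purely
illustrative):

	struct queue_limits lim;
	int error;

	lim = queue_limits_start_update(q);
	lim.max_hw_sectors = 1024;		/* illustrative value */
	error = queue_limits_commit_update(q, &lim);
	if (error)
		return error;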

Comments

Damien Le Moal Jan. 23, 2024, 4:50 a.m. UTC | #1
On 1/23/24 02:36, Christoph Hellwig wrote:
> Add a new queue_limits_{start,commit}_update pair of functions that
> allows taking an atomic snapshot of the queue limits, updating it, and
> committing it if it passes validity checking.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  block/blk-core.c       |   1 +
>  block/blk-settings.c   | 140 +++++++++++++++++++++++++++++++++++++++++
>  block/blk.h            |   1 +
>  include/linux/blkdev.h |  23 +++++++
>  4 files changed, 165 insertions(+)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 11342af420d0c4..09f4a44a4aa3cc 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -424,6 +424,7 @@ struct request_queue *blk_alloc_queue(int node_id)
>  	mutex_init(&q->debugfs_mutex);
>  	mutex_init(&q->sysfs_lock);
>  	mutex_init(&q->sysfs_dir_lock);
> +	mutex_init(&q->limits_lock);
>  	mutex_init(&q->rq_qos_mutex);
>  	spin_lock_init(&q->queue_lock);
>  
> diff --git a/block/blk-settings.c b/block/blk-settings.c
> index e872b0e168525e..b6692143ccf034 100644
> --- a/block/blk-settings.c
> +++ b/block/blk-settings.c
> @@ -96,6 +96,146 @@ static void blk_apply_bdi_limits(struct backing_dev_info *bdi,
>  	bdi->io_pages = lim->max_sectors >> (PAGE_SHIFT - 9);
>  }
>  
> +static int blk_validate_zoned_limits(struct queue_limits *lim)
> +{
> +	if (!lim->zoned) {
> +		if (WARN_ON_ONCE(lim->max_open_zones) ||
> +		    WARN_ON_ONCE(lim->max_active_zones) ||
> +		    WARN_ON_ONCE(lim->zone_write_granularity) ||
> +		    WARN_ON_ONCE(lim->max_zone_append_sectors))
> +			return -EINVAL;
> +		return 0;
> +	}
> +
> +	if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED)))
> +		return -EINVAL;
> +
> +	if (lim->zone_write_granularity < lim->logical_block_size)
> +		lim->zone_write_granularity = lim->logical_block_size;

This check and change needs to be against the physical block size. Otherwise,
SMR drives will choke on it.
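
I.e., presumably something like:

	if (lim->zone_write_granularity < lim->physical_block_size)
		lim->zone_write_granularity = lim->physical_block_size;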

> +
> +	if (lim->max_zone_append_sectors) {
> +		if (WARN_ON_ONCE(!lim->chunk_sectors))
> +			return -EINVAL;

chunk_sectors is the zone size. So we should probably check that it is set after
the IS_ENABLED() check earlier in the function, no ?
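
I.e., presumably moving the check up to something like:

	if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED)))
		return -EINVAL;

	if (WARN_ON_ONCE(!lim->chunk_sectors))
		return -EINVAL;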

> +		lim->max_zone_append_sectors =
> +			min3(lim->max_hw_sectors,
> +			     lim->max_zone_append_sectors,
> +			     lim->chunk_sectors);
> +	}
> +
> +	return 0;
> +}
> +
> +int blk_validate_limits(struct queue_limits *lim)
> +{
> +	unsigned int max_hw_sectors;
> +
> +	if (!lim->logical_block_size)
> +		lim->logical_block_size = SECTOR_SIZE;
> +
> +	if (!lim->physical_block_size)
> +		lim->physical_block_size = SECTOR_SIZE;
> +	if (lim->physical_block_size < lim->logical_block_size)
> +		lim->physical_block_size = lim->logical_block_size;
> +
> +	if (!lim->io_min)
> +		lim->io_min = SECTOR_SIZE;

This should be lim->logical_block_size, no ?

> +	if (lim->io_min < lim->physical_block_size)
> +		lim->io_min = lim->physical_block_size;

But then given that log <= phys, you could set io_min to physical_block_size if
it is not set.
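
I.e., presumably the io_min handling could be reduced to just:

	if (lim->io_min < lim->physical_block_size)
		lim->io_min = lim->physical_block_size;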

> +
> +	if (!lim->max_hw_sectors)
> +		lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
> +	if (WARN_ON_ONCE(lim->max_hw_sectors < PAGE_SIZE / SECTOR_SIZE))

You can use PAGE_SECTORS here.

> +		return -EINVAL;
> +
> +	lim->max_hw_sectors = round_down(lim->max_hw_sectors,
> +			lim->logical_block_size >> SECTOR_SHIFT);
> +
> +	max_hw_sectors = min_not_zero(lim->max_hw_sectors,
> +				lim->max_dev_sectors);
> +	if (lim->max_user_sectors) {
> +		if (lim->max_user_sectors > max_hw_sectors ||
> +		    lim->max_user_sectors < PAGE_SIZE / SECTOR_SIZE)

same here.

> +			return -EINVAL;
> +		lim->max_sectors = min(max_hw_sectors, lim->max_user_sectors);
> +	} else {
> +		lim->max_sectors = min(max_hw_sectors, BLK_DEF_MAX_SECTORS_CAP);
> +	}
> +
> +	lim->max_sectors = round_down(lim->max_sectors,
> +			lim->logical_block_size >> SECTOR_SHIFT);
> +
> +	if (!lim->max_segments)
> +		lim->max_segments = BLK_MAX_SEGMENTS;
> +
> +	lim->max_discard_sectors = lim->max_hw_discard_sectors;
> +	if (!lim->max_discard_segments)
> +		lim->max_discard_segments = 1;
> +
> +	if (lim->discard_granularity < lim->physical_block_size)
> +		lim->discard_granularity = lim->physical_block_size;
> +
> +	if (!lim->seg_boundary_mask)
> +		lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
> +	if (WARN_ON_ONCE(lim->seg_boundary_mask < PAGE_SIZE - 1))
> +		return -EINVAL;
> +
> +	if (!lim->max_segment_size)
> +		lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
> +	if (WARN_ON_ONCE(lim->max_segment_size < PAGE_SIZE))
> +		return -EINVAL;
> +
> +	/*
> +	 * Devices that require a virtual boundary do not support scatter/gather
> +	 * I/O natively, but instead require a descriptor list entry for each
> +	 * page (which might not be identical to the Linux PAGE_SIZE).  Because
> +	 * of that they are not limited by our notion of "segment size".
> +	 */
> +	if (lim->virt_boundary_mask) {
> +		if (WARN_ON_ONCE(lim->max_segment_size &&
> +				 lim->max_segment_size != UINT_MAX))
> +			return -EINVAL;
> +		lim->max_segment_size = UINT_MAX;
> +	}
> +
> +	if (!lim->dma_alignment)
> +		lim->dma_alignment = SECTOR_SIZE - 1;
> +	if (WARN_ON_ONCE(lim->dma_alignment > PAGE_SIZE))
> +		return -EINVAL;
> +
> +	if (lim->alignment_offset) {
> +		lim->alignment_offset &= (lim->physical_block_size - 1);
> +		lim->misaligned = 0;
> +	}
> +
> +	return blk_validate_zoned_limits(lim);
> +}
> +
> +/**
> + * queue_limits_commit_update - commit an atomic update of queue limits
> + * @q:		queue to update
> + * @lim:	limits to apply
> + *
> + * Apply the limits in @lim that were obtained from queue_limits_start_update()
> + * and updated by the caller to @q.
> + *
> + * Returns 0 if successful, else a negative error code.
> + */
> +int queue_limits_commit_update(struct request_queue *q,
> +		struct queue_limits *lim)
> +	__releases(q->limits_lock)
> +{
> +	int error = blk_validate_limits(lim);
> +
> +	if (!error) {
> +		q->limits = *lim;
> +		if (q->disk)
> +			blk_apply_bdi_limits(q->disk->bdi, lim);
> +	}
> +	mutex_unlock(&q->limits_lock);
> +	return error;
> +}
> +EXPORT_SYMBOL_GPL(queue_limits_commit_update);
> +
>  /**
>   * blk_queue_bounce_limit - set bounce buffer limit for queue
>   * @q: the request queue for the device
> diff --git a/block/blk.h b/block/blk.h
> index 1ef920f72e0f87..58b5dbac2a487d 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -447,6 +447,7 @@ static inline void bio_release_page(struct bio *bio, struct page *page)
>  		unpin_user_page(page);
>  }
>  
> +int blk_validate_limits(struct queue_limits *lim);
>  struct request_queue *blk_alloc_queue(int node_id);
>  
>  int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode);
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 4a2e82c7971c86..5b5d3b238de1e7 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -473,6 +473,7 @@ struct request_queue {
>  
>  	struct mutex		sysfs_lock;
>  	struct mutex		sysfs_dir_lock;
> +	struct mutex		limits_lock;
>  
>  	/*
>  	 * for reusing dead hctx instance in case of updating
> @@ -861,6 +862,28 @@ static inline unsigned int blk_chunk_sectors_left(sector_t offset,
>  	return chunk_sectors - (offset & (chunk_sectors - 1));
>  }
>  
> +/**
> + * queue_limits_start_update - start an atomic update of queue limits
> + * @q:		queue to update
> + *
> + * This function starts an atomic update of the queue limits.  It takes a lock
> + * to prevent other updates and returns a snapshot of the current limits that
> + * the caller can modify.  The caller must call queue_limits_commit_update()
> + * to finish the update.
> + *
> + * Context: process context.  The caller must have frozen the queue or ensured
> + * that there is no outstanding I/O by other means.
> + */
> +static inline struct queue_limits
> +queue_limits_start_update(struct request_queue *q)
> +	__acquires(q->limits_lock)
> +{
> +	mutex_lock(&q->limits_lock);
> +	return q->limits;
> +}
> +int queue_limits_commit_update(struct request_queue *q,
> +		struct queue_limits *lim);
> +
>  /*
>   * Access functions for manipulating queue properties
>   */
Christoph Hellwig Jan. 23, 2024, 8:44 a.m. UTC | #2
On Tue, Jan 23, 2024 at 01:50:32PM +0900, Damien Le Moal wrote:
> > +			return -EINVAL;
> > +		return 0;
> > +	}
> > +
> > +	if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED)))
> > +		return -EINVAL;
> > +
> > +	if (lim->zone_write_granularity < lim->logical_block_size)
> > +		lim->zone_write_granularity = lim->logical_block_size;
> 
> This check and change needs to be against the physical block size. Otherwise,
> SMR drives will choke on it.

It probably should, but this mirrors what is done in
blk_queue_zone_write_granularity, and I want to match that behavior
at least for now; we can then improve and document it once this is
the only interface for validating the limits.

> > +
> > +	if (lim->max_zone_append_sectors) {
> > +		if (WARN_ON_ONCE(!lim->chunk_sectors))
> > +			return -EINVAL;
> 
> chunk_sectors is the zone size. So we should probably check that it is set after
> the IS_ENABLED() check earlier in the function, no ?

Possibly.  In fact I'm wondering where the check comes from, as we
don't seem to have it in the existing helpers.

> > +	if (!lim->logical_block_size)
> > +		lim->logical_block_size = SECTOR_SIZE;
> > +
> > +	if (!lim->physical_block_size)
> > +		lim->physical_block_size = SECTOR_SIZE;
> > +	if (lim->physical_block_size < lim->logical_block_size)
> > +		lim->physical_block_size = lim->logical_block_size;
> > +
> > +	if (!lim->io_min)
> > +		lim->io_min = SECTOR_SIZE;
> 
> This should be lim->logical_block_size, no ?

This comes from the default in blk_set_default_limits.

> 
> > +	if (lim->io_min < lim->physical_block_size)
> > +		lim->io_min = lim->physical_block_size;
> 
> But then given that log <= phys, you could set io_min to physical_block_size if
> it is not set.

Which is what we do here, so the above is actually redundant and
can be removed.

> > +	if (!lim->max_hw_sectors)
> > +		lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
> > +	if (WARN_ON_ONCE(lim->max_hw_sectors < PAGE_SIZE / SECTOR_SIZE))
> 
> You can use PAGE_SECTORS here.

Yes.
Hannes Reinecke Jan. 24, 2024, 6:08 a.m. UTC | #3
On 1/22/24 18:36, Christoph Hellwig wrote:
> Add a new queue_limits_{start,commit}_update pair of functions that
> allows taking an atomic snapshot of the queue limits, updating it, and
> committing it if it passes validity checking.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>   block/blk-core.c       |   1 +
>   block/blk-settings.c   | 140 +++++++++++++++++++++++++++++++++++++++++
>   block/blk.h            |   1 +
>   include/linux/blkdev.h |  23 +++++++
>   4 files changed, 165 insertions(+)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 11342af420d0c4..09f4a44a4aa3cc 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -424,6 +424,7 @@ struct request_queue *blk_alloc_queue(int node_id)
>   	mutex_init(&q->debugfs_mutex);
>   	mutex_init(&q->sysfs_lock);
>   	mutex_init(&q->sysfs_dir_lock);
> +	mutex_init(&q->limits_lock);
>   	mutex_init(&q->rq_qos_mutex);
>   	spin_lock_init(&q->queue_lock);
>   
> diff --git a/block/blk-settings.c b/block/blk-settings.c
> index e872b0e168525e..b6692143ccf034 100644
> --- a/block/blk-settings.c
> +++ b/block/blk-settings.c
> @@ -96,6 +96,146 @@ static void blk_apply_bdi_limits(struct backing_dev_info *bdi,
>   	bdi->io_pages = lim->max_sectors >> (PAGE_SHIFT - 9);
>   }
>   
> +static int blk_validate_zoned_limits(struct queue_limits *lim)
> +{
> +	if (!lim->zoned) {
> +		if (WARN_ON_ONCE(lim->max_open_zones) ||
> +		    WARN_ON_ONCE(lim->max_active_zones) ||
> +		    WARN_ON_ONCE(lim->zone_write_granularity) ||
> +		    WARN_ON_ONCE(lim->max_zone_append_sectors))
> +			return -EINVAL;
> +		return 0;
> +	}
> +
> +	if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED)))
> +		return -EINVAL;
> +
> +	if (lim->zone_write_granularity < lim->logical_block_size)
> +		lim->zone_write_granularity = lim->logical_block_size;
> +
> +	if (lim->max_zone_append_sectors) {
> +		if (WARN_ON_ONCE(!lim->chunk_sectors))
> +			return -EINVAL;
> +		lim->max_zone_append_sectors =
> +			min3(lim->max_hw_sectors,
> +			     lim->max_zone_append_sectors,
> +			     lim->chunk_sectors);
> +	}
> +
> +	return 0;
> +}
> +
> +int blk_validate_limits(struct queue_limits *lim)
> +{
> +	unsigned int max_hw_sectors;
> +
> +	if (!lim->logical_block_size)
> +		lim->logical_block_size = SECTOR_SIZE;
> +
> +	if (!lim->physical_block_size)
> +		lim->physical_block_size = SECTOR_SIZE;
> +	if (lim->physical_block_size < lim->logical_block_size)
> +		lim->physical_block_size = lim->logical_block_size;
> +
> +	if (!lim->io_min)
> +		lim->io_min = SECTOR_SIZE;
> +	if (lim->io_min < lim->physical_block_size)
> +		lim->io_min = lim->physical_block_size;
> +
> +	if (!lim->max_hw_sectors)
> +		lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
> +	if (WARN_ON_ONCE(lim->max_hw_sectors < PAGE_SIZE / SECTOR_SIZE))
> +		return -EINVAL;
> +
> +	lim->max_hw_sectors = round_down(lim->max_hw_sectors,
> +			lim->logical_block_size >> SECTOR_SHIFT);
> +
> +	max_hw_sectors = min_not_zero(lim->max_hw_sectors,
> +				lim->max_dev_sectors);
> +	if (lim->max_user_sectors) {
> +		if (lim->max_user_sectors > max_hw_sectors ||
> +		    lim->max_user_sectors < PAGE_SIZE / SECTOR_SIZE)
> +			return -EINVAL;
> +		lim->max_sectors = min(max_hw_sectors, lim->max_user_sectors);
> +	} else {
> +		lim->max_sectors = min(max_hw_sectors, BLK_DEF_MAX_SECTORS_CAP);
> +	}
> +
> +	lim->max_sectors = round_down(lim->max_sectors,
> +			lim->logical_block_size >> SECTOR_SHIFT);
> +
> +	if (!lim->max_segments)
> +		lim->max_segments = BLK_MAX_SEGMENTS;
> +
> +	lim->max_discard_sectors = lim->max_hw_discard_sectors;
> +	if (!lim->max_discard_segments)
> +		lim->max_discard_segments = 1;
> +
> +	if (lim->discard_granularity < lim->physical_block_size)
> +		lim->discard_granularity = lim->physical_block_size;
> +
> +	if (!lim->seg_boundary_mask)
> +		lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
> +	if (WARN_ON_ONCE(lim->seg_boundary_mask < PAGE_SIZE - 1))
> +		return -EINVAL;
> +
> +	if (!lim->max_segment_size)
> +		lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
> +	if (WARN_ON_ONCE(lim->max_segment_size < PAGE_SIZE))
> +		return -EINVAL;
> +
> +	/*
> +	 * Devices that require a virtual boundary do not support scatter/gather
> +	 * I/O natively, but instead require a descriptor list entry for each
> +	 * page (which might not be identical to the Linux PAGE_SIZE).  Because
> +	 * of that they are not limited by our notion of "segment size".
> +	 */
> +	if (lim->virt_boundary_mask) {
> +		if (WARN_ON_ONCE(lim->max_segment_size &&
> +				 lim->max_segment_size != UINT_MAX))
> +			return -EINVAL;
> +		lim->max_segment_size = UINT_MAX;
> +	}
> +
> +	if (!lim->dma_alignment)
> +		lim->dma_alignment = SECTOR_SIZE - 1;
> +	if (WARN_ON_ONCE(lim->dma_alignment > PAGE_SIZE))
> +		return -EINVAL;
> +
> +	if (lim->alignment_offset) {
> +		lim->alignment_offset &= (lim->physical_block_size - 1);
> +		lim->misaligned = 0;
> +	}
> +
> +	return blk_validate_zoned_limits(lim);
> +}
> +
> +/**
> + * queue_limits_commit_update - commit an atomic update of queue limits
> + * @q:		queue to update
> + * @lim:	limits to apply
> + *
> + * Apply the limits in @lim that were obtained from queue_limits_start_update()
> + * and updated by the caller to @q.
> + *
> + * Returns 0 if successful, else a negative error code.
> + */
> +int queue_limits_commit_update(struct request_queue *q,
> +		struct queue_limits *lim)
> +	__releases(q->limits_lock)
> +{
> +	int error = blk_validate_limits(lim);
> +
> +	if (!error) {
> +		q->limits = *lim;
> +		if (q->disk)
> +			blk_apply_bdi_limits(q->disk->bdi, lim);
> +	}
> +	mutex_unlock(&q->limits_lock);
> +	return error;
> +}
> +EXPORT_SYMBOL_GPL(queue_limits_commit_update);
> +
>   /**
>    * blk_queue_bounce_limit - set bounce buffer limit for queue
>    * @q: the request queue for the device
> diff --git a/block/blk.h b/block/blk.h
> index 1ef920f72e0f87..58b5dbac2a487d 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -447,6 +447,7 @@ static inline void bio_release_page(struct bio *bio, struct page *page)
>   		unpin_user_page(page);
>   }
>   
> +int blk_validate_limits(struct queue_limits *lim);
>   struct request_queue *blk_alloc_queue(int node_id);
>   
>   int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode);
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 4a2e82c7971c86..5b5d3b238de1e7 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -473,6 +473,7 @@ struct request_queue {
>   
>   	struct mutex		sysfs_lock;
>   	struct mutex		sysfs_dir_lock;
> +	struct mutex		limits_lock;
>   
>   	/*
>   	 * for reusing dead hctx instance in case of updating
> @@ -861,6 +862,28 @@ static inline unsigned int blk_chunk_sectors_left(sector_t offset,
>   	return chunk_sectors - (offset & (chunk_sectors - 1));
>   }
>   
> +/**
> + * queue_limits_start_update - start an atomic update of queue limits
> + * @q:		queue to update
> + *
> + * This function starts an atomic update of the queue limits.  It takes a lock
> + * to prevent other updates and returns a snapshot of the current limits that
> + * the caller can modify.  The caller must call queue_limits_commit_update()
> + * to finish the update.
> + *
> + * Context: process context.  The caller must have frozen the queue or ensured
> + * that there is no outstanding I/O by other means.
> + */
> +static inline struct queue_limits
> +queue_limits_start_update(struct request_queue *q)
> +	__acquires(q->limits_lock)
> +{
> +	mutex_lock(&q->limits_lock);
> +	return q->limits;
> +}

I'm slightly confused about the lifetime of the returned structure.
By my understanding, the returned 'struct queue_limits' is allocated
on the stack of the caller, right?
Shouldn't we note this somewhere so that people don't start passing
the structure around, or, worse, calling 'kfree()' on it?

Cheers,

Hannes
Christoph Hellwig Jan. 24, 2024, 9:21 a.m. UTC | #4
On Wed, Jan 24, 2024 at 07:08:58AM +0100, Hannes Reinecke wrote:
>> + * that there is no outstanding I/O by other means.
>> + */
>> +static inline struct queue_limits
>> +queue_limits_start_update(struct request_queue *q)
>> +	__acquires(q->limits_lock)
>> +{
>> +	mutex_lock(&q->limits_lock);
>> +	return q->limits;
>> +}
>
> I'm slightly confused about the lifetime of the returned structure.
> By my understanding, the returned 'struct queue_limits' is allocated
> on the stack of the caller, right?
> Shouldn't we note this somewhere so that people don't start passing
> the structure around, or, worse, calling 'kfree()' on it?

queue_limits_start_update returns the structure by value.  So the
lifetime is that of whatever variable you assign it to, which had
better be on the stack.  Trying to kfree a non-pointer type will make
the compiler complain, as does passing a reference to it outside the
function these days.
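
In other words, the expected pattern is presumably just (the field and
value below are purely illustrative):

	struct queue_limits lim = queue_limits_start_update(q);

	lim.io_opt = 64 * SECTOR_SIZE;	/* illustrative modification */
	error = queue_limits_commit_update(q, &lim);
	/* no kfree(), no pointer escapes: lim simply goes out of scope */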
John Garry Jan. 25, 2024, 10:28 a.m. UTC | #5
> +
> +int blk_validate_limits(struct queue_limits *lim)
> +{
> +	unsigned int max_hw_sectors;
> +
> +	if (!lim->logical_block_size)
> +		lim->logical_block_size = SECTOR_SIZE;

nit: This function is doing a bit more than the validation that its
name suggests

> +
> +	if (!lim->physical_block_size)
> +		lim->physical_block_size = SECTOR_SIZE;
> +	if (lim->physical_block_size < lim->logical_block_size)
> +		lim->physical_block_size = lim->logical_block_size;
> +
> +	if (!lim->io_min)
> +		lim->io_min = SECTOR_SIZE;
> +	if (lim->io_min < lim->physical_block_size)
> +		lim->io_min = lim->physical_block_size;
> +
> +	if (!lim->max_hw_sectors)
> +		lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
> +	if (WARN_ON_ONCE(lim->max_hw_sectors < PAGE_SIZE / SECTOR_SIZE))
> +		return -EINVAL;

The WARN_ON_ONCE usage is odd, as it will only ever WARN once for a 
specific q, while other queues associated with other drivers may have 
the same limit issue. But I suppose if one issue is fixed, then the 
other will make itself known...

> +
> +	lim->max_hw_sectors = round_down(lim->max_hw_sectors,
> +			lim->logical_block_size >> SECTOR_SHIFT);
> +
> +	max_hw_sectors = min_not_zero(lim->max_hw_sectors,
> +				lim->max_dev_sectors);
> +	if (lim->max_user_sectors) {
> +		if (lim->max_user_sectors > max_hw_sectors ||
> +		    lim->max_user_sectors < PAGE_SIZE / SECTOR_SIZE)
> +			return -EINVAL;
> +		lim->max_sectors = min(max_hw_sectors, lim->max_user_sectors);
> +	} else {
> +		lim->max_sectors = min(max_hw_sectors, BLK_DEF_MAX_SECTORS_CAP);
> +	}
> +
>
> +/**
> + * queue_limits_commit_update - commit an atomic update of queue limits
> + * @q:		queue to update
> + * @lim:	limits to apply
> + *
> + * Apply the limits in @lim that were obtained from queue_limits_start_update()
> + * and updated by the caller to @q.
> + *
> + * Returns 0 if successful, else a negative error code.
> + */
> +int queue_limits_commit_update(struct request_queue *q,
> +		struct queue_limits *lim)
> +	__releases(q->limits_lock)
> +{
> +	int error = blk_validate_limits(lim);
> +
> +	if (!error) {
> +		q->limits = *lim;

this is duplicating what blk_alloc_queue() does

> +		if (q->disk)
> +			blk_apply_bdi_limits(q->disk->bdi, lim);
> +	}
> +	mutex_unlock(&q->limits_lock);
> +	return error;
> +}
> +EXPORT_SYMBOL_GPL(queue_limits_commit_update);
> +
>   /**
>    * blk_queue_bounce_limit - set bounce buffer limit for queue
>    * @q: the request queue for the device
> diff --git a/block/blk.h b/block/blk.h
> index 1ef920f72e0f87..58b5dbac2a487d 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -447,6 +447,7 @@ static inline void bio_release_page(struct bio *bio, struct page *page)
>   		unpin_user_page(page);
>   }
>   
> +int blk_validate_limits(struct queue_limits *lim);
>   struct request_queue *blk_alloc_queue(int node_id);
>   
>   int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode);
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 4a2e82c7971c86..5b5d3b238de1e7 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -473,6 +473,7 @@ struct request_queue {
>   
>   	struct mutex		sysfs_lock;
>   	struct mutex		sysfs_dir_lock;
> +	struct mutex		limits_lock;
>   
>   	/*
>   	 * for reusing dead hctx instance in case of updating
> @@ -861,6 +862,28 @@ static inline unsigned int blk_chunk_sectors_left(sector_t offset,
>   	return chunk_sectors - (offset & (chunk_sectors - 1));
>   }
>   
> +/**
> + * queue_limits_start_update - start an atomic update of queue limits
> + * @q:		queue to update
> + *
> + * This function starts an atomic update of the queue limits.  It takes a lock
> + * to prevent other updates and returns a snapshot of the current limits that
> + * the caller can modify.  The caller must call queue_limits_commit_update()
> + * to finish the update.
> + *
> + * Context: process context.  The caller must have frozen the queue or ensured
> + * that there is no outstanding I/O by other means.
> + */
> +static inline struct queue_limits
> +queue_limits_start_update(struct request_queue *q)
> +	__acquires(q->limits_lock)
> +{
> +	mutex_lock(&q->limits_lock);
> +	return q->limits;
> +}
> +int queue_limits_commit_update(struct request_queue *q,
> +		struct queue_limits *lim);

It ain't so nice that the code for the queue_limits_start_update() and
queue_limits_commit_update() pair is split across two files.

> +
>   /*
>    * Access functions for manipulating queue properties
>    */
Christoph Hellwig Jan. 25, 2024, 2:35 p.m. UTC | #6
On Thu, Jan 25, 2024 at 10:28:49AM +0000, John Garry wrote:
>
>> +
>> +int blk_validate_limits(struct queue_limits *lim)
>> +{
>> +	unsigned int max_hw_sectors;
>> +
>> +	if (!lim->logical_block_size)
>> +		lim->logical_block_size = SECTOR_SIZE;
>
> nit: This function is doing a bit more than the validation that its
> name suggests

Well, it validates and fixes up.  But that sucks as a name.  If you
have a good naming suggestion I can pick it up, but I couldn't come
up with a better name.

>> +	if (!lim->physical_block_size)
>> +		lim->physical_block_size = SECTOR_SIZE;
>> +	if (lim->physical_block_size < lim->logical_block_size)
>> +		lim->physical_block_size = lim->logical_block_size;
>> +
>> +	if (!lim->io_min)
>> +		lim->io_min = SECTOR_SIZE;
>> +	if (lim->io_min < lim->physical_block_size)
>> +		lim->io_min = lim->physical_block_size;
>> +
>> +	if (!lim->max_hw_sectors)
>> +		lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
>> +	if (WARN_ON_ONCE(lim->max_hw_sectors < PAGE_SIZE / SECTOR_SIZE))
>> +		return -EINVAL;
>
> The WARN_ON_ONCE usage is odd, as it will only ever WARN once for a 
> specific q, while other queues associated with other drivers may have the 
> same limit issue. But I suppose if one issue is fixed, then the other will 
> make itself known...

Yeah.  The idea is to give a loud warning for the API misuse as this
is not an error a normal user action could trigger.

>> +	int error = blk_validate_limits(lim);
>> +
>> +	if (!error) {
>> +		q->limits = *lim;
>
> this is duplicating what blk_alloc_queue() does

I originally had another helper to do the limits assignment
and the blk_apply_bdi_limits.  But as only one caller needs
the blk_apply_bdi_limits it felt a little silly.

>> +static inline struct queue_limits
>> +queue_limits_start_update(struct request_queue *q)
>> +	__acquires(q->limits_lock)
>> +{
>> +	mutex_lock(&q->limits_lock);
>> +	return q->limits;
>> +}
>> +int queue_limits_commit_update(struct request_queue *q,
>> +		struct queue_limits *lim);
>
> It ain't so nice that the code for the queue_limits_start_update() and
> queue_limits_commit_update() pair is split across two files.

We do that for a lot of APIs where part is inline.  And I really do
want queue_limits_start_update inline as passing large structs by
value generates horrible code, and for this API to work it needs
to be returned by value.  In fact it probably should be __always_inline
to avoid gcc doing stupid things with -Os.
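
I.e., presumably just:

static __always_inline struct queue_limits
queue_limits_start_update(struct request_queue *q)
	__acquires(q->limits_lock)
{
	mutex_lock(&q->limits_lock);
	return q->limits;
}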

Patch

diff --git a/block/blk-core.c b/block/blk-core.c
index 11342af420d0c4..09f4a44a4aa3cc 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -424,6 +424,7 @@  struct request_queue *blk_alloc_queue(int node_id)
 	mutex_init(&q->debugfs_mutex);
 	mutex_init(&q->sysfs_lock);
 	mutex_init(&q->sysfs_dir_lock);
+	mutex_init(&q->limits_lock);
 	mutex_init(&q->rq_qos_mutex);
 	spin_lock_init(&q->queue_lock);
 
diff --git a/block/blk-settings.c b/block/blk-settings.c
index e872b0e168525e..b6692143ccf034 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -96,6 +96,146 @@  static void blk_apply_bdi_limits(struct backing_dev_info *bdi,
 	bdi->io_pages = lim->max_sectors >> (PAGE_SHIFT - 9);
 }
 
+static int blk_validate_zoned_limits(struct queue_limits *lim)
+{
+	if (!lim->zoned) {
+		if (WARN_ON_ONCE(lim->max_open_zones) ||
+		    WARN_ON_ONCE(lim->max_active_zones) ||
+		    WARN_ON_ONCE(lim->zone_write_granularity) ||
+		    WARN_ON_ONCE(lim->max_zone_append_sectors))
+			return -EINVAL;
+		return 0;
+	}
+
+	if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED)))
+		return -EINVAL;
+
+	if (lim->zone_write_granularity < lim->logical_block_size)
+		lim->zone_write_granularity = lim->logical_block_size;
+
+	if (lim->max_zone_append_sectors) {
+		if (WARN_ON_ONCE(!lim->chunk_sectors))
+			return -EINVAL;
+		lim->max_zone_append_sectors =
+			min3(lim->max_hw_sectors,
+			     lim->max_zone_append_sectors,
+			     lim->chunk_sectors);
+	}
+
+	return 0;
+}
+
+int blk_validate_limits(struct queue_limits *lim)
+{
+	unsigned int max_hw_sectors;
+
+	if (!lim->logical_block_size)
+		lim->logical_block_size = SECTOR_SIZE;
+
+	if (!lim->physical_block_size)
+		lim->physical_block_size = SECTOR_SIZE;
+	if (lim->physical_block_size < lim->logical_block_size)
+		lim->physical_block_size = lim->logical_block_size;
+
+	if (!lim->io_min)
+		lim->io_min = SECTOR_SIZE;
+	if (lim->io_min < lim->physical_block_size)
+		lim->io_min = lim->physical_block_size;
+
+	if (!lim->max_hw_sectors)
+		lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
+	if (WARN_ON_ONCE(lim->max_hw_sectors < PAGE_SIZE / SECTOR_SIZE))
+		return -EINVAL;
+
+	lim->max_hw_sectors = round_down(lim->max_hw_sectors,
+			lim->logical_block_size >> SECTOR_SHIFT);
+
+	max_hw_sectors = min_not_zero(lim->max_hw_sectors,
+				lim->max_dev_sectors);
+	if (lim->max_user_sectors) {
+		if (lim->max_user_sectors > max_hw_sectors ||
+		    lim->max_user_sectors < PAGE_SIZE / SECTOR_SIZE)
+			return -EINVAL;
+		lim->max_sectors = min(max_hw_sectors, lim->max_user_sectors);
+	} else {
+		lim->max_sectors = min(max_hw_sectors, BLK_DEF_MAX_SECTORS_CAP);
+	}
+
+	lim->max_sectors = round_down(lim->max_sectors,
+			lim->logical_block_size >> SECTOR_SHIFT);
+
+	if (!lim->max_segments)
+		lim->max_segments = BLK_MAX_SEGMENTS;
+
+	lim->max_discard_sectors = lim->max_hw_discard_sectors;
+	if (!lim->max_discard_segments)
+		lim->max_discard_segments = 1;
+
+	if (lim->discard_granularity < lim->physical_block_size)
+		lim->discard_granularity = lim->physical_block_size;
+
+	if (!lim->seg_boundary_mask)
+		lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
+	if (WARN_ON_ONCE(lim->seg_boundary_mask < PAGE_SIZE - 1))
+		return -EINVAL;
+
+	if (!lim->max_segment_size)
+		lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
+	if (WARN_ON_ONCE(lim->max_segment_size < PAGE_SIZE))
+		return -EINVAL;
+
+	/*
+	 * Devices that require a virtual boundary do not support scatter/gather
+	 * I/O natively, but instead require a descriptor list entry for each
+	 * page (which might not be identical to the Linux PAGE_SIZE).  Because
+	 * of that they are not limited by our notion of "segment size".
+	 */
+	if (lim->virt_boundary_mask) {
+		if (WARN_ON_ONCE(lim->max_segment_size &&
+				 lim->max_segment_size != UINT_MAX))
+			return -EINVAL;
+		lim->max_segment_size = UINT_MAX;
+	}
+
+	if (!lim->dma_alignment)
+		lim->dma_alignment = SECTOR_SIZE - 1;
+	if (WARN_ON_ONCE(lim->dma_alignment > PAGE_SIZE))
+		return -EINVAL;
+
+	if (lim->alignment_offset) {
+		lim->alignment_offset &= (lim->physical_block_size - 1);
+		lim->misaligned = 0;
+	}
+
+	return blk_validate_zoned_limits(lim);
+}
+
+/**
+ * queue_limits_commit_update - commit an atomic update of queue limits
+ * @q:		queue to update
+ * @lim:	limits to apply
+ *
+ * Apply the limits in @lim that were obtained from queue_limits_start_update()
+ * and updated by the caller to @q.
+ *
+ * Returns 0 if successful, else a negative error code.
+ */
+int queue_limits_commit_update(struct request_queue *q,
+		struct queue_limits *lim)
+	__releases(q->limits_lock)
+{
+	int error = blk_validate_limits(lim);
+
+	if (!error) {
+		q->limits = *lim;
+		if (q->disk)
+			blk_apply_bdi_limits(q->disk->bdi, lim);
+	}
+	mutex_unlock(&q->limits_lock);
+	return error;
+}
+EXPORT_SYMBOL_GPL(queue_limits_commit_update);
+
 /**
  * blk_queue_bounce_limit - set bounce buffer limit for queue
  * @q: the request queue for the device
diff --git a/block/blk.h b/block/blk.h
index 1ef920f72e0f87..58b5dbac2a487d 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -447,6 +447,7 @@  static inline void bio_release_page(struct bio *bio, struct page *page)
 		unpin_user_page(page);
 }
 
+int blk_validate_limits(struct queue_limits *lim);
 struct request_queue *blk_alloc_queue(int node_id);
 
 int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4a2e82c7971c86..5b5d3b238de1e7 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -473,6 +473,7 @@  struct request_queue {
 
 	struct mutex		sysfs_lock;
 	struct mutex		sysfs_dir_lock;
+	struct mutex		limits_lock;
 
 	/*
 	 * for reusing dead hctx instance in case of updating
@@ -861,6 +862,28 @@  static inline unsigned int blk_chunk_sectors_left(sector_t offset,
 	return chunk_sectors - (offset & (chunk_sectors - 1));
 }
 
+/**
+ * queue_limits_start_update - start an atomic update of queue limits
+ * @q:		queue to update
+ *
+ * This function starts an atomic update of the queue limits.  It takes a lock
+ * to prevent other updates and returns a snapshot of the current limits that
+ * the caller can modify.  The caller must call queue_limits_commit_update()
+ * to finish the update.
+ *
+ * Context: process context.  The caller must have frozen the queue or ensured
+ * that there is no outstanding I/O by other means.
+ */
+static inline struct queue_limits
+queue_limits_start_update(struct request_queue *q)
+	__acquires(q->limits_lock)
+{
+	mutex_lock(&q->limits_lock);
+	return q->limits;
+}
+int queue_limits_commit_update(struct request_queue *q,
+		struct queue_limits *lim);
+
 /*
  * Access functions for manipulating queue properties
  */