diff mbox series

[v3,09/30] block: Pre-allocate zone write plugs

Message ID 20240328004409.594888-10-dlemoal@kernel.org (mailing list archive)
State New, archived
Headers show
Series Zone write plugging | expand

Commit Message

Damien Le Moal March 28, 2024, 12:43 a.m. UTC
Allocating zone write plugs using kmalloc() does not guarantee that
enough write plugs can be allocated to simultaneously write up to
the maximum number of active zones or maximum number of open zones of
a zoned block device.

Avoid any issue with memory allocation by pre-allocating zone write
plugs up to the disk maximum number of open zones or maximum number of
active zones, whichever is larger. For zoned devices that do not have
open or active zone limits, the default 128 is used as the number of
write plugs to pre-allocate.

Pre-allocated zone write plugs are managed using a free list. If a
change to the device zone limits is detected, the disk free list is
grown if needed when blk_revalidate_disk_zones() is executed.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 block/blk-zoned.c      | 124 ++++++++++++++++++++++++++++++++++++-----
 include/linux/blkdev.h |   2 +
 2 files changed, 113 insertions(+), 13 deletions(-)

Comments

Christoph Hellwig March 28, 2024, 4:30 a.m. UTC | #1
I think this should go into the previous patch, splitting it
out just causes confusion.

> +static void disk_free_zone_wplug(struct blk_zone_wplug *zwplug)
> +{
> +	struct gendisk *disk = zwplug->disk;
> +	unsigned long flags;
> +
> +	if (zwplug->flags & BLK_ZONE_WPLUG_NEEDS_FREE) {
> +		kfree(zwplug);
> +		return;
> +	}
> +
> +	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
> +	list_add_tail(&zwplug->link, &disk->zone_wplugs_free_list);
> +	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
> +}
> +
>  static bool disk_insert_zone_wplug(struct gendisk *disk,
>  				   struct blk_zone_wplug *zwplug)
>  {
> @@ -630,18 +665,24 @@ static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
>  	return zwplug;
>  }
>  
> +static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
> +{
> +	struct blk_zone_wplug *zwplug =
> +		container_of(rcu_head, struct blk_zone_wplug, rcu_head);
> +
> +	disk_free_zone_wplug(zwplug);
> +}

Please verify my idea carefully, but I think we can do without the
RCU grace period and thus the rcu_head in struct blk_zone_wplug:

When the zwplug is removed from the hash, we set the
BLK_ZONE_WPLUG_UNHASHED flag under disk->zone_wplugs_lock.  Once
callers see that flag, any lookup that modifies the structure
will fail/wait.  If we then just clear BLK_ZONE_WPLUG_UNHASHED after
the final put in disk_put_zone_wplug when we know the bio list is
empty and no other state is kept (if there might be flags left
we should clear them before), it is perfectly fine for the
zwplug to get reused for another zone at this point.
Damien Le Moal March 28, 2024, 5:28 a.m. UTC | #2
On 3/28/24 13:30, Christoph Hellwig wrote:
> I think this should go into the previous patch, splitting it
> out just causes confusion.
> 
>> +static void disk_free_zone_wplug(struct blk_zone_wplug *zwplug)
>> +{
>> +	struct gendisk *disk = zwplug->disk;
>> +	unsigned long flags;
>> +
>> +	if (zwplug->flags & BLK_ZONE_WPLUG_NEEDS_FREE) {
>> +		kfree(zwplug);
>> +		return;
>> +	}
>> +
>> +	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
>> +	list_add_tail(&zwplug->link, &disk->zone_wplugs_free_list);
>> +	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
>> +}
>> +
>>  static bool disk_insert_zone_wplug(struct gendisk *disk,
>>  				   struct blk_zone_wplug *zwplug)
>>  {
>> @@ -630,18 +665,24 @@ static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
>>  	return zwplug;
>>  }
>>  
>> +static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
>> +{
>> +	struct blk_zone_wplug *zwplug =
>> +		container_of(rcu_head, struct blk_zone_wplug, rcu_head);
>> +
>> +	disk_free_zone_wplug(zwplug);
>> +}
> 
> Please verify my idea carefully, but I think we can do without the
> RCU grace period and thus the rcu_head in struct blk_zone_wplug:
> 
> When the zwplug is removed from the hash, we set the
> BLK_ZONE_WPLUG_UNHASHED flag under disk->zone_wplugs_lock.  Once
> callers see that flag, any lookup that modifies the structure
> will fail/wait.  If we then just clear BLK_ZONE_WPLUG_UNHASHED after
> the final put in disk_put_zone_wplug when we know the bio list is
> empty and no other state is kept (if there might be flags left
> we should clear them before), it is perfectly fine for the
> zwplug to get reused for another zone at this point.

That was my thinking initially as well, which is why I did not have the grace
period. However, getting a reference on a plug is not done under
disk->zone_wplugs_lock and is thus racy, albeit with a super tiny time window:
the hash table lookup may "see" a plug that has already been removed and has a
refcount dropped to 0 already. The use of atomic_inc_not_zero() prevents us from
trying to keep using that stale plug, but we *are* referencing it. So without
the grace period, I think there is a risk (again, super tiny window) that we
start reusing the plug, or kfree it while atomic_inc_not_zero() is executing...
Am I overthinking this?
Christoph Hellwig March 28, 2024, 5:46 a.m. UTC | #3
On Thu, Mar 28, 2024 at 02:28:40PM +0900, Damien Le Moal wrote:
> That was my thinking initially as well, which is why I did not have the
> grace period. However, getting a reference on a plug is not done under
> disk->zone_wplugs_lock and is thus racy, albeit with a super tiny time
> window: the hash table lookup may "see" a plug that has already been
> removed and has a refcount dropped to 0 already. The use of
> atomic_inc_not_zero() prevents us from trying to keep using that stale
> plug, but we *are* referencing it. So without the grace period, I think
> there is a risk (again, super tiny window) that we start reusing the
> plug, or kfree it while atomic_inc_not_zero() is executing...
> Am I overthinking this?

Well.  All the lookups fail (or should fail) when BLK_ZONE_WPLUG_UNHASHED
is set, probably even before trying to grab a reference.  So all
the lookups for a zone that is being torn down will fail.  Now once
the actual final reference is dropped, we'll now need to clear
BLK_ZONE_WPLUG_UNHASHED and lookup can happen again.  We'd have a race
window there, but I guess we can plug it by checking for the right
zone number?  If we do it while it already got reused, that'll still fail
the lookup.
Damien Le Moal March 28, 2024, 6:02 a.m. UTC | #4
On 3/28/24 14:46, Christoph Hellwig wrote:
> On Thu, Mar 28, 2024 at 02:28:40PM +0900, Damien Le Moal wrote:
>> That was my thinking initially as well, which is why I did not have the
>> grace period. However, getting a reference on a plug is not done under
>> disk->zone_wplugs_lock and is thus racy, albeit with a super tiny time
>> window: the hash table lookup may "see" a plug that has already been
>> removed and has a refcount dropped to 0 already. The use of
>> atomic_inc_not_zero() prevents us from trying to keep using that stale
>> plug, but we *are* referencing it. So without the grace period, I think
>> there is a risk (again, super tiny window) that we start reusing the
>> plug, or kfree it while atomic_inc_not_zero() is executing...
>> Am I overthinking this?
> 
> Well.  All the lookups fail (or should fail) when BLK_ZONE_WPLUG_UNHASHED
> is set, probably even before trying to grab a reference.  So all
> the lookups for a zone that is being torn down will fail.  Now once
> the actual final reference is dropped, we'll now need to clear
> BLK_ZONE_WPLUG_UNHASHED and lookup can happen again.  We'd have a race
> window there, but I guess we can plug it by checking for the right
> zone number?  If we do it while it already got reused, that'll still fail
> the lookup.

But that is the problem: "checking the zone number again" means referencing the
plug struct again from the lookup context while the last ref drop context is
freeing the plug. That race can be lost by the lookup context and lead to
referencing freed memory. So your solution would be OK for pre-allocated plugs
only. For kmalloc-ed() plugs, we still need the rcu grace period for free. So we
can only optimize for the pre-allocated plugs...
Christoph Hellwig March 28, 2024, 6:03 a.m. UTC | #5
On Thu, Mar 28, 2024 at 03:02:54PM +0900, Damien Le Moal wrote:
> But that is the problem: "checking the zone number again" means referencing the
> plug struct again from the lookup context while the last ref drop context is
> freeing the plug. That race can be lost by the lookup context and lead to
> referencing freed memory. So your solution would be OK for pre-allocated plugs
> only.

Not if it is done in the RCU critical section.

> For kmalloc-ed() plugs, we still need the rcu grace period for free. So we
> can only optimize for the pre-allocated plugs...

Yes, but it can use kfree_rcu which doesn't need the rcu_head in the
zwplug.
Damien Le Moal March 28, 2024, 6:18 a.m. UTC | #6
On 3/28/24 15:03, Christoph Hellwig wrote:
> On Thu, Mar 28, 2024 at 03:02:54PM +0900, Damien Le Moal wrote:
>> But that is the problem: "checking the zone number again" means referencing the
>> plug struct again from the lookup context while the last ref drop context is
>> freeing the plug. That race can be lost by the lookup context and lead to
>> referencing freed memory. So your solution would be OK for pre-allocated plugs
>> only.
> 
> Not if it is done in the RCU critical section.
> 
>> For kmalloc-ed() plugs, we still need the rcu grace period for free. So we
>> can only optimize for the pre-allocated plugs...
> 
> Yes, but it can use kfree_rcu which doesn't need the rcu_head in the
> zwplug.

Unfortunately, it does. kfree_rcu() is a 2 argument macro: address and rcu head
to use... The only thing we could drop from the plug struct is the gendisk pointer.
Christoph Hellwig March 28, 2024, 6:22 a.m. UTC | #7
On Thu, Mar 28, 2024 at 03:18:46PM +0900, Damien Le Moal wrote:
> > Yes, but it can use kfree_rcu which doesn't need the rcu_head in the
> > zwplug.
> 
> Unfortunately, it does. kfree_rcu() is a 2 argument macro: address and rcu head
> to use... The only thing we could drop from the plug struct is the gendisk pointer.

It used to have a one argument version.  Oh, that recently got renamed
to kfree_rcu_mightsleep.  Which seems like a somewhat odd name, but
it's still there and what I meant.
Damien Le Moal March 28, 2024, 6:33 a.m. UTC | #8
On 3/28/24 15:22, Christoph Hellwig wrote:
> On Thu, Mar 28, 2024 at 03:18:46PM +0900, Damien Le Moal wrote:
>>> Yes, but it can use kfree_rcu which doesn't need the rcu_head in the
>>> zwplug.
>>
>> Unfortunately, it does. kfree_rcu() is a 2 argument macro: address and rcu head
>> to use... The only thing we could drop from the plug struct is the gendisk pointer.
> 
> It used to have a one argument version.  Oh, that recently got renamed
> to kfree_rcu_mightsleep.  Which seems like a somewhat odd name, but
> it's still there and what I meant.

Ha. OK. I did not see that one. But that means that the plug kfree() can then
block the caller. Given that the last ref drop may happen from BIO completion
context (when the last write to a zone making the zone full complete), I do not
think we can use this function...
Christoph Hellwig March 28, 2024, 6:38 a.m. UTC | #9
On Thu, Mar 28, 2024 at 03:33:13PM +0900, Damien Le Moal wrote:
> Ha. OK. I did not see that one. But that means that the plug kfree() can then
> block the caller. Given that the last ref drop may happen from BIO completion
> context (when the last write to a zone making the zone full complete), I do not
> think we can use this function...

Ah, damn.  So yes, we probably still need the rcu head.  We can kill
the gendisk pointer, though.  Or just stick with the existing version
and don't bother with the micro-optimization, at which point the
mempool might actually be the simpler implementation?
Damien Le Moal March 28, 2024, 6:51 a.m. UTC | #10
On 3/28/24 15:38, Christoph Hellwig wrote:
> On Thu, Mar 28, 2024 at 03:33:13PM +0900, Damien Le Moal wrote:
>> Ha. OK. I did not see that one. But that means that the plug kfree() can then
>> block the caller. Given that the last ref drop may happen from BIO completion
>> context (when the last write to a zone making the zone full complete), I do not
>> think we can use this function...
> 
> Ah, damn.  So yes, we probably still need the rcu head.  We can kill
> the gendisk pointer, though.  Or just stick with the existing version
> and don't bother with the micro-optimization, at which point the
> mempool might actually be the simpler implementation?

I am all for not micro-optimizing the free path right now.
I am not so sure about the mempool being simpler... And I do see some
improvements in perf for SMR HDDs with the free list. Could be noise though but
it feels a little more solid perf-wise. I have not seen any benefit for faster
devices with the free list though...

If you prefer the mempool, I can go back to using it though, not a big deal.

For other micro-optimizations worth looking at later would be to try out the new
low latency workqueues for the plug BIO work.
Christoph Hellwig March 28, 2024, 6:52 a.m. UTC | #11
On Thu, Mar 28, 2024 at 03:51:09PM +0900, Damien Le Moal wrote:
> I am all for not micro-optimizing the free path right now.
> I am not so sure about the mempool being simpler... And I do see some
> improvements in perf for SMR HDDs with the free list. Could be noise though but
> it feels a little more solid perf-wise. I have not seen any benefit for faster
> devices with the free list though...
> 
> If you prefer the mempool, I can go back to using it though, not a big deal.

A capped free list + dynamic allocation beyond it is exactly what the
mempool is, so reimplementing seems a bit silly.
Damien Le Moal March 28, 2024, 6:53 a.m. UTC | #12
On 3/28/24 15:52, Christoph Hellwig wrote:
> On Thu, Mar 28, 2024 at 03:51:09PM +0900, Damien Le Moal wrote:
>> I am all for not micro-optimizing the free path right now.
>> I am not so sure about the mempool being simpler... And I do see some
>> improvements in perf for SMR HDDs with the free list. Could be noise though but
>> it feels a little more solid perf-wise. I have not seen any benefit for faster
>> devices with the free list though...
>>
>> If you prefer the mempool, I can go back to using it though, not a big deal.
> 
> A capped free list + dynamic allocation beyond it is exactly what the
> mempool is, so reimplementing seems a bit silly.

OK. Putting it back then.
Bart Van Assche March 28, 2024, 10:25 p.m. UTC | #13
On 3/27/24 9:30 PM, Christoph Hellwig wrote:
> Please verify my idea carefully, but I think we can do without the
> RCU grace period and thus the rcu_head in struct blk_zone_wplug:
> 
> When the zwplug is removed from the hash, we set the
> BLK_ZONE_WPLUG_UNHASHED flag under disk->zone_wplugs_lock.  Once
> callers see that flag, any lookup that modifies the structure
> will fail/wait.  If we then just clear BLK_ZONE_WPLUG_UNHASHED after
> the final put in disk_put_zone_wplug when we know the bio list is
> empty and no other state is kept (if there might be flags left
> we should clear them before), it is perfectly fine for the
> zwplug to get reused for another zone at this point.

Hi Christoph,

I don't think this is allowed without grace period between kfree()
and reusing a zwplug because another thread might be iterating over
the hlist while only holding an RCU reader lock.

Thanks,

Bart.
Bart Van Assche March 28, 2024, 10:29 p.m. UTC | #14
On 3/27/24 5:43 PM, Damien Le Moal wrote:
> Allocating zone write plugs using kmalloc() does not guarantee that
> enough write plugs can be allocated to simultaneously write up to
> the maximum number of active zones or maximum number of open zones of
> a zoned block device.
> 
> Avoid any issue with memory allocation by pre-allocating zone write
> plugs up to the disk maximum number of open zones or maximum number of
> active zones, whichever is larger. For zoned devices that do not have
> open or active zone limits, the default 128 is used as the number of
> write plugs to pre-allocate.
> 
> Pre-allocated zone write plugs are managed using a free list. If a
> change to the device zone limits is detected, the disk free list is
> grown if needed when blk_revalidate_disk_zones() is executed.

Is there a way to retry bio submission if allocating a zone write plug
fails? Would that make it possible to drop this patch?

Thanks,

Bart.
Damien Le Moal March 28, 2024, 10:33 p.m. UTC | #15
On 3/29/24 07:29, Bart Van Assche wrote:
> On 3/27/24 5:43 PM, Damien Le Moal wrote:
>> Allocating zone write plugs using kmalloc() does not guarantee that
>> enough write plugs can be allocated to simultaneously write up to
>> the maximum number of active zones or maximum number of open zones of
>> a zoned block device.
>>
>> Avoid any issue with memory allocation by pre-allocating zone write
>> plugs up to the disk maximum number of open zones or maximum number of
>> active zones, whichever is larger. For zoned devices that do not have
>> open or active zone limits, the default 128 is used as the number of
>> write plugs to pre-allocate.
>>
>> Pre-allocated zone write plugs are managed using a free list. If a
>> change to the device zone limits is detected, the disk free list is
>> grown if needed when blk_revalidate_disk_zones() is executed.
> 
> Is there a way to retry bio submission if allocating a zone write plug
> fails? Would that make it possible to drop this patch?

This patch is merged into the main zone write plugging patch in v4 (about to
post it) and the free list is replaced with a mempool.
Note that for BIOs that do not have REQ_NOWAIT, the allocation is done with
GFP_NOIO. If that fails, the OOM killer is probably already wrecking the system...

> 
> Thanks,
> 
> Bart.
>
diff mbox series

Patch

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 03083522df84..3084dae5408e 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -39,7 +39,8 @@  static const char *const zone_cond_name[] = {
 /*
  * Per-zone write plug.
  * @node: hlist_node structure for managing the plug using a hash table.
- * @link: To list the plug in the zone write plug error list of the disk.
+ * @link: To list the plug in the zone write plug free list or error list of
+ *        the disk.
  * @ref: Zone write plug reference counter. A zone write plug reference is
  *       always at least 1 when the plug is hashed in the disk plug hash table.
  *       The reference is incremented whenever a new BIO needing plugging is
@@ -57,6 +58,7 @@  static const char *const zone_cond_name[] = {
  * @bio_list: The list of BIOs that are currently plugged.
  * @bio_work: Work struct to handle issuing of plugged BIOs
  * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
+ * @disk: The gendisk the plug belongs to.
  */
 struct blk_zone_wplug {
 	struct hlist_node	node;
@@ -69,6 +71,7 @@  struct blk_zone_wplug {
 	struct bio_list		bio_list;
 	struct work_struct	bio_work;
 	struct rcu_head		rcu_head;
+	struct gendisk		*disk;
 };
 
 /*
@@ -85,10 +88,14 @@  struct blk_zone_wplug {
  *    to prevent new references to the zone write plug to be taken for
  *    newly incoming BIOs. A zone write plug flagged with this flag will be
  *    freed once all remaining references from BIOs or functions are dropped.
+ *  - BLK_ZONE_WPLUG_NEEDS_FREE: Indicates that the zone write plug was
+ *    dynamically allocated and needs to be freed instead of returned to the
+ *    free list of zone write plugs of the disk.
  */
 #define BLK_ZONE_WPLUG_PLUGGED		(1U << 0)
 #define BLK_ZONE_WPLUG_ERROR		(1U << 1)
 #define BLK_ZONE_WPLUG_UNHASHED		(1U << 2)
+#define BLK_ZONE_WPLUG_NEEDS_FREE	(1U << 3)
 
 #define BLK_ZONE_WPLUG_BUSY	(BLK_ZONE_WPLUG_PLUGGED | BLK_ZONE_WPLUG_ERROR)
 
@@ -519,23 +526,51 @@  static void disk_init_zone_wplug(struct gendisk *disk,
 	zwplug->wp_offset = sector & (disk->queue->limits.chunk_sectors - 1);
 	bio_list_init(&zwplug->bio_list);
 	INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
+	zwplug->disk = disk;
 }
 
 static struct blk_zone_wplug *disk_alloc_zone_wplug(struct gendisk *disk,
 						sector_t sector, gfp_t gfp_mask)
 {
-	struct blk_zone_wplug *zwplug;
+	struct blk_zone_wplug *zwplug = NULL;
+	unsigned int zwp_flags = 0;
+	unsigned long flags;
 
-	/* Allocate a new zone write plug. */
-	zwplug = kmalloc(sizeof(struct blk_zone_wplug), gfp_mask);
-	if (!zwplug)
-		return NULL;
+	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+	zwplug = list_first_entry_or_null(&disk->zone_wplugs_free_list,
+					  struct blk_zone_wplug, link);
+	if (zwplug)
+		list_del_init(&zwplug->link);
+	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
 
-	disk_init_zone_wplug(disk, zwplug, 0, sector);
+	if (!zwplug) {
+		/* Allocate a new zone write plug. */
+		zwplug = kmalloc(sizeof(struct blk_zone_wplug), gfp_mask);
+		if (!zwplug)
+			return NULL;
+		zwp_flags = BLK_ZONE_WPLUG_NEEDS_FREE;
+	}
+
+	disk_init_zone_wplug(disk, zwplug, zwp_flags, sector);
 
 	return zwplug;
 }
 
+static void disk_free_zone_wplug(struct blk_zone_wplug *zwplug)
+{
+	struct gendisk *disk = zwplug->disk;
+	unsigned long flags;
+
+	if (zwplug->flags & BLK_ZONE_WPLUG_NEEDS_FREE) {
+		kfree(zwplug);
+		return;
+	}
+
+	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+	list_add_tail(&zwplug->link, &disk->zone_wplugs_free_list);
+	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+}
+
 static bool disk_insert_zone_wplug(struct gendisk *disk,
 				   struct blk_zone_wplug *zwplug)
 {
@@ -630,18 +665,24 @@  static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
 	return zwplug;
 }
 
+static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
+{
+	struct blk_zone_wplug *zwplug =
+		container_of(rcu_head, struct blk_zone_wplug, rcu_head);
+
+	disk_free_zone_wplug(zwplug);
+}
+
 static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
 {
 	if (atomic_dec_and_test(&zwplug->ref)) {
 		WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));
 		WARN_ON_ONCE(!list_empty(&zwplug->link));
 
-		kfree_rcu(zwplug, rcu_head);
+		call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
 	}
 }
 
-static void blk_zone_wplug_bio_work(struct work_struct *work);
-
 /*
  * Get a reference on the write plug for the zone containing @sector.
  * If the plug does not exist, it is allocated and hashed.
@@ -684,7 +725,7 @@  static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
 	 */
 	if (!disk_insert_zone_wplug(disk, zwplug)) {
 		spin_unlock_irqrestore(&zwplug->lock, *flags);
-		kfree(zwplug);
+		disk_free_zone_wplug(zwplug);
 		goto again;
 	}
 
@@ -1401,6 +1442,30 @@  static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
 	return 1U << disk->zone_wplugs_hash_bits;
 }
 
+static int disk_alloc_zone_wplugs(struct gendisk *disk,
+				  unsigned int max_nr_zwplugs)
+{
+	struct blk_zone_wplug *zwplug;
+	unsigned int i;
+
+	if (!disk->zone_wplugs_hash)
+		return 0;
+
+	/* Pre-allocate zone write plugs */
+	for (i = 0; i < max_nr_zwplugs; i++) {
+		zwplug = kmalloc(sizeof(struct blk_zone_wplug), GFP_KERNEL);
+		if (!zwplug)
+			return -ENOMEM;
+		disk_init_zone_wplug(disk, zwplug, 0, 0);
+
+		list_add_tail(&zwplug->link, &disk->zone_wplugs_free_list);
+	}
+
+	disk->zone_wplugs_max_nr += max_nr_zwplugs;
+
+	return 0;
+}
+
 static void disk_free_zone_wplugs(struct gendisk *disk)
 {
 	struct blk_zone_wplug *zwplug;
@@ -1422,11 +1487,22 @@  static void disk_free_zone_wplugs(struct gendisk *disk)
 
 	/* Wait for the zone write plugs to be RCU-freed. */
 	rcu_barrier();
+
+	while (!list_empty(&disk->zone_wplugs_free_list)) {
+		zwplug = list_first_entry(&disk->zone_wplugs_free_list,
+					  struct blk_zone_wplug, link);
+		list_del_init(&zwplug->link);
+
+		kfree(zwplug);
+	}
+
+	disk->zone_wplugs_max_nr = 0;
 }
 
 void disk_init_zone_resources(struct gendisk *disk)
 {
 	spin_lock_init(&disk->zone_wplugs_lock);
+	INIT_LIST_HEAD(&disk->zone_wplugs_free_list);
 	INIT_LIST_HEAD(&disk->zone_wplugs_err_list);
 	INIT_WORK(&disk->zone_wplugs_work, disk_zone_wplugs_work);
 }
@@ -1444,6 +1520,7 @@  static int disk_alloc_zone_resources(struct gendisk *disk,
 				     unsigned int max_nr_zwplugs)
 {
 	unsigned int i;
+	int ret;
 
 	disk->zone_wplugs_hash_bits =
 		min(ilog2(max_nr_zwplugs) + 1, BLK_ZONE_MAX_WPLUG_HASH_BITS);
@@ -1457,6 +1534,15 @@  static int disk_alloc_zone_resources(struct gendisk *disk,
 	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
 		INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]);
 
+	ret = disk_alloc_zone_wplugs(disk, max_nr_zwplugs);
+	if (ret) {
+		disk_free_zone_wplugs(disk);
+		kfree(disk->zone_wplugs_hash);
+		disk->zone_wplugs_hash = NULL;
+		disk->zone_wplugs_hash_bits = 0;
+		return ret;
+	}
+
 	return 0;
 }
 
@@ -1484,6 +1570,7 @@  static int disk_revalidate_zone_resources(struct gendisk *disk,
 {
 	struct queue_limits *lim = &disk->queue->limits;
 	unsigned int max_nr_zwplugs;
+	int ret;
 
 	/*
 	 * If the device has no limit on the maximum number of open and active
@@ -1495,8 +1582,19 @@  static int disk_revalidate_zone_resources(struct gendisk *disk,
 		max_nr_zwplugs =
 			min(BLK_ZONE_DEFAULT_MAX_NR_WPLUGS, nr_zones);
 
-	if (!disk->zone_wplugs_hash)
-		return disk_alloc_zone_resources(disk, max_nr_zwplugs);
+	if (!disk->zone_wplugs_hash) {
+		ret = disk_alloc_zone_resources(disk, max_nr_zwplugs);
+		if (ret)
+			return ret;
+	}
+
+	/* Grow the free list of zone write plugs if needed. */
+	if (disk->zone_wplugs_max_nr < max_nr_zwplugs) {
+		ret = disk_alloc_zone_wplugs(disk,
+				max_nr_zwplugs - disk->zone_wplugs_max_nr);
+		if (ret)
+			return ret;
+	}
 
 	return 0;
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6faa1abe8506..962ee0496659 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -194,9 +194,11 @@  struct gendisk {
 	unsigned int		zone_capacity;
 	unsigned long		*conv_zones_bitmap;
 	unsigned long		*seq_zones_wlock;
+	unsigned int		zone_wplugs_max_nr;
 	unsigned int            zone_wplugs_hash_bits;
 	spinlock_t              zone_wplugs_lock;
 	struct hlist_head       *zone_wplugs_hash;
+	struct list_head        zone_wplugs_free_list;
 	struct list_head        zone_wplugs_err_list;
 	struct work_struct	zone_wplugs_work;
 #endif /* CONFIG_BLK_DEV_ZONED */