
[v2,2/8] fs/buffer: try to use folio lock for pagecache lookups

Message ID 20250410014945.2140781-3-mcgrof@kernel.org (mailing list archive)
State New
Series: mm: enhance migration work around on noref buffer-heads

Commit Message

Luis Chamberlain April 10, 2025, 1:49 a.m. UTC
From: Davidlohr Bueso <dave@stgolabs.net>

Callers of __find_get_block() may or may not allow blocking
semantics; it is currently assumed that they do not. Lay out two
paths based on this: the i_private_lock scheme remains the fallback
for non-blocking contexts, while blocking contexts always take the
folio lock instead. This also implements the trylock idea suggested
in the old comment, potentially reducing i_private_lock contention
as well as later enabling migration support for large folios and
noref buffer-heads.

No change in semantics: all current lookup users are non-blocking.

Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 fs/buffer.c | 45 +++++++++++++++++++++++++++++----------------
 1 file changed, 29 insertions(+), 16 deletions(-)
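
For orientation, the caller-facing split this series is building toward looks
roughly like the sketch below. Only __find_get_block() is converted in this
patch; the blocking wrapper and its name are an assumption about a later patch
in the series, not something 2/8 adds:

	/* Existing non-blocking lookup: may run in atomic context, so
	 * __find_get_block_slow() must be able to avoid sleeping. */
	struct buffer_head *
	__find_get_block(struct block_device *bdev, sector_t block,
			 unsigned size)
	{
		return find_get_block_common(bdev, block, size, true);
	}

	/* Hypothetical blocking lookup: allowed to sleep on the folio
	 * lock instead of falling back to i_private_lock. */
	struct buffer_head *
	__find_get_block_nonatomic(struct block_device *bdev, sector_t block,
				   unsigned size)
	{
		return find_get_block_common(bdev, block, size, false);
	}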

Comments

Jan Kara April 10, 2025, 2:38 p.m. UTC | #1
On Wed 09-04-25 18:49:39, Luis Chamberlain wrote:
> From: Davidlohr Bueso <dave@stgolabs.net>
> 
> Callers of __find_get_block() may or may not allow blocking
> semantics; it is currently assumed that they do not. Lay out two
> paths based on this: the i_private_lock scheme remains the fallback
> for non-blocking contexts, while blocking contexts always take the
> folio lock instead. This also implements the trylock idea suggested
> in the old comment, potentially reducing i_private_lock contention
> as well as later enabling migration support for large folios and
> noref buffer-heads.
> 
> No change in semantics: all current lookup users are non-blocking.
> 
> Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
> Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>

...

> @@ -204,7 +195,19 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
>  	if (IS_ERR(folio))
>  		goto out;
>  
> -	spin_lock(&bd_mapping->i_private_lock);
> +	/*
> +	 * Folio lock protects the buffers. Callers that cannot block
> +	 * will fallback to serializing vs try_to_free_buffers() via
> +	 * the i_private_lock.
> +	 */
> +	if (!folio_trylock(folio)) {
> +		if (atomic) {
> +			spin_lock(&bd_mapping->i_private_lock);
> +			folio_locked = false;
> +		} else
> +			folio_lock(folio);
> +	}

Ewww, this is going to be a pain. You will mostly use the folio_trylock() for
protecting the lookup, except when some insane workload / fuzzer manages to
trigger the other path, which will lead to completely unreproducible bugs...
I'd rather do:

	if (atomic) {
		spin_lock(&bd_mapping->i_private_lock);
		folio_locked = false;
	} else {
		folio_lock(folio);
	}

I'd actually love to do something like:

	if (atomic) {
		if (!folio_trylock(folio))
			bail...
	} else {
		folio_lock(folio);
	}

but that may be just too radical at this point and would need some serious
testing of how frequent the trylock failures are. No point in blocking this
series with it. So just go with the deterministic use of i_private_lock for
atomic users for now.

								Honza

> +
>  	head = folio_buffers(folio);
>  	if (!head)
>  		goto out_unlock;
> @@ -236,7 +239,10 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
>  		       1 << blkbits);
>  	}
>  out_unlock:
> -	spin_unlock(&bd_mapping->i_private_lock);
> +	if (folio_locked)
> +		folio_unlock(folio);
> +	else
> +		spin_unlock(&bd_mapping->i_private_lock);
>  	folio_put(folio);
>  out:
>  	return ret;
> @@ -1388,14 +1394,15 @@ lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
>   * it in the LRU and mark it as accessed.  If it is not present then return
>   * NULL
>   */
> -struct buffer_head *
> -__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
> +static struct buffer_head *
> +find_get_block_common(struct block_device *bdev, sector_t block,
> +			unsigned size, bool atomic)
>  {
>  	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
>  
>  	if (bh == NULL) {
>  		/* __find_get_block_slow will mark the page accessed */
> -		bh = __find_get_block_slow(bdev, block);
> +		bh = __find_get_block_slow(bdev, block, atomic);
>  		if (bh)
>  			bh_lru_install(bh);
>  	} else
> @@ -1403,6 +1410,12 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
>  
>  	return bh;
>  }
> +
> +struct buffer_head *
> +__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
> +{
> +	return find_get_block_common(bdev, block, size, true);
> +}
>  EXPORT_SYMBOL(__find_get_block);
>  
>  /**
> -- 
> 2.47.2
>
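
Applied to __find_get_block_slow(), Jan's deterministic suggestion would look
roughly like the sketch below (an illustration of the review comment, not code
from this series); the unlock side then pairs with whichever lock was taken:

	if (atomic) {
		/* Cannot sleep: serialize against try_to_free_buffers()
		 * through the block device mapping's i_private_lock. */
		spin_lock(&bd_mapping->i_private_lock);
		folio_locked = false;
	} else {
		/* May sleep: the folio lock is what actually protects
		 * the buffers attached to this folio. */
		folio_lock(folio);
	}

	head = folio_buffers(folio);
	...
out_unlock:
	if (folio_locked)
		folio_unlock(folio);
	else
		spin_unlock(&bd_mapping->i_private_lock);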
Davidlohr Bueso April 10, 2025, 5:38 p.m. UTC | #2
On Thu, 10 Apr 2025, Jan Kara wrote:

>I'd rather do:
>
>	if (atomic) {
>		spin_lock(&bd_mapping->i_private_lock);
>		folio_locked = false;
>	} else {
>		folio_lock(folio);
>	}
>

Fine with me. I just think the trylock for the atomic scenario would have
given greater chances of successful migration, but at the cost of determinism,
of course.

>I'd actually love to do something like:
>
>	if (atomic) {
>		if (!folio_trylock(folio))
>			bail...
>	} else {
>		folio_lock(folio);
>	}
>
>but that may be just too radical at this point and would need some serious
>testing of how frequent the trylock failures are. No point in blocking this
>series with it. So just go with the deterministic use of i_private_lock for
>atomic users for now.

This actually crossed my mind, but I also considered the scheme a little
too much for this series.
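
For completeness, the "bail" variant both reviewers set aside would amount to
something like the following sketch (editorial illustration only; how often
the trylock fails, and how callers cope with the extra NULL returns, is
exactly the untested part):

	if (atomic) {
		/* Never sleep and never fall back to i_private_lock: a
		 * contended folio lock simply makes the lookup miss. */
		if (!folio_trylock(folio)) {
			folio_put(folio);
			goto out;	/* ret is still NULL */
		}
	} else {
		folio_lock(folio);
	}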

Patch

diff --git a/fs/buffer.c b/fs/buffer.c
index c7abb4a029dc..5a1a37a6840a 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -176,18 +176,8 @@  void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 }
 EXPORT_SYMBOL(end_buffer_write_sync);
 
-/*
- * Various filesystems appear to want __find_get_block to be non-blocking.
- * But it's the page lock which protects the buffers.  To get around this,
- * we get exclusion from try_to_free_buffers with the blockdev mapping's
- * i_private_lock.
- *
- * Hack idea: for the blockdev mapping, i_private_lock contention
- * may be quite high.  This code could TryLock the page, and if that
- * succeeds, there is no need to take i_private_lock.
- */
 static struct buffer_head *
-__find_get_block_slow(struct block_device *bdev, sector_t block)
+__find_get_block_slow(struct block_device *bdev, sector_t block, bool atomic)
 {
 	struct address_space *bd_mapping = bdev->bd_mapping;
 	const int blkbits = bd_mapping->host->i_blkbits;
@@ -197,6 +187,7 @@  __find_get_block_slow(struct block_device *bdev, sector_t block)
 	struct buffer_head *head;
 	struct folio *folio;
 	int all_mapped = 1;
+	bool folio_locked = true;
 	static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);
 
 	index = ((loff_t)block << blkbits) / PAGE_SIZE;
@@ -204,7 +195,19 @@  __find_get_block_slow(struct block_device *bdev, sector_t block)
 	if (IS_ERR(folio))
 		goto out;
 
-	spin_lock(&bd_mapping->i_private_lock);
+	/*
+	 * Folio lock protects the buffers. Callers that cannot block
+	 * will fallback to serializing vs try_to_free_buffers() via
+	 * the i_private_lock.
+	 */
+	if (!folio_trylock(folio)) {
+		if (atomic) {
+			spin_lock(&bd_mapping->i_private_lock);
+			folio_locked = false;
+		} else
+			folio_lock(folio);
+	}
+
 	head = folio_buffers(folio);
 	if (!head)
 		goto out_unlock;
@@ -236,7 +239,10 @@  __find_get_block_slow(struct block_device *bdev, sector_t block)
 		       1 << blkbits);
 	}
 out_unlock:
-	spin_unlock(&bd_mapping->i_private_lock);
+	if (folio_locked)
+		folio_unlock(folio);
+	else
+		spin_unlock(&bd_mapping->i_private_lock);
 	folio_put(folio);
 out:
 	return ret;
@@ -1388,14 +1394,15 @@  lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
  * it in the LRU and mark it as accessed.  If it is not present then return
  * NULL
  */
-struct buffer_head *
-__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
+static struct buffer_head *
+find_get_block_common(struct block_device *bdev, sector_t block,
+			unsigned size, bool atomic)
 {
 	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
 
 	if (bh == NULL) {
 		/* __find_get_block_slow will mark the page accessed */
-		bh = __find_get_block_slow(bdev, block);
+		bh = __find_get_block_slow(bdev, block, atomic);
 		if (bh)
 			bh_lru_install(bh);
 	} else
@@ -1403,6 +1410,12 @@  __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
 
 	return bh;
 }
+
+struct buffer_head *
+__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
+{
+	return find_get_block_common(bdev, block, size, true);
+}
 EXPORT_SYMBOL(__find_get_block);
 
 /**