
[v4] fs/buffer.c: Revoke LRU when trying to drop buffers

Message ID e8f3e042b902156467a5e978b57c14954213ec59.1611642039.git.cgoldswo@codeaurora.org (mailing list archive)
State New, archived
Series [v4] fs/buffer.c: Revoke LRU when trying to drop buffers

Commit Message

Chris Goldsworthy Jan. 26, 2021, 6:58 a.m. UTC
From: Laura Abbott <lauraa@codeaurora.org>

When a buffer is added to the LRU list, a reference is taken which is
not dropped until the buffer is evicted from the LRU list. This is the
correct behavior; however, this LRU reference prevents the buffer from
being dropped, which means that the buffer can't actually be dropped
until it is selected for eviction. There's no bound on the time spent
on the LRU list, so the buffer may be undroppable for very long periods
of time. Given that migration involves dropping buffers, the associated
page is unmigratable for long periods of time as well. CMA relies on
being able to migrate a specific range of pages, so these types of
failures make CMA significantly less reliable, especially under high
filesystem usage.

Rather than waiting for the LRU algorithm to eventually kick out the
buffer, explicitly remove the buffer from the LRU list when trying to
drop it. There is still the possibility that the buffer could be added
back onto the list, but that indicates the buffer is still in use and
would probably have other 'in use' indicators that prevent it from
being dropped.

Note: a bug reported by "kernel test robot" led to a switch from
using xas_for_each() to xa_for_each().

Signed-off-by: Laura Abbott <lauraa@codeaurora.org>
Signed-off-by: Chris Goldsworthy <cgoldswo@codeaurora.org>
Cc: Matthew Wilcox <willy@infradead.org>
Reported-by: kernel test robot <oliver.sang@intel.com>
---
 fs/buffer.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 74 insertions(+), 5 deletions(-)
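
For readers unfamiliar with the two iterators mentioned in the note, the
difference is roughly the following (an illustrative sketch, not part of
the patch; the helper name is made up):

#include <linux/xarray.h>
#include <linux/rcupdate.h>
#include <linux/buffer_head.h>

static void walk_busy_bhs(struct xarray *busy_bhs)
{
	XA_STATE(xas, busy_bhs, 0);
	struct buffer_head *bh;
	unsigned long index;

	/* Form the patch now uses: iteration state and RCU are handled internally. */
	xa_for_each(busy_bhs, index, bh) {
		/* bh visits each buffer_head stored in busy_bhs, in index order */
	}

	/* The earlier xas_for_each() form needs explicit state and locking. */
	rcu_read_lock();
	xas_for_each(&xas, bh, ULONG_MAX) {
		/* same traversal, but the XA_STATE and RCU are managed by the caller */
	}
	rcu_read_unlock();
}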

Comments

Minchan Kim Jan. 26, 2021, 10:59 p.m. UTC | #1
On Mon, Jan 25, 2021 at 10:58:30PM -0800, Chris Goldsworthy wrote:
> From: Laura Abbott <lauraa@codeaurora.org>
> 
> When a buffer is added to the LRU list, a reference is taken which is
> not dropped until the buffer is evicted from the LRU list. This is the
> correct behavior; however, this LRU reference will prevent the buffer
> from being dropped. This means that the buffer can't actually be dropped
> until it is selected for eviction. There's no bound on the time spent
> on the LRU list, which means that the buffer may be undroppable for
> very long periods of time. Given that migration involves dropping
> buffers, the associated page is now unmigratable for long periods of
> time as well. CMA relies on being able to migrate a specific range
> of pages, so these types of failures make CMA significantly
> less reliable, especially under high filesystem usage.
> 
> Rather than waiting for the LRU algorithm to eventually kick out
> the buffer, explicitly remove the buffer from the LRU list when trying
> to drop it. There is still the possibility that the buffer
> could be added back on the list, but that indicates the buffer is
> still in use and would probably have other 'in use' indicators to
> prevent dropping.
> 
> Note: a bug reported by "kernel test robot" led to a switch from
> using xas_for_each() to xa_for_each().
> 
> Signed-off-by: Laura Abbott <lauraa@codeaurora.org>
> Signed-off-by: Chris Goldsworthy <cgoldswo@codeaurora.org>
> Cc: Matthew Wilcox <willy@infradead.org>
> Reported-by: kernel test robot <oliver.sang@intel.com>

Hi Chris,

Releasing the buffer_head held in the LRU is a great improvement from
the migration point of view.

A question:

Can't we invalidate (e.g., via invalidate_bh_lrus()) the bh_lru in
migrate_prep(), or elsewhere when migration finds a failure and is about
to retry?

Migration has handled other per-CPU caches this way for a long time,
which would be more consistent with those and might sometimes be faster
by reducing per-page IPI calls.


> ---
>  fs/buffer.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 74 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/buffer.c b/fs/buffer.c
> index 96c7604..27516a0 100644
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -48,6 +48,7 @@
>  #include <linux/sched/mm.h>
>  #include <trace/events/block.h>
>  #include <linux/fscrypt.h>
> +#include <linux/xarray.h>
>  
>  #include "internal.h"
>  
> @@ -1471,12 +1472,55 @@ static bool has_bh_in_lru(int cpu, void *dummy)
>  	return false;
>  }
>  
> +static void __evict_bhs_lru(void *arg)
> +{
> +	struct bh_lru *b = &get_cpu_var(bh_lrus);
> +	struct xarray *busy_bhs = arg;
> +	struct buffer_head *bh;
> +	unsigned long i, xarray_index;
> +
> +	xa_for_each(busy_bhs, xarray_index, bh) {
> +		for (i = 0; i < BH_LRU_SIZE; i++) {
> +			if (b->bhs[i] == bh) {
> +				brelse(b->bhs[i]);
> +				b->bhs[i] = NULL;
> +				break;
> +			}
> +		}
> +	}
> +
> +	put_cpu_var(bh_lrus);
> +}
> +
> +static bool page_has_bhs_in_lru(int cpu, void *arg)
> +{
> +	struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
> +	struct xarray *busy_bhs = arg;
> +	struct buffer_head *bh;
> +	unsigned long i, xarray_index;
> +
> +	xa_for_each(busy_bhs, xarray_index, bh) {
> +		for (i = 0; i < BH_LRU_SIZE; i++) {
> +			if (b->bhs[i] == bh)
> +				return true;
> +		}
> +	}
> +
> +	return false;
> +
> +}
>  void invalidate_bh_lrus(void)
>  {
>  	on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
>  }
>  EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
>  
> +static void evict_bh_lrus(struct xarray *busy_bhs)
> +{
> +	on_each_cpu_cond(page_has_bhs_in_lru, __evict_bhs_lru,
> +			 busy_bhs, 1);
> +}
> +
>  void set_bh_page(struct buffer_head *bh,
>  		struct page *page, unsigned long offset)
>  {
> @@ -3242,14 +3286,38 @@ drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
>  {
>  	struct buffer_head *head = page_buffers(page);
>  	struct buffer_head *bh;
> +	struct xarray busy_bhs;
> +	int bh_count = 0;
> +	int xa_ret, ret = 0;
> +
> +	xa_init(&busy_bhs);
>  
>  	bh = head;
>  	do {
> -		if (buffer_busy(bh))
> -			goto failed;
> +		if (buffer_busy(bh)) {
> +			xa_ret = xa_err(xa_store(&busy_bhs, bh_count++,
> +						 bh, GFP_ATOMIC));
> +			if (xa_ret)
> +				goto out;
> +		}
>  		bh = bh->b_this_page;
>  	} while (bh != head);
>  
> +	if (bh_count) {
> +		/*
> +		 * Check if the busy failure was due to an outstanding
> +		 * LRU reference
> +		 */
> +		evict_bh_lrus(&busy_bhs);
> +		do {
> +			if (buffer_busy(bh))
> +				goto out;
> +
> +			bh = bh->b_this_page;
> +		} while (bh != head);
> +	}
> +
> +	ret = 1;
>  	do {
>  		struct buffer_head *next = bh->b_this_page;
>  
> @@ -3259,9 +3327,10 @@ drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
>  	} while (bh != head);
>  	*buffers_to_free = head;
>  	detach_page_private(page);
> -	return 1;
> -failed:
> -	return 0;
> +out:
> +	xa_destroy(&busy_bhs);
> +
> +	return ret;
>  }
>  
>  int try_to_free_buffers(struct page *page)
> -- 
> The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
> a Linux Foundation Collaborative Project
> 
>
Matthew Wilcox (Oracle) Jan. 27, 2021, 2:59 a.m. UTC | #2
On Tue, Jan 26, 2021 at 02:59:17PM -0800, Minchan Kim wrote:
> The release buffer_head in LRU is great improvement for migration
> point of view.
> 
> A question: 
> 
> Can't we invalidate(e.g., invalidate_bh_lrus) bh_lru in migrate_prep or
> elsewhere when migration found the failure and is about to retry?
> 
> Migration has done such a way for other per-cpu stuffs for a long time,
> which would be more consistent with others and might be faster sometimes
> with reducing IPI calls for page.

Should lru_add_drain_all() also handle draining the buffer lru for all
callers?  A quick survey ...

invalidate_bdev() already calls invalidate_bh_lrus()
compact_nodes() would probably benefit from the BH LRU being invalidated
POSIX_FADV_DONTNEED would benefit if the underlying filesystem uses BHs
check_and_migrate_cma_pages() would benefit
khugepaged_do_scan() doesn't need it today
scan_get_next_rmap_item() looks like it only works on anon pages (?) so
	doesn't need it
mem_cgroup_force_empty() probably needs it
mem_cgroup_move_charge() ditto
memfd_wait_for_pins() doesn't need it
shake_page() might benefit
offline_pages() would benefit
alloc_contig_range() would benefit

Seems like most would benefit and a few won't care.  I think I'd lean
towards having lru_add_drain_all() call invalidate_bh_lrus(), just to
simplify things.
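
A minimal sketch of the caller-visible effect (the wrapper name is
hypothetical; the actual change would fold the call into
lru_add_drain_all() itself):

#include <linux/swap.h>		/* lru_add_drain_all() */
#include <linux/buffer_head.h>	/* invalidate_bh_lrus() */

/*
 * Illustrative only: make "drain the per-CPU LRUs" also cover the
 * per-CPU buffer_head LRUs, so the callers surveyed above get both.
 */
static void drain_all_percpu_lrus(void)
{
	lru_add_drain_all();	/* existing per-CPU pagevec drain */
	invalidate_bh_lrus();	/* additionally drop per-CPU buffer_head refs */
}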
Minchan Kim Jan. 27, 2021, 5:01 p.m. UTC | #3
On Wed, Jan 27, 2021 at 02:59:22AM +0000, Matthew Wilcox wrote:
> On Tue, Jan 26, 2021 at 02:59:17PM -0800, Minchan Kim wrote:
> > The release buffer_head in LRU is great improvement for migration
> > point of view.
> > 
> > A question: 
> > 
> > Can't we invalidate(e.g., invalidate_bh_lrus) bh_lru in migrate_prep or
> > elsewhere when migration found the failure and is about to retry?
> > 
> > Migration has done such a way for other per-cpu stuffs for a long time,
> > which would be more consistent with others and might be faster sometimes
> > with reducing IPI calls for page.
> 
> Should lru_add_drain_all() also handle draining the buffer lru for all
> callers?  A quick survey ...
> 
> invalidate_bdev() already calls invalidate_bh_lrus()
> compact_nodes() would probably benefit from the BH LRU being invalidated
> POSIX_FADV_DONTNEED would benefit if the underlying filesystem uses BHs
> check_and_migrate_cma_pages() would benefit
> khugepaged_do_scan() doesn't need it today
> scan_get_next_rmap_item() looks like it only works on anon pages (?) so
> 	doesn't need it
> mem_cgroup_force_empty() probably needs it
> mem_cgroup_move_charge() ditto
> memfd_wait_for_pins() doesn't need it
> shake_page() might benefit
> offline_pages() would benefit
> alloc_contig_range() would benefit
> 
> Seems like most would benefit and a few won't care.  I think I'd lean
> towards having lru_add_drain_all() call invalidate_bh_lrus(), just to
> simplify things.

Fair enough.
Chris Goldsworthy Jan. 28, 2021, 8:28 a.m. UTC | #4
On 2021-01-26 18:59, Matthew Wilcox wrote:
> On Tue, Jan 26, 2021 at 02:59:17PM -0800, Minchan Kim wrote:
>> The release buffer_head in LRU is great improvement for migration
>> point of view.
>> 
>> A question:

Hey guys,

>> Can't we invalidate(e.g., invalidate_bh_lrus) bh_lru in migrate_prep 
>> or
>> elsewhere when migration found the failure and is about to retry?
>> 
>> Migration has done such a way for other per-cpu stuffs for a long 
>> time,
>> which would be more consistent with others and might be faster 
>> sometimes
>> with reducing IPI calls for page.
> Should lru_add_drain_all() also handle draining the buffer lru for all
> callers?  A quick survey ...
> 
> invalidate_bdev() already calls invalidate_bh_lrus()
> compact_nodes() would probably benefit from the BH LRU being 
> invalidated
> POSIX_FADV_DONTNEED would benefit if the underlying filesystem uses BHs
> check_and_migrate_cma_pages() would benefit
> khugepaged_do_scan() doesn't need it today
> scan_get_next_rmap_item() looks like it only works on anon pages (?) so
> 	doesn't need it
> mem_cgroup_force_empty() probably needs it
> mem_cgroup_move_charge() ditto
> memfd_wait_for_pins() doesn't need it
> shake_page() might benefit
> offline_pages() would benefit
> alloc_contig_range() would benefit
> 
> Seems like most would benefit and a few won't care.  I think I'd lean
> towards having lru_add_drain_all() call invalidate_bh_lrus(), just to
> simplify things.


Doing this sounds like a good idea.  We would still need a call to
invalidate_bh_lrus() inside of drop_buffers() in the event that we find
busy buffers, since they can be re-added to the BH LRU - I believe it
isn't until we acquire the mapping's private_lock, at the point below,
that a BH can no longer be added back into the BH LRU:

https://elixir.bootlin.com/linux/v5.10.10/source/fs/buffer.c#L3240

Thanks,

Chris.
Minchan Kim Jan. 28, 2021, 5:08 p.m. UTC | #5
On Thu, Jan 28, 2021 at 12:28:37AM -0800, Chris Goldsworthy wrote:
> On 2021-01-26 18:59, Matthew Wilcox wrote:
> > On Tue, Jan 26, 2021 at 02:59:17PM -0800, Minchan Kim wrote:
> > > The release buffer_head in LRU is great improvement for migration
> > > point of view.
> > > 
> > > A question:
> 
> Hey guys,
> 
> > > Can't we invalidate(e.g., invalidate_bh_lrus) bh_lru in migrate_prep
> > > or
> > > elsewhere when migration found the failure and is about to retry?
> > > 
> > > Migration has done such a way for other per-cpu stuffs for a long
> > > time,
> > > which would be more consistent with others and might be faster
> > > sometimes
> > > with reducing IPI calls for page.
> > Should lru_add_drain_all() also handle draining the buffer lru for all
> > callers?  A quick survey ...
> > 
> > invalidate_bdev() already calls invalidate_bh_lrus()
> > compact_nodes() would probably benefit from the BH LRU being invalidated
> > POSIX_FADV_DONTNEED would benefit if the underlying filesystem uses BHs
> > check_and_migrate_cma_pages() would benefit
> > khugepaged_do_scan() doesn't need it today
> > scan_get_next_rmap_item() looks like it only works on anon pages (?) so
> > 	doesn't need it
> > mem_cgroup_force_empty() probably needs it
> > mem_cgroup_move_charge() ditto
> > memfd_wait_for_pins() doesn't need it
> > shake_page() might benefit
> > offline_pages() would benefit
> > alloc_contig_range() would benefit
> > 
> > Seems like most would benefit and a few won't care.  I think I'd lean
> > towards having lru_add_drain_all() call invalidate_bh_lrus(), just to
> > simplify things.
> 
> 
> Doing this sounds like a good idea.  We would still need a call to
> invalidate_bh_lrus() inside of drop_buffers() in the event that we find
> busy buffers, since they can be re-added back into the BH LRU - I believe
> it isn't until this point that a BH can't be added back into the BH LRU,
> when we acquire the private_lock for the mapping:
> 
> https://elixir.bootlin.com/linux/v5.10.10/source/fs/buffer.c#L3240

I am not sure it's a good deal, considering the IPI overhead per page
release in the worst case.

An idea is to disable the bh_lrus in migrate_prep() and re-enable them
when migration is done (we would need to introduce a "migrate_done").
It's a similar approach to marking a pageblock MIGRATE_ISOLATE to
disable the pcp lists during the migration.
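
A rough sketch of that idea (all names invented for illustration; not an
actual implementation): raise a global count before migration starts,
drop it when migration is done, and have bh_lru_install() bail out while
the count is non-zero.

#include <linux/atomic.h>
#include <linux/buffer_head.h>

static atomic_t bh_lru_disable_count = ATOMIC_INIT(0);

void bh_lrus_disable(void)		/* e.g. called from migrate_prep() */
{
	atomic_inc(&bh_lru_disable_count);
	invalidate_bh_lrus();		/* flush whatever is already cached */
}

void bh_lrus_enable(void)		/* e.g. called from a new migrate_done() */
{
	atomic_dec(&bh_lru_disable_count);
}

/* bh_lru_install() would check this and return early while it is set. */
static bool bh_lrus_disabled(void)
{
	return atomic_read(&bh_lru_disable_count) != 0;
}

A real implementation would also have to sort out the ordering between
raising the count and flushing the existing per-CPU entries, so that a
racing bh_lru_install() cannot slip a reference back in.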
Chris Goldsworthy Jan. 28, 2021, 6:43 p.m. UTC | #6
On 2021-01-28 09:08, Minchan Kim wrote:
> On Thu, Jan 28, 2021 at 12:28:37AM -0800, Chris Goldsworthy wrote:
>> On 2021-01-26 18:59, Matthew Wilcox wrote:
>> > On Tue, Jan 26, 2021 at 02:59:17PM -0800, Minchan Kim wrote:
>> > > The release buffer_head in LRU is great improvement for migration
>> > > point of view.
>> > >
>> > > A question:
>> 
>> Hey guys,
>> 
>> > > Can't we invalidate(e.g., invalidate_bh_lrus) bh_lru in migrate_prep
>> > > or
>> > > elsewhere when migration found the failure and is about to retry?
>> > >
>> > > Migration has done such a way for other per-cpu stuffs for a long
>> > > time,
>> > > which would be more consistent with others and might be faster
>> > > sometimes
>> > > with reducing IPI calls for page.
>> > Should lru_add_drain_all() also handle draining the buffer lru for all
>> > callers?  A quick survey ...
>> >
>> > invalidate_bdev() already calls invalidate_bh_lrus()
>> > compact_nodes() would probably benefit from the BH LRU being invalidated
>> > POSIX_FADV_DONTNEED would benefit if the underlying filesystem uses BHs
>> > check_and_migrate_cma_pages() would benefit
>> > khugepaged_do_scan() doesn't need it today
>> > scan_get_next_rmap_item() looks like it only works on anon pages (?) so
>> > 	doesn't need it
>> > mem_cgroup_force_empty() probably needs it
>> > mem_cgroup_move_charge() ditto
>> > memfd_wait_for_pins() doesn't need it
>> > shake_page() might benefit
>> > offline_pages() would benefit
>> > alloc_contig_range() would benefit
>> >
>> > Seems like most would benefit and a few won't care.  I think I'd lean
>> > towards having lru_add_drain_all() call invalidate_bh_lrus(), just to
>> > simplify things.
>> 
>> 
>> Doing this sounds like a good idea.  We would still need a call to
>> invalidate_bh_lrus() inside of drop_buffers() in the event that we 
>> find
>> busy buffers, since they can be re-added back into the BH LRU - I 
>> believe
>> it isn't until this point that a BH can't be added back into the BH 
>> LRU,
>> when we acquire the private_lock for the mapping:
>> 
>> https://elixir.bootlin.com/linux/v5.10.10/source/fs/buffer.c#L3240
> 
> I am not sure it's good deal considering IPI overhead per page release
> at worst case.
> 
> A idea is to disable bh_lrus in migrate_prep and enable it when
> migration is done(need to introduce "migrate_done".
> It's similar approach with marking pageblock MIGRATE_ISOLATE to
> disable pcp during the migration.

I'll try creating that mechanism then for the BH LRU, and will come
back with a patch that does the invalidate in lru_add_drain_all().

Thanks Matthew and Minchan for the feedback!

Patch

diff --git a/fs/buffer.c b/fs/buffer.c
index 96c7604..27516a0 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -48,6 +48,7 @@ 
 #include <linux/sched/mm.h>
 #include <trace/events/block.h>
 #include <linux/fscrypt.h>
+#include <linux/xarray.h>
 
 #include "internal.h"
 
@@ -1471,12 +1472,55 @@  static bool has_bh_in_lru(int cpu, void *dummy)
 	return false;
 }
 
+static void __evict_bhs_lru(void *arg)
+{
+	struct bh_lru *b = &get_cpu_var(bh_lrus);
+	struct xarray *busy_bhs = arg;
+	struct buffer_head *bh;
+	unsigned long i, xarray_index;
+
+	xa_for_each(busy_bhs, xarray_index, bh) {
+		for (i = 0; i < BH_LRU_SIZE; i++) {
+			if (b->bhs[i] == bh) {
+				brelse(b->bhs[i]);
+				b->bhs[i] = NULL;
+				break;
+			}
+		}
+	}
+
+	put_cpu_var(bh_lrus);
+}
+
+static bool page_has_bhs_in_lru(int cpu, void *arg)
+{
+	struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
+	struct xarray *busy_bhs = arg;
+	struct buffer_head *bh;
+	unsigned long i, xarray_index;
+
+	xa_for_each(busy_bhs, xarray_index, bh) {
+		for (i = 0; i < BH_LRU_SIZE; i++) {
+			if (b->bhs[i] == bh)
+				return true;
+		}
+	}
+
+	return false;
+
+}
 void invalidate_bh_lrus(void)
 {
 	on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
 }
 EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
 
+static void evict_bh_lrus(struct xarray *busy_bhs)
+{
+	on_each_cpu_cond(page_has_bhs_in_lru, __evict_bhs_lru,
+			 busy_bhs, 1);
+}
+
 void set_bh_page(struct buffer_head *bh,
 		struct page *page, unsigned long offset)
 {
@@ -3242,14 +3286,38 @@  drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
 {
 	struct buffer_head *head = page_buffers(page);
 	struct buffer_head *bh;
+	struct xarray busy_bhs;
+	int bh_count = 0;
+	int xa_ret, ret = 0;
+
+	xa_init(&busy_bhs);
 
 	bh = head;
 	do {
-		if (buffer_busy(bh))
-			goto failed;
+		if (buffer_busy(bh)) {
+			xa_ret = xa_err(xa_store(&busy_bhs, bh_count++,
+						 bh, GFP_ATOMIC));
+			if (xa_ret)
+				goto out;
+		}
 		bh = bh->b_this_page;
 	} while (bh != head);
 
+	if (bh_count) {
+		/*
+		 * Check if the busy failure was due to an outstanding
+		 * LRU reference
+		 */
+		evict_bh_lrus(&busy_bhs);
+		do {
+			if (buffer_busy(bh))
+				goto out;
+
+			bh = bh->b_this_page;
+		} while (bh != head);
+	}
+
+	ret = 1;
 	do {
 		struct buffer_head *next = bh->b_this_page;
 
@@ -3259,9 +3327,10 @@  drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
 	} while (bh != head);
 	*buffers_to_free = head;
 	detach_page_private(page);
-	return 1;
-failed:
-	return 0;
+out:
+	xa_destroy(&busy_bhs);
+
+	return ret;
 }
 
 int try_to_free_buffers(struct page *page)