diff mbox series

[RFC,1/1] fs: jbd2: try to launch cp transaction when bh refers to CMA

Message ID 20240820072609.570513-1-zhaoyang.huang@unisoc.com (mailing list archive)
State New
Headers show
Series [RFC,1/1] fs: jbd2: try to launch cp transaction when bh refers to CMA | expand

Commit Message

zhaoyang.huang Aug. 20, 2024, 7:26 a.m. UTC
From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>

cma_alloc() keeps failing when a bunch of IO operations happen on a
journal-enabled ext4 device. This is caused by a jh->bh->b_page that
cannot be migrated out of the CMA area while the jh has a cp_transaction
pending on it. We solve this by forcefully launching
jbd2_log_do_checkpoint at a suitable point. Since the journal is a
mechanism common to all journalling filesystems and a cp_transaction has
relatively few opportunities to be launched, this patch introduces a
timing point at which cp_transaction->t_checkpoint_list is shrunk if a
CMA page is used for journalling.

Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
---
 fs/jbd2/checkpoint.c | 27 +++++++++++++++++++++++++++
 fs/jbd2/journal.c    |  4 ++++
 include/linux/jbd2.h |  2 ++
 3 files changed, 33 insertions(+)

Comments

Jan Kara Sept. 11, 2024, 10:50 a.m. UTC | #1
On Tue 20-08-24 15:26:09, zhaoyang.huang wrote:
> From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> 
> cma_alloc() keep failed when an bunch of IO operations happened on an
> journal enabled ext4 device which is caused by a jh->bh->b_page
> can not be migrated out of CMA area as the jh has one cp_transaction
> pending on it. We solve this by launching jbd2_log_do_checkpoint forcefully
> somewhere. Since journal is common mechanism to all JFSs and
> cp_transaction has a little fewer opportunity to be launched, this patch
> would like to introduce a timing point at which the
> cp_transaction->t_checkpoint_list is shrunk if CMA page used for
> journalling.
> 
> Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>

Hum, I see your problem but this solution feels like a hack. How are dirty
metadata buffers in CMA region different from any other dirty page cache
folio? ... checking migration code ... Oh, right, normal dirty page cache
folio will get migrated while bdev mappings use
buffer_migrate_folio_norefs() so we cannot migrate as long as jh is
attached to the folio. OK, I'd think that providing proper page migration
function that can migrate buffers on checkpoint list would be a cleaner
(and more efficient) way to go. buffer_migrate_folio() is safe for jbd2
buffers that are only part of checkpoint list. The trouble is that we need
to check that the jh is not part of running or committing transaction under
the buffer lock so for that we'd need to hook into __buffer_migrate_folio()
and I'm not yet clear on a way how to cleanly do that...

								Honza

> ---
>  fs/jbd2/checkpoint.c | 27 +++++++++++++++++++++++++++
>  fs/jbd2/journal.c    |  4 ++++
>  include/linux/jbd2.h |  2 ++
>  3 files changed, 33 insertions(+)
> 
> diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
> index 951f78634adf..8c6c1dba1f0f 100644
> --- a/fs/jbd2/checkpoint.c
> +++ b/fs/jbd2/checkpoint.c
> @@ -21,6 +21,7 @@
>  #include <linux/slab.h>
>  #include <linux/blkdev.h>
>  #include <trace/events/jbd2.h>
> +#include <linux/mm.h>
>  
>  /*
>   * Unlink a buffer from a transaction checkpoint list.
> @@ -137,6 +138,32 @@ __flush_batch(journal_t *journal, int *batch_count)
>  	*batch_count = 0;
>  }
>  
> +#ifdef CONFIG_CMA
> +void drain_cma_bh(journal_t *journal)
> +{
> +	struct journal_head	*jh;
> +	struct buffer_head	*bh;
> +
> +	if (!journal->j_checkpoint_transactions)
> +		return;
> +
> +	jh = journal->j_checkpoint_transactions->t_checkpoint_list;
> +	while (jh) {
> +		bh = jh2bh(jh);
> +
> +		if (bh && get_pageblock_migratetype(bh->b_page) == MIGRATE_CMA) {
> +			mutex_lock_io(&journal->j_checkpoint_mutex);
> +			jbd2_log_do_checkpoint(journal);
> +			mutex_unlock(&journal->j_checkpoint_mutex);
> +			return;
> +		}
> +
> +		jh = jh->b_cpnext;
> +	}
> +}
> +#else
> +void drain_cma_bh(journal_t *journal) {}
> +#endif
>  /*
>   * Perform an actual checkpoint. We take the first transaction on the
>   * list of transactions to be checkpointed and send all its buffers
> diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
> index 1ebf2393bfb7..dd92cb7404fc 100644
> --- a/fs/jbd2/journal.c
> +++ b/fs/jbd2/journal.c
> @@ -41,6 +41,7 @@
>  #include <linux/bitops.h>
>  #include <linux/ratelimit.h>
>  #include <linux/sched/mm.h>
> +#include <linux/swap.h>
>  
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/jbd2.h>
> @@ -1273,6 +1274,9 @@ static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink,
>  	count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count);
>  	trace_jbd2_shrink_scan_enter(journal, sc->nr_to_scan, count);
>  
> +	if (current_is_kswapd())
> +		drain_cma_bh(journal);
> +
>  	nr_shrunk = jbd2_journal_shrink_checkpoint_list(journal, &nr_to_scan);
>  
>  	count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count);
> diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
> index 5157d92b6f23..fc152382a6ae 100644
> --- a/include/linux/jbd2.h
> +++ b/include/linux/jbd2.h
> @@ -105,6 +105,8 @@ typedef struct jbd2_journal_handle handle_t;	/* Atomic operation type */
>  typedef struct journal_s	journal_t;	/* Journal control structure */
>  #endif
>  
> +void drain_cma_bh(journal_t *journal);
> +
>  /*
>   * Internal structures used by the logging mechanism:
>   */
> -- 
> 2.25.1
>
diff mbox series

Patch

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 951f78634adf..8c6c1dba1f0f 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -21,6 +21,7 @@ 
 #include <linux/slab.h>
 #include <linux/blkdev.h>
 #include <trace/events/jbd2.h>
+#include <linux/mm.h>
 
 /*
  * Unlink a buffer from a transaction checkpoint list.
@@ -137,6 +138,32 @@  __flush_batch(journal_t *journal, int *batch_count)
 	*batch_count = 0;
 }
 
+#ifdef CONFIG_CMA
+void drain_cma_bh(journal_t *journal)
+{
+	struct journal_head	*jh;
+	struct buffer_head	*bh;
+
+	if (!journal->j_checkpoint_transactions)
+		return;
+
+	jh = journal->j_checkpoint_transactions->t_checkpoint_list;
+	while (jh) {
+		bh = jh2bh(jh);
+
+		if (bh && get_pageblock_migratetype(bh->b_page) == MIGRATE_CMA) {
+			mutex_lock_io(&journal->j_checkpoint_mutex);
+			jbd2_log_do_checkpoint(journal);
+			mutex_unlock(&journal->j_checkpoint_mutex);
+			return;
+		}
+
+		jh = jh->b_cpnext;
+	}
+}
+#else
+void drain_cma_bh(journal_t *journal) {}
+#endif
 /*
  * Perform an actual checkpoint. We take the first transaction on the
  * list of transactions to be checkpointed and send all its buffers
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 1ebf2393bfb7..dd92cb7404fc 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -41,6 +41,7 @@ 
 #include <linux/bitops.h>
 #include <linux/ratelimit.h>
 #include <linux/sched/mm.h>
+#include <linux/swap.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/jbd2.h>
@@ -1273,6 +1274,9 @@  static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink,
 	count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count);
 	trace_jbd2_shrink_scan_enter(journal, sc->nr_to_scan, count);
 
+	if (current_is_kswapd())
+		drain_cma_bh(journal);
+
 	nr_shrunk = jbd2_journal_shrink_checkpoint_list(journal, &nr_to_scan);
 
 	count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 5157d92b6f23..fc152382a6ae 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -105,6 +105,8 @@  typedef struct jbd2_journal_handle handle_t;	/* Atomic operation type */
 typedef struct journal_s	journal_t;	/* Journal control structure */
 #endif
 
+void drain_cma_bh(journal_t *journal);
+
 /*
  * Internal structures used by the logging mechanism:
  */