diff mbox series

xfs: use current->journal_info for detecting transaction recursion

Message ID 20210222233107.3233795-1-david@fromorbit.com (mailing list archive)
State Superseded
Headers show
Series xfs: use current->journal_info for detecting transaction recursion | expand

Commit Message

Dave Chinner Feb. 22, 2021, 11:31 p.m. UTC
From: Dave Chinner <dchinner@redhat.com>

Because the iomap code using PF_MEMALLOC_NOFS to detect transaction
recursion in XFS is just wrong. Remove it from the iomap code and
replace it with XFS specific internal checks using
current->journal_info instead.

Fixes: 9070733b4efa ("xfs: abstract PF_FSTRANS to PF_MEMALLOC_NOFS")
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/iomap/buffered-io.c    |  7 -------
 fs/xfs/libxfs/xfs_btree.c |  4 +++-
 fs/xfs/xfs_aops.c         | 17 +++++++++++++++--
 fs/xfs/xfs_trans.c        | 20 +++++---------------
 fs/xfs/xfs_trans.h        | 30 ++++++++++++++++++++++++++++++
 5 files changed, 53 insertions(+), 25 deletions(-)

Comments

Darrick J. Wong Feb. 23, 2021, 2:15 a.m. UTC | #1
On Tue, Feb 23, 2021 at 10:31:07AM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Because the iomap code using PF_MEMALLOC_NOFS to detect transaction
> recursion in XFS is just wrong. Remove it from the iomap code and
> replace it with XFS specific internal checks using
> current->journal_info instead.

It might be worth mentioning that this changes the PF_MEMALLOC_NOFS
behavior very slightly -- it's now bound to the allocation and freeing
of the transaction, instead of the strange way we used to do this, where
we'd set it at reservation time but we don't /clear/ it at unreserve time.

This doesn't strictly look like a fix patch, but as it is a Dumb
Developer Detector(tm) I could try to push it for 5.12 ... or just make
it one of the first 5.13 patches.  Any preference?

--D

> Fixes: 9070733b4efa ("xfs: abstract PF_FSTRANS to PF_MEMALLOC_NOFS")
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/iomap/buffered-io.c    |  7 -------
>  fs/xfs/libxfs/xfs_btree.c |  4 +++-
>  fs/xfs/xfs_aops.c         | 17 +++++++++++++++--
>  fs/xfs/xfs_trans.c        | 20 +++++---------------
>  fs/xfs/xfs_trans.h        | 30 ++++++++++++++++++++++++++++++
>  5 files changed, 53 insertions(+), 25 deletions(-)
> 
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 16a1e82e3aeb..fcd4a0d71fc1 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -1458,13 +1458,6 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
>  			PF_MEMALLOC))
>  		goto redirty;
>  
> -	/*
> -	 * Given that we do not allow direct reclaim to call us, we should
> -	 * never be called in a recursive filesystem reclaim context.
> -	 */
> -	if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
> -		goto redirty;
> -
>  	/*
>  	 * Is this page beyond the end of the file?
>  	 *
> diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
> index b56ff451adce..e6a15b920034 100644
> --- a/fs/xfs/libxfs/xfs_btree.c
> +++ b/fs/xfs/libxfs/xfs_btree.c
> @@ -2805,7 +2805,7 @@ xfs_btree_split_worker(
>  	struct xfs_btree_split_args	*args = container_of(work,
>  						struct xfs_btree_split_args, work);
>  	unsigned long		pflags;
> -	unsigned long		new_pflags = PF_MEMALLOC_NOFS;
> +	unsigned long		new_pflags = 0;
>  
>  	/*
>  	 * we are in a transaction context here, but may also be doing work
> @@ -2817,11 +2817,13 @@ xfs_btree_split_worker(
>  		new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
>  
>  	current_set_flags_nested(&pflags, new_pflags);
> +	xfs_trans_set_context(args->cur->bc_tp);
>  
>  	args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
>  					 args->key, args->curp, args->stat);
>  	complete(args->done);
>  
> +	xfs_trans_clear_context(args->cur->bc_tp);
>  	current_restore_flags_nested(&pflags, new_pflags);
>  }
>  
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index 4304c6416fbb..b4186d666157 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -62,7 +62,7 @@ xfs_setfilesize_trans_alloc(
>  	 * We hand off the transaction to the completion thread now, so
>  	 * clear the flag here.
>  	 */
> -	current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
> +	xfs_trans_clear_context(tp);
>  	return 0;
>  }
>  
> @@ -125,7 +125,7 @@ xfs_setfilesize_ioend(
>  	 * thus we need to mark ourselves as being in a transaction manually.
>  	 * Similarly for freeze protection.
>  	 */
> -	current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
> +	xfs_trans_set_context(tp);
>  	__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
>  
>  	/* we abort the update if there was an IO error */
> @@ -568,6 +568,12 @@ xfs_vm_writepage(
>  {
>  	struct xfs_writepage_ctx wpc = { };
>  
> +	if (WARN_ON_ONCE(current->journal_info)) {
> +		redirty_page_for_writepage(wbc, page);
> +		unlock_page(page);
> +		return 0;
> +	}
> +
>  	return iomap_writepage(page, wbc, &wpc.ctx, &xfs_writeback_ops);
>  }
>  
> @@ -578,6 +584,13 @@ xfs_vm_writepages(
>  {
>  	struct xfs_writepage_ctx wpc = { };
>  
> +	/*
> +	 * Writing back data in a transaction context can result in recursive
> +	 * transactions. This is bad, so issue a warning and get out of here.
> +	 */
> +	if (WARN_ON_ONCE(current->journal_info))
> +		return 0;
> +
>  	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
>  	return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
>  }
> diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
> index 44f72c09c203..e2a922f061c7 100644
> --- a/fs/xfs/xfs_trans.c
> +++ b/fs/xfs/xfs_trans.c
> @@ -72,6 +72,7 @@ xfs_trans_free(
>  	xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);
>  
>  	trace_xfs_trans_free(tp, _RET_IP_);
> +	xfs_trans_clear_context(tp);
>  	if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
>  		sb_end_intwrite(tp->t_mountp->m_super);
>  	xfs_trans_free_dqinfo(tp);
> @@ -123,7 +124,8 @@ xfs_trans_dup(
>  
>  	ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
>  	tp->t_rtx_res = tp->t_rtx_res_used;
> -	ntp->t_pflags = tp->t_pflags;
> +
> +	xfs_trans_switch_context(tp, ntp);
>  
>  	/* move deferred ops over to the new tp */
>  	xfs_defer_move(ntp, tp);
> @@ -157,9 +159,6 @@ xfs_trans_reserve(
>  	int			error = 0;
>  	bool			rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
>  
> -	/* Mark this thread as being in a transaction */
> -	current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
> -
>  	/*
>  	 * Attempt to reserve the needed disk blocks by decrementing
>  	 * the number needed from the number available.  This will
> @@ -167,10 +166,8 @@ xfs_trans_reserve(
>  	 */
>  	if (blocks > 0) {
>  		error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd);
> -		if (error != 0) {
> -			current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
> +		if (error != 0)
>  			return -ENOSPC;
> -		}
>  		tp->t_blk_res += blocks;
>  	}
>  
> @@ -244,9 +241,6 @@ xfs_trans_reserve(
>  		xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd);
>  		tp->t_blk_res = 0;
>  	}
> -
> -	current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
> -
>  	return error;
>  }
>  
> @@ -270,6 +264,7 @@ xfs_trans_alloc(
>  	tp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL);
>  	if (!(flags & XFS_TRANS_NO_WRITECOUNT))
>  		sb_start_intwrite(mp->m_super);
> +	xfs_trans_set_context(tp);
>  
>  	/*
>  	 * Zero-reservation ("empty") transactions can't modify anything, so
> @@ -893,7 +888,6 @@ __xfs_trans_commit(
>  
>  	xfs_log_commit_cil(mp, tp, &commit_lsn, regrant);
>  
> -	current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
>  	xfs_trans_free(tp);
>  
>  	/*
> @@ -925,7 +919,6 @@ __xfs_trans_commit(
>  			xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
>  		tp->t_ticket = NULL;
>  	}
> -	current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
>  	xfs_trans_free_items(tp, !!error);
>  	xfs_trans_free(tp);
>  
> @@ -985,9 +978,6 @@ xfs_trans_cancel(
>  		tp->t_ticket = NULL;
>  	}
>  
> -	/* mark this thread as no longer being in a transaction */
> -	current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
> -
>  	xfs_trans_free_items(tp, dirty);
>  	xfs_trans_free(tp);
>  }
> diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
> index 8b03fbfe9a1b..9dd745cf77c9 100644
> --- a/fs/xfs/xfs_trans.h
> +++ b/fs/xfs/xfs_trans.h
> @@ -281,4 +281,34 @@ int xfs_trans_alloc_ichange(struct xfs_inode *ip, struct xfs_dquot *udqp,
>  		struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, bool force,
>  		struct xfs_trans **tpp);
>  
> +static inline void
> +xfs_trans_set_context(
> +	struct xfs_trans	*tp)
> +{
> +	ASSERT(current->journal_info == NULL);
> +	tp->t_pflags = memalloc_nofs_save();
> +	current->journal_info = tp;
> +}
> +
> +static inline void
> +xfs_trans_clear_context(
> +	struct xfs_trans	*tp)
> +{
> +	if (current->journal_info == tp) {
> +		memalloc_nofs_restore(tp->t_pflags);
> +		current->journal_info = NULL;
> +	}
> +}
> +
> +static inline void
> +xfs_trans_switch_context(
> +	struct xfs_trans	*old_tp,
> +	struct xfs_trans	*new_tp)
> +{
> +	ASSERT(current->journal_info == old_tp);
> +	new_tp->t_pflags = old_tp->t_pflags;
> +	old_tp->t_pflags = 0;
> +	current->journal_info = new_tp;
> +}
> +
>  #endif	/* __XFS_TRANS_H__ */
> -- 
> 2.28.0
>
Dave Chinner Feb. 23, 2021, 3:28 a.m. UTC | #2
On Mon, Feb 22, 2021 at 06:15:57PM -0800, Darrick J. Wong wrote:
> On Tue, Feb 23, 2021 at 10:31:07AM +1100, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > Because the iomap code using PF_MEMALLOC_NOFS to detect transaction
> > recursion in XFS is just wrong. Remove it from the iomap code and
> > replace it with XFS specific internal checks using
> > current->journal_info instead.
> 
> It might be worth mentioning that this changes the PF_MEMALLOC_NOFS
> behavior very slightly -- it's now bound to the allocation and freeing
> of the transaction, instead of the strange way we used to do this, where
> we'd set it at reservation time but we don't /clear/ it at unreserve time.

They are effectively the same thing, so I think you are splitting
hairs here. The rule is "transaction context is NOFS" so whether it
is set when the transaction context is entered or a few instructions
later when we start the reservation is not significant.

> This doesn't strictly look like a fix patch, but as it is a Dumb
> Developer Detector(tm) I could try to push it for 5.12 ... or just make
> it one of the first 5.13 patches.  Any preference?

Nope. You're going to need to fix the transaction nesting the new gc
code does before applying this, though, because that is detected as
transaction recursion by this patch....

Cheers,

Dave.
Darrick J. Wong Feb. 23, 2021, 4:51 a.m. UTC | #3
On Tue, Feb 23, 2021 at 02:28:37PM +1100, Dave Chinner wrote:
> On Mon, Feb 22, 2021 at 06:15:57PM -0800, Darrick J. Wong wrote:
> > On Tue, Feb 23, 2021 at 10:31:07AM +1100, Dave Chinner wrote:
> > > From: Dave Chinner <dchinner@redhat.com>
> > > 
> > > Because the iomap code using PF_MEMALLOC_NOFS to detect transaction
> > > recursion in XFS is just wrong. Remove it from the iomap code and
> > > replace it with XFS specific internal checks using
> > > current->journal_info instead.
> > 
> > It might be worth mentioning that this changes the PF_MEMALLOC_NOFS
> > behavior very slightly -- it's now bound to the allocation and freeing
> > of the transaction, instead of the strange way we used to do this, where
> > we'd set it at reservation time but we don't /clear/ it at unreserve time.
> 
> They are effectively the same thing, so I think you are splitting
> hairs here. The rule is "transaction context is NOFS" so whether it
> is set when the transaction context is entered or a few instructions
> later when we start the reservation is not significant.
> 
> > This doesn't strictly look like a fix patch, but as it is a Dumb
> > Developer Detector(tm) I could try to push it for 5.12 ... or just make
> > it one of the first 5.13 patches.  Any preference?
> 
> Nope. You're going to need to fix the transaction nesting the new gc
> code does before applying this, though, because that is detected as
> transaction recursion by this patch....

Well yes, I was trying to see if I could throw in the fix patch and the
idiot detector, both at the same time... :)

That said, it crashes in xfs/229:

  2822            args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
  2823                                             args->key, args->curp, args->stat);
  2824            complete(args->done);
  2825
> 2826            xfs_trans_clear_context(args->cur->bc_tp);
  2827            current_restore_flags_nested(&pflags, new_pflags);

It's possible for the original wait_for_completion() in
xfs_btree_split() to wake up immediately after complete() drops the
lock.  If it returns (and blows away the stack variable @args) before
the worker resumes, then the worker will be dereferencing freed stack
memory and blows up:

 BUG: kernel NULL pointer dereference, address: 0000000000000000
 #PF: supervisor read access in kernel mode
 #PF: error_code(0x0000) - not-present page
 PGD 0 P4D 0 
 Oops: 0000 [#1] PREEMPT SMP
 CPU: 0 PID: 375393 Comm: kworker/0:16 Tainted: G        W         5.11.0-rc4-djw #rc4
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-1ubuntu1.1 04/01/2014
 Workqueue: xfsalloc xfs_btree_split_worker [xfs]
 RIP: 0010:xfs_btree_split_worker+0xaa/0x120 [xfs]
 Code: 4b d8 48 8b 53 d0 8b 73 c8 48 8b 7b c0 4c 8b 4b e8 4c 8b 43 e0 e8 46 f5 ff ff 48 8b 7b f8 89 43 f0 e8 0a 84 aa e0 48 8b 43 c0 <48> 8b 00 49 3b 84 24 90 08 00 00 74 20 65 48 8b 14 25 80 6d 01 00
 RSP: 0018:ffffc90003123e60 EFLAGS: 00010246
 RAX: 0000000000000000 RBX: ffffc9000311b8a8 RCX: 0000000000000000
 RDX: 0000000000000001 RSI: 0000000000000286 RDI: 00000000ffffffff
 RBP: 0000000000000000 R08: ffffc9000311b858 R09: ffff88803ec29f80
 R10: ffff8880054a80d0 R11: ffff88803ec2a030 R12: ffff888005235d00
 R13: 0000000004248060 R14: ffff88800cbe5f00 R15: 0000000000000000
 FS:  0000000000000000(0000) GS:ffff88803ec00000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000000000000000 CR3: 000000000200c002 CR4: 00000000001706b0
 Call Trace:
  process_one_work+0x1dd/0x3b0
  worker_thread+0x57/0x3c0
  ? rescuer_thread+0x3b0/0x3b0
  kthread+0x14f/0x170
  ? __kthread_bind_mask+0x60/0x60
  ret_from_fork+0x1f/0x30
 Modules linked in: xfs dm_thin_pool dm_persistent_data dm_bio_prison dm_snapshot dm_bufio dm_flakey libcrc32c ip6t_REJECT nf_reject_ipv6 ipt_REJECT nf_reject_ipv4 ip_set_hash_ip xt_REDIRECT xt_set xt_tcpudp ip_set_hash_net ip_set_hash_mac ip_set nfnetlink iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip6table_filter ip6_tables iptable_filter bfq sch_fq_codel ip_tables x_tables overlay nfsv4 af_packet [last unloaded: xfs]
 Dumping ftrace buffer:
    (ftrace buffer empty)
 CR2: 0000000000000000
 ---[ end trace 1a6aae037bf68618 ]---

--D

> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
Dave Chinner Feb. 23, 2021, 5:53 a.m. UTC | #4
On Mon, Feb 22, 2021 at 08:51:05PM -0800, Darrick J. Wong wrote:
> On Tue, Feb 23, 2021 at 02:28:37PM +1100, Dave Chinner wrote:
> > On Mon, Feb 22, 2021 at 06:15:57PM -0800, Darrick J. Wong wrote:
> > > On Tue, Feb 23, 2021 at 10:31:07AM +1100, Dave Chinner wrote:
> > > > From: Dave Chinner <dchinner@redhat.com>
> > > > 
> > > > Because the iomap code using PF_MEMALLOC_NOFS to detect transaction
> > > > recursion in XFS is just wrong. Remove it from the iomap code and
> > > > replace it with XFS specific internal checks using
> > > > current->journal_info instead.
> > > 
> > > It might be worth mentioning that this changes the PF_MEMALLOC_NOFS
> > > behavior very slightly -- it's now bound to the allocation and freeing
> > > of the transaction, instead of the strange way we used to do this, where
> > > we'd set it at reservation time but we don't /clear/ it at unreserve time.
> > 
> > They are effectively the same thing, so I think you are splitting
> > hairs here. The rule is "transaction context is NOFS" so whether it
> > is set when the transaction context is entered or a few instructions
> > later when we start the reservation is not significant.
> > 
> > > This doesn't strictly look like a fix patch, but as it is a Dumb
> > > Developer Detector(tm) I could try to push it for 5.12 ... or just make
> > > it one of the first 5.13 patches.  Any preference?
> > 
> > Nope. You're going to need to fix the transaction nesting the new gc
> > code does before applying this, though, because that is detected as
> > transaction recursion by this patch....
> 
> Well yes, I was trying to see if I could throw in the fix patch and the
> idiot detector, both at the same time... :)
> 
> That said, it crashes in xfs/229:
> 
>   2822            args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
>   2823                                             args->key, args->curp, args->stat);
>   2824            complete(args->done);
>   2825
> > 2826            xfs_trans_clear_context(args->cur->bc_tp);
>   2827            current_restore_flags_nested(&pflags, new_pflags);
> 
> It's possible for the original wait_for_completion() in
> xfs_btree_split() to wake up immediately after complete() drops the
> lock.  If it returns (and blows away the stack variable @args) before
> the worker resumes, then the worker will be dereferencing freed stack
> memory and blows up:

Argh. So I left an undocumented landmine in that code that I then
stepped on myself years later. Easy to fix, just clear the context
before calling done. I'll go re-attach my leg, then update it and
drop a comment in there, too.

Cheers,

Dave.
diff mbox series

Patch

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 16a1e82e3aeb..fcd4a0d71fc1 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1458,13 +1458,6 @@  iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
 			PF_MEMALLOC))
 		goto redirty;
 
-	/*
-	 * Given that we do not allow direct reclaim to call us, we should
-	 * never be called in a recursive filesystem reclaim context.
-	 */
-	if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
-		goto redirty;
-
 	/*
 	 * Is this page beyond the end of the file?
 	 *
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index b56ff451adce..e6a15b920034 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -2805,7 +2805,7 @@  xfs_btree_split_worker(
 	struct xfs_btree_split_args	*args = container_of(work,
 						struct xfs_btree_split_args, work);
 	unsigned long		pflags;
-	unsigned long		new_pflags = PF_MEMALLOC_NOFS;
+	unsigned long		new_pflags = 0;
 
 	/*
 	 * we are in a transaction context here, but may also be doing work
@@ -2817,11 +2817,13 @@  xfs_btree_split_worker(
 		new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
 
 	current_set_flags_nested(&pflags, new_pflags);
+	xfs_trans_set_context(args->cur->bc_tp);
 
 	args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
 					 args->key, args->curp, args->stat);
 	complete(args->done);
 
+	xfs_trans_clear_context(args->cur->bc_tp);
 	current_restore_flags_nested(&pflags, new_pflags);
 }
 
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 4304c6416fbb..b4186d666157 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -62,7 +62,7 @@  xfs_setfilesize_trans_alloc(
 	 * We hand off the transaction to the completion thread now, so
 	 * clear the flag here.
 	 */
-	current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
+	xfs_trans_clear_context(tp);
 	return 0;
 }
 
@@ -125,7 +125,7 @@  xfs_setfilesize_ioend(
 	 * thus we need to mark ourselves as being in a transaction manually.
 	 * Similarly for freeze protection.
 	 */
-	current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
+	xfs_trans_set_context(tp);
 	__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
 
 	/* we abort the update if there was an IO error */
@@ -568,6 +568,12 @@  xfs_vm_writepage(
 {
 	struct xfs_writepage_ctx wpc = { };
 
+	if (WARN_ON_ONCE(current->journal_info)) {
+		redirty_page_for_writepage(wbc, page);
+		unlock_page(page);
+		return 0;
+	}
+
 	return iomap_writepage(page, wbc, &wpc.ctx, &xfs_writeback_ops);
 }
 
@@ -578,6 +584,13 @@  xfs_vm_writepages(
 {
 	struct xfs_writepage_ctx wpc = { };
 
+	/*
+	 * Writing back data in a transaction context can result in recursive
+	 * transactions. This is bad, so issue a warning and get out of here.
+	 */
+	if (WARN_ON_ONCE(current->journal_info))
+		return 0;
+
 	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
 	return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
 }
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 44f72c09c203..e2a922f061c7 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -72,6 +72,7 @@  xfs_trans_free(
 	xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);
 
 	trace_xfs_trans_free(tp, _RET_IP_);
+	xfs_trans_clear_context(tp);
 	if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
 		sb_end_intwrite(tp->t_mountp->m_super);
 	xfs_trans_free_dqinfo(tp);
@@ -123,7 +124,8 @@  xfs_trans_dup(
 
 	ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
 	tp->t_rtx_res = tp->t_rtx_res_used;
-	ntp->t_pflags = tp->t_pflags;
+
+	xfs_trans_switch_context(tp, ntp);
 
 	/* move deferred ops over to the new tp */
 	xfs_defer_move(ntp, tp);
@@ -157,9 +159,6 @@  xfs_trans_reserve(
 	int			error = 0;
 	bool			rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
 
-	/* Mark this thread as being in a transaction */
-	current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
-
 	/*
 	 * Attempt to reserve the needed disk blocks by decrementing
 	 * the number needed from the number available.  This will
@@ -167,10 +166,8 @@  xfs_trans_reserve(
 	 */
 	if (blocks > 0) {
 		error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd);
-		if (error != 0) {
-			current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
+		if (error != 0)
 			return -ENOSPC;
-		}
 		tp->t_blk_res += blocks;
 	}
 
@@ -244,9 +241,6 @@  xfs_trans_reserve(
 		xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd);
 		tp->t_blk_res = 0;
 	}
-
-	current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
-
 	return error;
 }
 
@@ -270,6 +264,7 @@  xfs_trans_alloc(
 	tp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL);
 	if (!(flags & XFS_TRANS_NO_WRITECOUNT))
 		sb_start_intwrite(mp->m_super);
+	xfs_trans_set_context(tp);
 
 	/*
 	 * Zero-reservation ("empty") transactions can't modify anything, so
@@ -893,7 +888,6 @@  __xfs_trans_commit(
 
 	xfs_log_commit_cil(mp, tp, &commit_lsn, regrant);
 
-	current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
 	xfs_trans_free(tp);
 
 	/*
@@ -925,7 +919,6 @@  __xfs_trans_commit(
 			xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
 		tp->t_ticket = NULL;
 	}
-	current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
 	xfs_trans_free_items(tp, !!error);
 	xfs_trans_free(tp);
 
@@ -985,9 +978,6 @@  xfs_trans_cancel(
 		tp->t_ticket = NULL;
 	}
 
-	/* mark this thread as no longer being in a transaction */
-	current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
-
 	xfs_trans_free_items(tp, dirty);
 	xfs_trans_free(tp);
 }
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 8b03fbfe9a1b..9dd745cf77c9 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -281,4 +281,34 @@  int xfs_trans_alloc_ichange(struct xfs_inode *ip, struct xfs_dquot *udqp,
 		struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, bool force,
 		struct xfs_trans **tpp);
 
+static inline void
+xfs_trans_set_context(
+	struct xfs_trans	*tp)
+{
+	ASSERT(current->journal_info == NULL);
+	tp->t_pflags = memalloc_nofs_save();
+	current->journal_info = tp;
+}
+
+static inline void
+xfs_trans_clear_context(
+	struct xfs_trans	*tp)
+{
+	if (current->journal_info == tp) {
+		memalloc_nofs_restore(tp->t_pflags);
+		current->journal_info = NULL;
+	}
+}
+
+static inline void
+xfs_trans_switch_context(
+	struct xfs_trans	*old_tp,
+	struct xfs_trans	*new_tp)
+{
+	ASSERT(current->journal_info == old_tp);
+	new_tp->t_pflags = old_tp->t_pflags;
+	old_tp->t_pflags = 0;
+	current->journal_info = new_tp;
+}
+
 #endif	/* __XFS_TRANS_H__ */