diff mbox series

[06/13] xfs: remove XFS_TRANS_NOFS

Message ID 20190627104836.25446-7-hch@lst.de (mailing list archive)
State Accepted
Headers show
Series [01/13] list.h: add list_pop and list_pop_entry helpers | expand

Commit Message

Christoph Hellwig June 27, 2019, 10:48 a.m. UTC
Instead of a magic flag for xfs_trans_alloc, just ensure all callers
that can't relclaim through the file system use memalloc_nofs_save to
set the per-task nofs flag.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_shared.h |  1 -
 fs/xfs/xfs_aops.c          | 35 ++++++++++++++++++++++-------------
 fs/xfs/xfs_file.c          | 12 +++++++++---
 fs/xfs/xfs_iomap.c         |  2 +-
 fs/xfs/xfs_reflink.c       |  4 ++--
 fs/xfs/xfs_trans.c         |  4 +---
 6 files changed, 35 insertions(+), 23 deletions(-)

Comments

Darrick J. Wong June 27, 2019, 10:30 p.m. UTC | #1
On Thu, Jun 27, 2019 at 12:48:29PM +0200, Christoph Hellwig wrote:
> Instead of a magic flag for xfs_trans_alloc, just ensure all callers
> that can't relclaim through the file system use memalloc_nofs_save to
> set the per-task nofs flag.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/libxfs/xfs_shared.h |  1 -
>  fs/xfs/xfs_aops.c          | 35 ++++++++++++++++++++++-------------
>  fs/xfs/xfs_file.c          | 12 +++++++++---
>  fs/xfs/xfs_iomap.c         |  2 +-
>  fs/xfs/xfs_reflink.c       |  4 ++--
>  fs/xfs/xfs_trans.c         |  4 +---
>  6 files changed, 35 insertions(+), 23 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
> index b9094709bc79..c45acbd3add9 100644
> --- a/fs/xfs/libxfs/xfs_shared.h
> +++ b/fs/xfs/libxfs/xfs_shared.h
> @@ -65,7 +65,6 @@ void	xfs_log_get_max_trans_res(struct xfs_mount *mp,
>  #define XFS_TRANS_DQ_DIRTY	0x10	/* at least one dquot in trx dirty */
>  #define XFS_TRANS_RESERVE	0x20    /* OK to use reserved data blocks */
>  #define XFS_TRANS_NO_WRITECOUNT 0x40	/* do not elevate SB writecount */
> -#define XFS_TRANS_NOFS		0x80	/* pass KM_NOFS to kmem_alloc */
>  /*
>   * LOWMODE is used by the allocator to activate the lowspace algorithm - when
>   * free space is running low the extent allocator may choose to allocate an
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index 243548b9d0cc..8b3070a40245 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -138,8 +138,7 @@ xfs_setfilesize_trans_alloc(
>  	struct xfs_trans	*tp;
>  	int			error;
>  
> -	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0,
> -				XFS_TRANS_NOFS, &tp);
> +	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
>  	if (error)
>  		return error;
>  
> @@ -240,8 +239,16 @@ xfs_end_ioend(
>  	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
>  	xfs_off_t		offset = ioend->io_offset;
>  	size_t			size = ioend->io_size;
> +	unsigned int		nofs_flag;
>  	int			error;
>  
> +	/*
> +	 * We can do memory allocation here, but aren't in transactional
> +	 * context.  To avoid memory allocation deadlocks set the task-wide
> +	 * nofs context for the following operations.

I think the wording of this is too indirect.  The reason we need to set
NOFS is because we could be doing writeback as part of reclaiming
memory, which means that we cannot recurse back into filesystems to
satisfy the memory allocation needed to create a transaction.  The NOFS
part applies to any memory allocation, of course.

If you're fine with the wording below I'll just edit that into the
patch:

	/*
	 * We can allocate memory here while doing writeback on behalf of
	 * memory reclaim.  To avoid memory allocation deadlocks set the
	 * task-wide nofs context for the following operations.
	 */
	nofs_flag = memalloc_nofs_save();

> +	 */
> +	nofs_flag = memalloc_nofs_save();
> +
>  	/*
>  	 * Just clean up the in-memory strutures if the fs has been shut down.
>  	 */
> @@ -282,6 +289,8 @@ xfs_end_ioend(
>  		list_del_init(&ioend->io_list);
>  		xfs_destroy_ioend(ioend, error);
>  	}
> +
> +	memalloc_nofs_restore(nofs_flag);
>  }
>  
>  /*
> @@ -641,21 +650,19 @@ xfs_submit_ioend(
>  	struct xfs_ioend	*ioend,
>  	int			status)
>  {
> +	unsigned int		nofs_flag;
> +
> +	/*
> +	 * We can do memory allocation here, but aren't in transactional
> +	 * context.  To avoid memory allocation deadlocks set the task-wide
> +	 * nofs context for the following operations.
> +	 */
> +	nofs_flag = memalloc_nofs_save();
> +
>  	/* Convert CoW extents to regular */
>  	if (!status && ioend->io_fork == XFS_COW_FORK) {
> -		/*
> -		 * Yuk. This can do memory allocation, but is not a
> -		 * transactional operation so everything is done in GFP_KERNEL
> -		 * context. That can deadlock, because we hold pages in
> -		 * writeback state and GFP_KERNEL allocations can block on them.
> -		 * Hence we must operate in nofs conditions here.
> -		 */
> -		unsigned nofs_flag;
> -
> -		nofs_flag = memalloc_nofs_save();
>  		status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
>  				ioend->io_offset, ioend->io_size);
> -		memalloc_nofs_restore(nofs_flag);
>  	}
>  
>  	/* Reserve log space if we might write beyond the on-disk inode size. */
> @@ -666,6 +673,8 @@ xfs_submit_ioend(
>  	    !ioend->io_append_trans)
>  		status = xfs_setfilesize_trans_alloc(ioend);
>  
> +	memalloc_nofs_restore(nofs_flag);
> +
>  	ioend->io_bio->bi_private = ioend;
>  	ioend->io_bio->bi_end_io = xfs_end_bio;
>  
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 916a35cae5e9..f2d806ef8f06 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -379,6 +379,7 @@ xfs_dio_write_end_io(
>  	struct inode		*inode = file_inode(iocb->ki_filp);
>  	struct xfs_inode	*ip = XFS_I(inode);
>  	loff_t			offset = iocb->ki_pos;
> +	unsigned int		nofs_flag;
>  	int			error = 0;
>  
>  	trace_xfs_end_io_direct_write(ip, offset, size);
> @@ -395,10 +396,11 @@ xfs_dio_write_end_io(
>  	 */
>  	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
>  
> +	nofs_flag = memalloc_nofs_save();

Hmm, do we need this here?  I can't remember if there was a need for
setting NOFS under xfs_reflink_end_cow from a dio completion or if that
was purely the buffered writeback case...

--D

>  	if (flags & IOMAP_DIO_COW) {
>  		error = xfs_reflink_end_cow(ip, offset, size);
>  		if (error)
> -			return error;
> +			goto out;
>  	}
>  
>  	/*
> @@ -407,8 +409,10 @@ xfs_dio_write_end_io(
>  	 * earlier allows a racing dio read to find unwritten extents before
>  	 * they are converted.
>  	 */
> -	if (flags & IOMAP_DIO_UNWRITTEN)
> -		return xfs_iomap_write_unwritten(ip, offset, size, true);
> +	if (flags & IOMAP_DIO_UNWRITTEN) {
> +		error = xfs_iomap_write_unwritten(ip, offset, size, true);
> +		goto out;
> +	}
>  
>  	/*
>  	 * We need to update the in-core inode size here so that we don't end up
> @@ -430,6 +434,8 @@ xfs_dio_write_end_io(
>  		spin_unlock(&ip->i_flags_lock);
>  	}
>  
> +out:
> +	memalloc_nofs_restore(nofs_flag);
>  	return error;
>  }
>  
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 6b29452bfba0..461ea023b910 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -782,7 +782,7 @@ xfs_iomap_write_unwritten(
>  		 * complete here and might deadlock on the iolock.
>  		 */
>  		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
> -				XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp);
> +				XFS_TRANS_RESERVE, &tp);
>  		if (error)
>  			return error;
>  
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index 680ae7662a78..0b23c2b29609 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -572,7 +572,7 @@ xfs_reflink_cancel_cow_range(
>  
>  	/* Start a rolling transaction to remove the mappings */
>  	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
> -			0, 0, XFS_TRANS_NOFS, &tp);
> +			0, 0, 0, &tp);
>  	if (error)
>  		goto out;
>  
> @@ -631,7 +631,7 @@ xfs_reflink_end_cow_extent(
>  
>  	resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
>  	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
> -			XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp);
> +			XFS_TRANS_RESERVE, &tp);
>  	if (error)
>  		return error;
>  
> diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
> index b026f87608ce..2ad3faa12206 100644
> --- a/fs/xfs/xfs_trans.c
> +++ b/fs/xfs/xfs_trans.c
> @@ -264,9 +264,7 @@ xfs_trans_alloc(
>  	 * GFP_NOFS allocation context so that we avoid lockdep false positives
>  	 * by doing GFP_KERNEL allocations inside sb_start_intwrite().
>  	 */
> -	tp = kmem_zone_zalloc(xfs_trans_zone,
> -		(flags & XFS_TRANS_NOFS) ? KM_NOFS : KM_SLEEP);
> -
> +	tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
>  	if (!(flags & XFS_TRANS_NO_WRITECOUNT))
>  		sb_start_intwrite(mp->m_super);
>  
> -- 
> 2.20.1
>
Christoph Hellwig June 28, 2019, 5:37 a.m. UTC | #2
On Thu, Jun 27, 2019 at 03:30:30PM -0700, Darrick J. Wong wrote:
> I think the wording of this is too indirect.  The reason we need to set
> NOFS is because we could be doing writeback as part of reclaiming
> memory, which means that we cannot recurse back into filesystems to
> satisfy the memory allocation needed to create a transaction.  The NOFS
> part applies to any memory allocation, of course.
> 
> If you're fine with the wording below I'll just edit that into the
> patch:
> 
> 	/*
> 	 * We can allocate memory here while doing writeback on behalf of
> 	 * memory reclaim.  To avoid memory allocation deadlocks set the
> 	 * task-wide nofs context for the following operations.
> 	 */
> 	nofs_flag = memalloc_nofs_save();

Fine with me.

> >  	trace_xfs_end_io_direct_write(ip, offset, size);
> > @@ -395,10 +396,11 @@ xfs_dio_write_end_io(
> >  	 */
> >  	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
> >  
> > +	nofs_flag = memalloc_nofs_save();
> 
> Hmm, do we need this here?  I can't remember if there was a need for
> setting NOFS under xfs_reflink_end_cow from a dio completion or if that
> was purely the buffered writeback case...

We certainly had to add it for the unwritten extent conversion, maybe
the corner case just didn't manage to show up for COW yet:


commit 80641dc66a2d6dfb22af4413227a92b8ab84c7bb
Author: Christoph Hellwig <hch@infradead.org>
Date:   Mon Oct 19 04:00:03 2009 +0000

    xfs: I/O completion handlers must use NOFS allocations
    
    When completing I/O requests we must not allow the memory allocator to
    recurse into the filesystem, as we might deadlock on waiting for the
    I/O completion otherwise.  The only thing currently allocating normal
    GFP_KERNEL memory is the allocation of the transaction structure for
    the unwritten extent conversion.  Add a memflags argument to
    _xfs_trans_alloc to allow controlling the allocator behaviour.
    
    Signed-off-by: Christoph Hellwig <hch@lst.de>
    Reported-by: Thomas Neumann <tneumann@users.sourceforge.net>
    Tested-by: Thomas Neumann <tneumann@users.sourceforge.net>
    Reviewed-by: Alex Elder <aelder@sgi.com>
    Signed-off-by: Alex Elder <aelder@sgi.com>

diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 2d0b3e1da9e6..6f83f58c099f 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -611,7 +611,7 @@ xfs_fs_log_dummy(
 	xfs_inode_t	*ip;
 	int		error;
 
-	tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
+	tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
 	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
 	if (error) {
 		xfs_trans_cancel(tp, 0);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 67ae5555a30a..7294abce6ef2 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -860,8 +860,15 @@ xfs_iomap_write_unwritten(
 		 * set up a transaction to convert the range of extents
 		 * from unwritten to real. Do allocations in a loop until
 		 * we have covered the range passed in.
+		 *
+		 * Note that we open code the transaction allocation here
+		 * to pass KM_NOFS--we can't risk to recursing back into
+		 * the filesystem here as we might be asked to write out
+		 * the same inode that we complete here and might deadlock
+		 * on the iolock.
 		 */
-		tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
+		xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);
+		tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS);
 		tp->t_flags |= XFS_TRANS_RESERVE;
 		error = xfs_trans_reserve(tp, resblks,
 				XFS_WRITE_LOG_RES(mp), 0,
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 8b6c9e807efb..4d509f742bd2 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1471,7 +1471,7 @@ xfs_log_sbcount(
 	if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
 		return 0;
 
-	tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT);
+	tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP);
 	error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
 					XFS_DEFAULT_LOG_COUNT);
 	if (error) {
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 66b849358e62..237badcbac3b 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -236,19 +236,20 @@ xfs_trans_alloc(
 	uint		type)
 {
 	xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);
-	return _xfs_trans_alloc(mp, type);
+	return _xfs_trans_alloc(mp, type, KM_SLEEP);
 }
 
 xfs_trans_t *
 _xfs_trans_alloc(
 	xfs_mount_t	*mp,
-	uint		type)
+	uint		type,
+	uint		memflags)
 {
 	xfs_trans_t	*tp;
 
 	atomic_inc(&mp->m_active_trans);
 
-	tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
+	tp = kmem_zone_zalloc(xfs_trans_zone, memflags);
 	tp->t_magic = XFS_TRANS_MAGIC;
 	tp->t_type = type;
 	tp->t_mountp = mp;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index ed47fc77759c..a0574f593f52 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -924,7 +924,7 @@ typedef struct xfs_trans {
  * XFS transaction mechanism exported interfaces.
  */
 xfs_trans_t	*xfs_trans_alloc(struct xfs_mount *, uint);
-xfs_trans_t	*_xfs_trans_alloc(struct xfs_mount *, uint);
+xfs_trans_t	*_xfs_trans_alloc(struct xfs_mount *, uint, uint);
 xfs_trans_t	*xfs_trans_dup(xfs_trans_t *);
 int		xfs_trans_reserve(xfs_trans_t *, uint, uint, uint,
 				  uint, uint);
Darrick J. Wong June 28, 2019, 5:41 p.m. UTC | #3
On Fri, Jun 28, 2019 at 07:37:17AM +0200, Christoph Hellwig wrote:
> On Thu, Jun 27, 2019 at 03:30:30PM -0700, Darrick J. Wong wrote:
> > I think the wording of this is too indirect.  The reason we need to set
> > NOFS is because we could be doing writeback as part of reclaiming
> > memory, which means that we cannot recurse back into filesystems to
> > satisfy the memory allocation needed to create a transaction.  The NOFS
> > part applies to any memory allocation, of course.
> > 
> > If you're fine with the wording below I'll just edit that into the
> > patch:
> > 
> > 	/*
> > 	 * We can allocate memory here while doing writeback on behalf of
> > 	 * memory reclaim.  To avoid memory allocation deadlocks set the
> > 	 * task-wide nofs context for the following operations.
> > 	 */
> > 	nofs_flag = memalloc_nofs_save();
> 
> Fine with me.
> 
> > >  	trace_xfs_end_io_direct_write(ip, offset, size);
> > > @@ -395,10 +396,11 @@ xfs_dio_write_end_io(
> > >  	 */
> > >  	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
> > >  
> > > +	nofs_flag = memalloc_nofs_save();
> > 
> > Hmm, do we need this here?  I can't remember if there was a need for
> > setting NOFS under xfs_reflink_end_cow from a dio completion or if that
> > was purely the buffered writeback case...
> 
> We certainly had to add it for the unwritten extent conversion, maybe
> the corner case just didn't manage to show up for COW yet:

AFAICT the referenced patch solved the problem of writeback ioend
completion deadlocking with memory reclaim by changing the transaction
allocation call in the xfs_iomap_write_unwritten function, which is
called by writeback ioend completion.

However, the directio endio code also calls xfs_iomap_write_unwritten.
I can't tell if NOFS is actually needed in that context, or if we've
just been operating like that for a decade because that's the method
that was chosen to solve the deadlock.

I think this boils down to -- does writeback induced by memory reclaim
ever block on directio?

I don't think it does, but as a counterpoint xfs has been like this for
10 years and there don't seem to be many complaints about directio endio
pushing memory reclaim too hard...

--D

> 
> commit 80641dc66a2d6dfb22af4413227a92b8ab84c7bb
> Author: Christoph Hellwig <hch@infradead.org>
> Date:   Mon Oct 19 04:00:03 2009 +0000
> 
>     xfs: I/O completion handlers must use NOFS allocations
>     
>     When completing I/O requests we must not allow the memory allocator to
>     recurse into the filesystem, as we might deadlock on waiting for the
>     I/O completion otherwise.  The only thing currently allocating normal
>     GFP_KERNEL memory is the allocation of the transaction structure for
>     the unwritten extent conversion.  Add a memflags argument to
>     _xfs_trans_alloc to allow controlling the allocator behaviour.
>     
>     Signed-off-by: Christoph Hellwig <hch@lst.de>
>     Reported-by: Thomas Neumann <tneumann@users.sourceforge.net>
>     Tested-by: Thomas Neumann <tneumann@users.sourceforge.net>
>     Reviewed-by: Alex Elder <aelder@sgi.com>
>     Signed-off-by: Alex Elder <aelder@sgi.com>
> 
> diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
> index 2d0b3e1da9e6..6f83f58c099f 100644
> --- a/fs/xfs/xfs_fsops.c
> +++ b/fs/xfs/xfs_fsops.c
> @@ -611,7 +611,7 @@ xfs_fs_log_dummy(
>  	xfs_inode_t	*ip;
>  	int		error;
>  
> -	tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
> +	tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
>  	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
>  	if (error) {
>  		xfs_trans_cancel(tp, 0);
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 67ae5555a30a..7294abce6ef2 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -860,8 +860,15 @@ xfs_iomap_write_unwritten(
>  		 * set up a transaction to convert the range of extents
>  		 * from unwritten to real. Do allocations in a loop until
>  		 * we have covered the range passed in.
> +		 *
> +		 * Note that we open code the transaction allocation here
> +		 * to pass KM_NOFS--we can't risk to recursing back into
> +		 * the filesystem here as we might be asked to write out
> +		 * the same inode that we complete here and might deadlock
> +		 * on the iolock.
>  		 */
> -		tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
> +		xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);
> +		tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS);
>  		tp->t_flags |= XFS_TRANS_RESERVE;
>  		error = xfs_trans_reserve(tp, resblks,
>  				XFS_WRITE_LOG_RES(mp), 0,
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index 8b6c9e807efb..4d509f742bd2 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -1471,7 +1471,7 @@ xfs_log_sbcount(
>  	if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
>  		return 0;
>  
> -	tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT);
> +	tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP);
>  	error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
>  					XFS_DEFAULT_LOG_COUNT);
>  	if (error) {
> diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
> index 66b849358e62..237badcbac3b 100644
> --- a/fs/xfs/xfs_trans.c
> +++ b/fs/xfs/xfs_trans.c
> @@ -236,19 +236,20 @@ xfs_trans_alloc(
>  	uint		type)
>  {
>  	xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);
> -	return _xfs_trans_alloc(mp, type);
> +	return _xfs_trans_alloc(mp, type, KM_SLEEP);
>  }
>  
>  xfs_trans_t *
>  _xfs_trans_alloc(
>  	xfs_mount_t	*mp,
> -	uint		type)
> +	uint		type,
> +	uint		memflags)
>  {
>  	xfs_trans_t	*tp;
>  
>  	atomic_inc(&mp->m_active_trans);
>  
> -	tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
> +	tp = kmem_zone_zalloc(xfs_trans_zone, memflags);
>  	tp->t_magic = XFS_TRANS_MAGIC;
>  	tp->t_type = type;
>  	tp->t_mountp = mp;
> diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
> index ed47fc77759c..a0574f593f52 100644
> --- a/fs/xfs/xfs_trans.h
> +++ b/fs/xfs/xfs_trans.h
> @@ -924,7 +924,7 @@ typedef struct xfs_trans {
>   * XFS transaction mechanism exported interfaces.
>   */
>  xfs_trans_t	*xfs_trans_alloc(struct xfs_mount *, uint);
> -xfs_trans_t	*_xfs_trans_alloc(struct xfs_mount *, uint);
> +xfs_trans_t	*_xfs_trans_alloc(struct xfs_mount *, uint, uint);
>  xfs_trans_t	*xfs_trans_dup(xfs_trans_t *);
>  int		xfs_trans_reserve(xfs_trans_t *, uint, uint, uint,
>  				  uint, uint);
diff mbox series

Patch

diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index b9094709bc79..c45acbd3add9 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -65,7 +65,6 @@  void	xfs_log_get_max_trans_res(struct xfs_mount *mp,
 #define XFS_TRANS_DQ_DIRTY	0x10	/* at least one dquot in trx dirty */
 #define XFS_TRANS_RESERVE	0x20    /* OK to use reserved data blocks */
 #define XFS_TRANS_NO_WRITECOUNT 0x40	/* do not elevate SB writecount */
-#define XFS_TRANS_NOFS		0x80	/* pass KM_NOFS to kmem_alloc */
 /*
  * LOWMODE is used by the allocator to activate the lowspace algorithm - when
  * free space is running low the extent allocator may choose to allocate an
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 243548b9d0cc..8b3070a40245 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -138,8 +138,7 @@  xfs_setfilesize_trans_alloc(
 	struct xfs_trans	*tp;
 	int			error;
 
-	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0,
-				XFS_TRANS_NOFS, &tp);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
 	if (error)
 		return error;
 
@@ -240,8 +239,16 @@  xfs_end_ioend(
 	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
 	xfs_off_t		offset = ioend->io_offset;
 	size_t			size = ioend->io_size;
+	unsigned int		nofs_flag;
 	int			error;
 
+	/*
+	 * We can do memory allocation here, but aren't in transactional
+	 * context.  To avoid memory allocation deadlocks set the task-wide
+	 * nofs context for the following operations.
+	 */
+	nofs_flag = memalloc_nofs_save();
+
 	/*
 	 * Just clean up the in-memory strutures if the fs has been shut down.
 	 */
@@ -282,6 +289,8 @@  xfs_end_ioend(
 		list_del_init(&ioend->io_list);
 		xfs_destroy_ioend(ioend, error);
 	}
+
+	memalloc_nofs_restore(nofs_flag);
 }
 
 /*
@@ -641,21 +650,19 @@  xfs_submit_ioend(
 	struct xfs_ioend	*ioend,
 	int			status)
 {
+	unsigned int		nofs_flag;
+
+	/*
+	 * We can do memory allocation here, but aren't in transactional
+	 * context.  To avoid memory allocation deadlocks set the task-wide
+	 * nofs context for the following operations.
+	 */
+	nofs_flag = memalloc_nofs_save();
+
 	/* Convert CoW extents to regular */
 	if (!status && ioend->io_fork == XFS_COW_FORK) {
-		/*
-		 * Yuk. This can do memory allocation, but is not a
-		 * transactional operation so everything is done in GFP_KERNEL
-		 * context. That can deadlock, because we hold pages in
-		 * writeback state and GFP_KERNEL allocations can block on them.
-		 * Hence we must operate in nofs conditions here.
-		 */
-		unsigned nofs_flag;
-
-		nofs_flag = memalloc_nofs_save();
 		status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
 				ioend->io_offset, ioend->io_size);
-		memalloc_nofs_restore(nofs_flag);
 	}
 
 	/* Reserve log space if we might write beyond the on-disk inode size. */
@@ -666,6 +673,8 @@  xfs_submit_ioend(
 	    !ioend->io_append_trans)
 		status = xfs_setfilesize_trans_alloc(ioend);
 
+	memalloc_nofs_restore(nofs_flag);
+
 	ioend->io_bio->bi_private = ioend;
 	ioend->io_bio->bi_end_io = xfs_end_bio;
 
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 916a35cae5e9..f2d806ef8f06 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -379,6 +379,7 @@  xfs_dio_write_end_io(
 	struct inode		*inode = file_inode(iocb->ki_filp);
 	struct xfs_inode	*ip = XFS_I(inode);
 	loff_t			offset = iocb->ki_pos;
+	unsigned int		nofs_flag;
 	int			error = 0;
 
 	trace_xfs_end_io_direct_write(ip, offset, size);
@@ -395,10 +396,11 @@  xfs_dio_write_end_io(
 	 */
 	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
 
+	nofs_flag = memalloc_nofs_save();
 	if (flags & IOMAP_DIO_COW) {
 		error = xfs_reflink_end_cow(ip, offset, size);
 		if (error)
-			return error;
+			goto out;
 	}
 
 	/*
@@ -407,8 +409,10 @@  xfs_dio_write_end_io(
 	 * earlier allows a racing dio read to find unwritten extents before
 	 * they are converted.
 	 */
-	if (flags & IOMAP_DIO_UNWRITTEN)
-		return xfs_iomap_write_unwritten(ip, offset, size, true);
+	if (flags & IOMAP_DIO_UNWRITTEN) {
+		error = xfs_iomap_write_unwritten(ip, offset, size, true);
+		goto out;
+	}
 
 	/*
 	 * We need to update the in-core inode size here so that we don't end up
@@ -430,6 +434,8 @@  xfs_dio_write_end_io(
 		spin_unlock(&ip->i_flags_lock);
 	}
 
+out:
+	memalloc_nofs_restore(nofs_flag);
 	return error;
 }
 
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 6b29452bfba0..461ea023b910 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -782,7 +782,7 @@  xfs_iomap_write_unwritten(
 		 * complete here and might deadlock on the iolock.
 		 */
 		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
-				XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp);
+				XFS_TRANS_RESERVE, &tp);
 		if (error)
 			return error;
 
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 680ae7662a78..0b23c2b29609 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -572,7 +572,7 @@  xfs_reflink_cancel_cow_range(
 
 	/* Start a rolling transaction to remove the mappings */
 	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
-			0, 0, XFS_TRANS_NOFS, &tp);
+			0, 0, 0, &tp);
 	if (error)
 		goto out;
 
@@ -631,7 +631,7 @@  xfs_reflink_end_cow_extent(
 
 	resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
-			XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp);
+			XFS_TRANS_RESERVE, &tp);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index b026f87608ce..2ad3faa12206 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -264,9 +264,7 @@  xfs_trans_alloc(
 	 * GFP_NOFS allocation context so that we avoid lockdep false positives
 	 * by doing GFP_KERNEL allocations inside sb_start_intwrite().
 	 */
-	tp = kmem_zone_zalloc(xfs_trans_zone,
-		(flags & XFS_TRANS_NOFS) ? KM_NOFS : KM_SLEEP);
-
+	tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
 	if (!(flags & XFS_TRANS_NO_WRITECOUNT))
 		sb_start_intwrite(mp->m_super);