diff mbox

[2/3] fs, xfs: introduce FALLOC_FL_SEAL_BLOCK_MAP

Message ID 150135742076.35318.12884268722541769179.stgit@dwillia2-desk3.amr.corp.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Dan Williams July 29, 2017, 7:43 p.m. UTC
From falloc.h:

    FALLOC_FL_SEAL_BLOCK_MAP is used to seal (make immutable) all of the
    file logical-to-physical extent offset mappings in the file. The
    purpose is to allow an application to assume that there are no holes
    or shared extents in the file and that the metadata needed to find
    all the physical extents of the file is stable and can never be
    dirtied.

For now this patch only permits setting / clearing the in-memory state
of S_IOMAP_IMMUTABLE; persisting the state is saved for a later patch.

The implementation is careful to not allow the immutable state to change
while any process might have any established mappings. It reuses the
existing xfs_reflink_unshare() and xfs_alloc_file_space() to unshare
extents and fill all holes in the file, or otherwise extend the file
size in the same operation that sets S_IOMAP_IMMUTABLE.

Cc: Jan Kara <jack@suse.cz>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Suggested-by: Dave Chinner <david@fromorbit.com>
Suggested-by: "Darrick J. Wong" <darrick.wong@oracle.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/open.c                   |   26 ++++++++++++-
 fs/xfs/xfs_bmap_util.c      |   86 +++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_bmap_util.h      |    2 +
 fs/xfs/xfs_file.c           |   14 +++++--
 include/linux/falloc.h      |    3 +-
 include/uapi/linux/falloc.h |   19 ++++++++++
 6 files changed, 142 insertions(+), 8 deletions(-)

Comments

Darrick J. Wong July 31, 2017, 5:09 p.m. UTC | #1
On Sat, Jul 29, 2017 at 12:43:40PM -0700, Dan Williams wrote:
> >From falloc.h:
> 
>     FALLOC_FL_SEAL_BLOCK_MAP is used to seal (make immutable) all of the
>     file logical-to-physical extent offset mappings in the file. The
>     purpose is to allow an application to assume that there are no holes
>     or shared extents in the file and that the metadata needed to find
>     all the physical extents of the file is stable and can never be
>     dirtied.
> 
> For now this patch only permits setting / clearing the in-memory state
> of S_IOMAP_IMMMUTABLE, persisting the state is saved for a later patch.
> 
> The implementation is careful to not allow the immutable state to change
> while any process might have any established mappings. It reuses the
> existing xfs_reflink_unshare() and xfs_alloc_file_space() to unshare
> extents and fill all holes in the file, or otherwise extend the file
> size in the same operation that sets S_IOMAP_IMMUTABLE.
> 
> Cc: Jan Kara <jack@suse.cz>
> Cc: Jeff Moyer <jmoyer@redhat.com>
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> Suggested-by: Dave Chinner <david@fromorbit.com>
> Suggested-by: "Darrick J. Wong" <darrick.wong@oracle.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---
>  fs/open.c                   |   26 ++++++++++++-
>  fs/xfs/xfs_bmap_util.c      |   86 +++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_bmap_util.h      |    2 +
>  fs/xfs/xfs_file.c           |   14 +++++--
>  include/linux/falloc.h      |    3 +-
>  include/uapi/linux/falloc.h |   19 ++++++++++
>  6 files changed, 142 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/open.c b/fs/open.c
> index 7395860d7164..df075484fad5 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -241,7 +241,11 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
>  	struct inode *inode = file_inode(file);
>  	long ret;
>  
> -	if (offset < 0 || len <= 0)
> +	if (offset < 0 || len < 0)
> +		return -EINVAL;
> +
> +	/* Allow zero len only for the unseal operation */
> +	if (!(mode & FALLOC_FL_SEAL_BLOCK_MAP) && len == 0)
>  		return -EINVAL;
>  
>  	/* Return error if mode is not supported */
> @@ -273,6 +277,17 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
>  	    (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE)))
>  		return -EINVAL;
>  
> +	/*
> +	 * Seal block map should only be used exclusively, and with
> +	 * the IMMUTABLE capability.
> +	 */
> +	if (mode & FALLOC_FL_SEAL_BLOCK_MAP) {
> +		if (mode & ~FALLOC_FL_SEAL_BLOCK_MAP)
> +			return -EINVAL;
> +		if (!capable(CAP_LINUX_IMMUTABLE))
> +			return -EPERM;
> +	}
> +
>  	if (!(file->f_mode & FMODE_WRITE))
>  		return -EBADF;
>  
> @@ -292,9 +307,14 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
>  		return -ETXTBSY;
>  
>  	/*
> -	 * We cannot allow any allocation changes on an iomap immutable file
> +	 * We cannot allow any allocation changes on an iomap immutable
> +	 * file, however if the operation is FALLOC_FL_SEAL_BLOCK_MAP,
> +	 * call down to ->fallocate() to determine if the operations is
> +	 * allowed. ->fallocate() may either clear the flag when @len is
> +	 * zero, or validate that the requested operation is already the
> +	 * current state of the file.
>  	 */
> -	if (IS_IOMAP_IMMUTABLE(inode))
> +	if (IS_IOMAP_IMMUTABLE(inode) && (!(mode & FALLOC_FL_SEAL_BLOCK_MAP)))
>  		return -ETXTBSY;
>  
>  	/*
> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> index 93e955262d07..c4fc79a0704f 100644
> --- a/fs/xfs/xfs_bmap_util.c
> +++ b/fs/xfs/xfs_bmap_util.c
> @@ -1387,6 +1387,92 @@ xfs_zero_file_space(
>  
>  }
>  
> +int
> +xfs_seal_file_space(
> +	struct xfs_inode	*ip,
> +	xfs_off_t		offset,
> +	xfs_off_t		len)
> +{
> +	struct inode		*inode = VFS_I(ip);
> +	struct address_space	*mapping = inode->i_mapping;
> +	int			error = 0;
> +
> +	if (offset)
> +		return -EINVAL;
> +
> +	i_mmap_lock_read(mapping);

(Are we allowed to take address_space->i_mmap_rwsem while holding
xfs_inode->i_mmaplock?)

> +	xfs_ilock(ip, XFS_ILOCK_EXCL);
> +	if (len == 0) {
> +		/*
> +		 * Clear the immutable flag provided there are no active
> +		 * mappings. The active mapping check prevents an
> +		 * application that is assuming a static block map, for
> +		 * DAX or peer-to-peer DMA, from having this state
> +		 * silently change behind its back.
> +		 */
> +		if (RB_EMPTY_ROOT(&mapping->i_mmap))
> +			inode->i_flags &= ~S_IOMAP_IMMUTABLE;
> +		else
> +			error = -EBUSY;
> +	} else if (IS_IOMAP_IMMUTABLE(inode)) {
> +		if (len == i_size_read(inode)) {
> +			/*
> +			 * The file is already in the correct state,
> +			 * bail out without error below.
> +			 */
> +			len = 0;
> +		} else {
> +			/* too late to allocate more space */
> +			error = -ETXTBSY;
> +		}
> +	} else {
> +		if (len < i_size_read(inode)) {
> +			/*
> +			 * Since S_IOMAP_IMMUTABLE is inode global it
> +			 * does not make sense to fallocate(immutable)
> +			 * on a sub-range of the file.
> +			 */
> +			error = -EINVAL;
> +		} else if (!RB_EMPTY_ROOT(&mapping->i_mmap)) {
> +			/*
> +			 * It's not strictly required to prevent setting
> +			 * immutable while a file is already mapped, but
> +			 * we do it for simplicity and symmetry with the
> +			 * S_IOMAP_IMMUTABLE disable case.
> +			 */
> +			error = -EBUSY;
> +		} else
> +			inode->i_flags |= S_IOMAP_IMMUTABLE;
> +	}
> +	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> +	i_mmap_unlock_read(mapping);
> +
> +	if (error || len == 0)
> +		return error;
> +
> +	/*
> +	 * From here, the immutable flag is already set, so new
> +	 * operations that would change the block map are prevented by
> +	 * upper layer code paths. Wwe can proceed to unshare and
> +	 * allocate zeroed / written extents.
> +	 */
> +	error = xfs_reflink_unshare(ip, offset, len);

At this point we still hold the io and mmap locks and the vfs thinks the
inode is iomap_immutable, but we haven't actually fixed the block
mappings, which means that the flag is set but there could be holes and
shared extents aplenty?

That seems strange to me -- wouldn't we want to try to unshare and
allocate, and only then take the ilock, check the mappings, and only set
the flag if nobody's messed with the extent map since the unshare &
allocated?  IOWs,

if (len == 0)
	return xfs_unseal_file_space();

xfs_reflink_unshare(...);
xfs_alloc_file_space(...);

xfs_ilock(...);
if (xfs_iomap_lacks_holes_and_shared_blocks(...)) {
	VFS_I(ip)->i_flags |= S_IOMAP_IMMUTABLE;
	ip->i_d.di_flags2 |= XFS_DIFLAG2_IOMAP_IMMUTABLE;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
} else {
	error = -EBUSY;
}
xfs_iunlock(...);

(I guess we hold sufficient locks, but still...)

--D

> +	if (error)
> +		goto err;
> +
> +	error = xfs_alloc_file_space(ip, offset, len,
> +			XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO);
> +	if (error)
> +		goto err;
> +
> +	return 0;
> +err:
> +	xfs_ilock(ip, XFS_ILOCK_EXCL);
> +	inode->i_flags &= ~S_IOMAP_IMMUTABLE;
> +	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> +	return error;
> +}
> +
>  /*
>   * @next_fsb will keep track of the extent currently undergoing shift.
>   * @stop_fsb will keep track of the extent at which we have to stop.
> diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
> index 0cede1043571..5115a32a2483 100644
> --- a/fs/xfs/xfs_bmap_util.h
> +++ b/fs/xfs/xfs_bmap_util.h
> @@ -60,6 +60,8 @@ int	xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
>  				xfs_off_t len);
>  int	xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
>  				xfs_off_t len);
> +int	xfs_seal_file_space(struct xfs_inode *, xfs_off_t offset,
> +				xfs_off_t len);
>  
>  /* EOF block manipulation functions */
>  bool	xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index c4893e226fd8..e21121530a90 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -739,7 +739,8 @@ xfs_file_write_iter(
>  #define	XFS_FALLOC_FL_SUPPORTED						\
>  		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
>  		 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |	\
> -		 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
> +		 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE |	\
> +		 FALLOC_FL_SEAL_BLOCK_MAP)
>  
>  STATIC long
>  xfs_file_fallocate(
> @@ -834,9 +835,14 @@ xfs_file_fallocate(
>  				error = xfs_reflink_unshare(ip, offset, len);
>  				if (error)
>  					goto out_unlock;
> -			}
> -			error = xfs_alloc_file_space(ip, offset, len,
> -						     XFS_BMAPI_PREALLOC);
> +
> +				error = xfs_alloc_file_space(ip, offset, len,
> +						XFS_BMAPI_PREALLOC);
> +			} else if (mode & FALLOC_FL_SEAL_BLOCK_MAP) {
> +				error = xfs_seal_file_space(ip, offset, len);
> +			} else
> +				error = xfs_alloc_file_space(ip, offset, len,
> +						XFS_BMAPI_PREALLOC);
>  		}
>  		if (error)
>  			goto out_unlock;
> diff --git a/include/linux/falloc.h b/include/linux/falloc.h
> index 7494dc67c66f..48546c6fbec7 100644
> --- a/include/linux/falloc.h
> +++ b/include/linux/falloc.h
> @@ -26,6 +26,7 @@ struct space_resv {
>  					 FALLOC_FL_COLLAPSE_RANGE |	\
>  					 FALLOC_FL_ZERO_RANGE |		\
>  					 FALLOC_FL_INSERT_RANGE |	\
> -					 FALLOC_FL_UNSHARE_RANGE)
> +					 FALLOC_FL_UNSHARE_RANGE |	\
> +					 FALLOC_FL_SEAL_BLOCK_MAP)
>  
>  #endif /* _FALLOC_H_ */
> diff --git a/include/uapi/linux/falloc.h b/include/uapi/linux/falloc.h
> index b075f601919b..629c9b20e49b 100644
> --- a/include/uapi/linux/falloc.h
> +++ b/include/uapi/linux/falloc.h
> @@ -76,4 +76,23 @@
>   */
>  #define FALLOC_FL_UNSHARE_RANGE		0x40
>  
> +/*
> + * FALLOC_FL_SEAL_BLOCK_MAP is used to seal (make immutable) all of the
> + * file logical-to-physical extent offset mappings in the file. The
> + * purpose is to allow an application to assume that there are no holes
> + * or shared extents in the file and that the metadata needed to find
> + * all the physical extents of the file is stable and can never be
> + * dirtied.
> + *
> + * The immutable property is in effect for the entire inode, so the
> + * range for this operation must start at offset 0 and len must be
> + * greater than or equal to the current size of the file. If greater,
> + * this operation allocates, unshares, hole fills, and seals in one
> + * atomic step. If len is zero then the immutable state is cleared for
> + * the inode.
> + *
> + * This flag implies FALLOC_FL_UNSHARE_RANGE and as such cannot be used
> + * with the punch, zero, collapse, or insert range modes.
> + */
> +#define FALLOC_FL_SEAL_BLOCK_MAP	0x80
>  #endif /* _UAPI_FALLOC_H_ */
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dan Williams July 31, 2017, 6:25 p.m. UTC | #2
On Mon, Jul 31, 2017 at 10:09 AM, Darrick J. Wong
<darrick.wong@oracle.com> wrote:
> On Sat, Jul 29, 2017 at 12:43:40PM -0700, Dan Williams wrote:
>> >From falloc.h:
>>
>>     FALLOC_FL_SEAL_BLOCK_MAP is used to seal (make immutable) all of the
>>     file logical-to-physical extent offset mappings in the file. The
>>     purpose is to allow an application to assume that there are no holes
>>     or shared extents in the file and that the metadata needed to find
>>     all the physical extents of the file is stable and can never be
>>     dirtied.
>>
>> For now this patch only permits setting / clearing the in-memory state
>> of S_IOMAP_IMMMUTABLE, persisting the state is saved for a later patch.
>>
>> The implementation is careful to not allow the immutable state to change
>> while any process might have any established mappings. It reuses the
>> existing xfs_reflink_unshare() and xfs_alloc_file_space() to unshare
>> extents and fill all holes in the file, or otherwise extend the file
>> size in the same operation that sets S_IOMAP_IMMUTABLE.
>>
>> Cc: Jan Kara <jack@suse.cz>
>> Cc: Jeff Moyer <jmoyer@redhat.com>
>> Cc: Christoph Hellwig <hch@lst.de>
>> Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
>> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
>> Suggested-by: Dave Chinner <david@fromorbit.com>
>> Suggested-by: "Darrick J. Wong" <darrick.wong@oracle.com>
>> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>> ---
>>  fs/open.c                   |   26 ++++++++++++-
>>  fs/xfs/xfs_bmap_util.c      |   86 +++++++++++++++++++++++++++++++++++++++++++
>>  fs/xfs/xfs_bmap_util.h      |    2 +
>>  fs/xfs/xfs_file.c           |   14 +++++--
>>  include/linux/falloc.h      |    3 +-
>>  include/uapi/linux/falloc.h |   19 ++++++++++
>>  6 files changed, 142 insertions(+), 8 deletions(-)
>>
>> diff --git a/fs/open.c b/fs/open.c
>> index 7395860d7164..df075484fad5 100644
>> --- a/fs/open.c
>> +++ b/fs/open.c
>> @@ -241,7 +241,11 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
>>       struct inode *inode = file_inode(file);
>>       long ret;
>>
>> -     if (offset < 0 || len <= 0)
>> +     if (offset < 0 || len < 0)
>> +             return -EINVAL;
>> +
>> +     /* Allow zero len only for the unseal operation */
>> +     if (!(mode & FALLOC_FL_SEAL_BLOCK_MAP) && len == 0)
>>               return -EINVAL;
>>
>>       /* Return error if mode is not supported */
>> @@ -273,6 +277,17 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
>>           (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE)))
>>               return -EINVAL;
>>
>> +     /*
>> +      * Seal block map should only be used exclusively, and with
>> +      * the IMMUTABLE capability.
>> +      */
>> +     if (mode & FALLOC_FL_SEAL_BLOCK_MAP) {
>> +             if (mode & ~FALLOC_FL_SEAL_BLOCK_MAP)
>> +                     return -EINVAL;
>> +             if (!capable(CAP_LINUX_IMMUTABLE))
>> +                     return -EPERM;
>> +     }
>> +
>>       if (!(file->f_mode & FMODE_WRITE))
>>               return -EBADF;
>>
>> @@ -292,9 +307,14 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
>>               return -ETXTBSY;
>>
>>       /*
>> -      * We cannot allow any allocation changes on an iomap immutable file
>> +      * We cannot allow any allocation changes on an iomap immutable
>> +      * file, however if the operation is FALLOC_FL_SEAL_BLOCK_MAP,
>> +      * call down to ->fallocate() to determine if the operations is
>> +      * allowed. ->fallocate() may either clear the flag when @len is
>> +      * zero, or validate that the requested operation is already the
>> +      * current state of the file.
>>        */
>> -     if (IS_IOMAP_IMMUTABLE(inode))
>> +     if (IS_IOMAP_IMMUTABLE(inode) && (!(mode & FALLOC_FL_SEAL_BLOCK_MAP)))
>>               return -ETXTBSY;
>>
>>       /*
>> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
>> index 93e955262d07..c4fc79a0704f 100644
>> --- a/fs/xfs/xfs_bmap_util.c
>> +++ b/fs/xfs/xfs_bmap_util.c
>> @@ -1387,6 +1387,92 @@ xfs_zero_file_space(
>>
>>  }
>>
>> +int
>> +xfs_seal_file_space(
>> +     struct xfs_inode        *ip,
>> +     xfs_off_t               offset,
>> +     xfs_off_t               len)
>> +{
>> +     struct inode            *inode = VFS_I(ip);
>> +     struct address_space    *mapping = inode->i_mapping;
>> +     int                     error = 0;
>> +
>> +     if (offset)
>> +             return -EINVAL;
>> +
>> +     i_mmap_lock_read(mapping);
>
> (Are we allowed to take address_space->i_mmap_rwsem while holding
> xfs_inode->i_mmaplock?)

Empirically, yes. Lockdep complains when those locks are taken in the
reverse order.

That seems to be inconsistent with the "mmap_sem -> i_mmap_lock ->
page_lock" note in the xfs_ilock comment. Am I confusing what
i_mmap_lock means in that comment, is that the i_mmap_lock_read(), or
the i_mmaplock in the xfs_inode?

>
>> +     xfs_ilock(ip, XFS_ILOCK_EXCL);
>> +     if (len == 0) {
>> +             /*
>> +              * Clear the immutable flag provided there are no active
>> +              * mappings. The active mapping check prevents an
>> +              * application that is assuming a static block map, for
>> +              * DAX or peer-to-peer DMA, from having this state
>> +              * silently change behind its back.
>> +              */
>> +             if (RB_EMPTY_ROOT(&mapping->i_mmap))
>> +                     inode->i_flags &= ~S_IOMAP_IMMUTABLE;
>> +             else
>> +                     error = -EBUSY;
>> +     } else if (IS_IOMAP_IMMUTABLE(inode)) {
>> +             if (len == i_size_read(inode)) {
>> +                     /*
>> +                      * The file is already in the correct state,
>> +                      * bail out without error below.
>> +                      */
>> +                     len = 0;
>> +             } else {
>> +                     /* too late to allocate more space */
>> +                     error = -ETXTBSY;
>> +             }
>> +     } else {
>> +             if (len < i_size_read(inode)) {
>> +                     /*
>> +                      * Since S_IOMAP_IMMUTABLE is inode global it
>> +                      * does not make sense to fallocate(immutable)
>> +                      * on a sub-range of the file.
>> +                      */
>> +                     error = -EINVAL;
>> +             } else if (!RB_EMPTY_ROOT(&mapping->i_mmap)) {
>> +                     /*
>> +                      * It's not strictly required to prevent setting
>> +                      * immutable while a file is already mapped, but
>> +                      * we do it for simplicity and symmetry with the
>> +                      * S_IOMAP_IMMUTABLE disable case.
>> +                      */
>> +                     error = -EBUSY;
>> +             } else
>> +                     inode->i_flags |= S_IOMAP_IMMUTABLE;
>> +     }
>> +     xfs_iunlock(ip, XFS_ILOCK_EXCL);
>> +     i_mmap_unlock_read(mapping);
>> +
>> +     if (error || len == 0)
>> +             return error;
>> +
>> +     /*
>> +      * From here, the immutable flag is already set, so new
>> +      * operations that would change the block map are prevented by
>> +      * upper layer code paths. Wwe can proceed to unshare and
>> +      * allocate zeroed / written extents.
>> +      */
>> +     error = xfs_reflink_unshare(ip, offset, len);
>
> At this point we still hold the io and mmap locks and the vfs thinks the
> inode is iomap_immutable, but we haven't actually fixed the block
> mappings, which means that the flag is set but there could be holes and
> shared extents aplenty?
>
> That seems strange to me -- wouldn't we want to try to unshare and
> allocate, and only then take the ilock, check the mappings, and only set
> the flag if nobody's messed with the extent map since the unshare &
> allocated?  IOWs,
>
> if (len == 0)
>         return xfs_unseal_file_space();
>
> xfs_reflink_unshare(...);
> xfs_alloc_file_space(...);
>
> xfs_ilock(...);
> if (xfs_iomap_lacks_holes_and_shared_blocks(...)) {
>         VFS_I(ip)->i_flags |= S_IOMAP_IMMUTABLE;
>         ip->i_d.di_flags2 |= XFS_DIFLAG2_IOMAP_IMMUTABLE;
>         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
> } else {
>         error = -EBUSY;
> }
> xfs_iunlock(...);
>
> (I guess we hold sufficient locks, but still...)

Yes, that looks safer, especially if other vfs paths make assumptions
about the block map upon seeing that flag set.
Dave Chinner Aug. 1, 2017, 12:30 a.m. UTC | #3
On Mon, Jul 31, 2017 at 11:25:34AM -0700, Dan Williams wrote:
> On Mon, Jul 31, 2017 at 10:09 AM, Darrick J. Wong
> <darrick.wong@oracle.com> wrote:
> >> index 93e955262d07..c4fc79a0704f 100644
> >> --- a/fs/xfs/xfs_bmap_util.c
> >> +++ b/fs/xfs/xfs_bmap_util.c
> >> @@ -1387,6 +1387,92 @@ xfs_zero_file_space(
> >>
> >>  }
> >>
> >> +int
> >> +xfs_seal_file_space(
> >> +     struct xfs_inode        *ip,
> >> +     xfs_off_t               offset,
> >> +     xfs_off_t               len)
> >> +{
> >> +     struct inode            *inode = VFS_I(ip);
> >> +     struct address_space    *mapping = inode->i_mapping;
> >> +     int                     error = 0;
> >> +
> >> +     if (offset)
> >> +             return -EINVAL;
> >> +
> >> +     i_mmap_lock_read(mapping);
> >
> > (Are we allowed to take address_space->i_mmap_rwsem while holding
> > xfs_inode->i_mmaplock?)
> 
> Empirically, yes. Lockdep complains when those locks are taken in the
> reverse order.

My pet hate: people who rely on lockdep to tell them that locking is
wrong rather than understanding what the correct locking order is
before writing code.

> That seems to be inconsistent with the "mmap_sem -> i_mmap_lock ->
> page_lock" note in the xfs_ilock comment. Am I confusing what
> i_mmap_lock means in that comment, is that the i_mmap_lock_read(), or
> the i_mmaplock in the xfs_inode?

The latter. The lock orders you need to pay attention to are
documented in mm/filemap.c. (Which, incidentally, needs updating to
refer to i_rwsem, not i_mutex.)

 *  ->i_mutex
 *    ->i_mmap_rwsem            (truncate->unmap_mapping_range)

 *  ->mmap_sem                                                                   
 *    ->i_mmap_rwsem                                                             
 *      ->page_table_lock or pte_lock   (various, mainly in memory.c)            
 *        ->mapping->tree_lock  (arch-dependent flush_dcache_mmap_lock)

As it is, I think we should not be taking internal mm/ state locks
deep inside filesystem code as it smells of layering violations. We
don't do this anywhere else for mapping checks - if we already hold
the XFS_MMAPLOCK_EXCL here, then we've already locked out page
faults from changing the state of the inode. In which case, we
should not need a mmap internal lock to be held here, same as all
the other filesystem operations that lock out page faults....

> >> +     xfs_ilock(ip, XFS_ILOCK_EXCL);
> >> +     if (len == 0) {
> >> +             /*
> >> +              * Clear the immutable flag provided there are no active
> >> +              * mappings. The active mapping check prevents an
> >> +              * application that is assuming a static block map, for
> >> +              * DAX or peer-to-peer DMA, from having this state
> >> +              * silently change behind its back.
> >> +              */
> >> +             if (RB_EMPTY_ROOT(&mapping->i_mmap))

			mapping_mapped(mapping)

> >> +                     inode->i_flags &= ~S_IOMAP_IMMUTABLE;
> >> +             else
> >> +                     error = -EBUSY;
> >> +     } else if (IS_IOMAP_IMMUTABLE(inode)) {
> >> +             if (len == i_size_read(inode)) {
> >> +                     /*
> >> +                      * The file is already in the correct state,
> >> +                      * bail out without error below.
> >> +                      */
> >> +                     len = 0;

> >> +             } else {
> >> +                     /* too late to allocate more space */
> >> +                     error = -ETXTBSY;
> >> +             }
> >> +     } else {
> >> +             if (len < i_size_read(inode)) {
> >> +                     /*
> >> +                      * Since S_IOMAP_IMMUTABLE is inode global it
> >> +                      * does not make sense to fallocate(immutable)
> >> +                      * on a sub-range of the file.
> >> +                      */
> >> +                     error = -EINVAL;
> >> +             } else if (!RB_EMPTY_ROOT(&mapping->i_mmap)) {
> >> +                     /*
> >> +                      * It's not strictly required to prevent setting
> >> +                      * immutable while a file is already mapped, but
> >> +                      * we do it for simplicity and symmetry with the
> >> +                      * S_IOMAP_IMMUTABLE disable case.
> >> +                      */
> >> +                     error = -EBUSY;
> >> +             } else
> >> +                     inode->i_flags |= S_IOMAP_IMMUTABLE;
> >> +     }
> >> +     xfs_iunlock(ip, XFS_ILOCK_EXCL);
> >> +     i_mmap_unlock_read(mapping);
> >> +
> >> +     if (error || len == 0)
> >> +             return error;

I have to say, I find this checking to be fairly grotty. The "len ==
0" API to remove the immutable flag is a gross hack.  IMO, it's
better to add a separate fallocate command to "unseal" the extent
map, and let that happen according to whether the file is mapped or
not.  Perhaps it would be better to start with a man page
documenting the desired API?

FWIW, the if/else if/else structure could be cleaned up with a
simple "goto out_unlock" construct such as:

	/* don't make immutable if inode is currently mapped */
	error = -EBUSY;
	if (mapping_mapped(mapping))
		goto out_unlock;

	/* can't do anything if inode is already immutable */
	error = -ETXTBSY;
	if (IS_IMMUTABLE(inode) || IS_IOMAP_IMMUTABLE(inode))
		goto out_unlock;

	/* XFS only supports whole file extent immutability */
	error = -EINVAL;
	if (len != i_size_read(inode))
		goto out_unlock;

	/* all good to go */
	error = 0;

out_unlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	i_mmap_unlock_read(mapping);

	if (error)
	     return error;

	/* now unshare, allocate and add immutable flag */

Cheers,

Dave.
diff mbox

Patch

diff --git a/fs/open.c b/fs/open.c
index 7395860d7164..df075484fad5 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -241,7 +241,11 @@  int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	struct inode *inode = file_inode(file);
 	long ret;
 
-	if (offset < 0 || len <= 0)
+	if (offset < 0 || len < 0)
+		return -EINVAL;
+
+	/* Allow zero len only for the unseal operation */
+	if (!(mode & FALLOC_FL_SEAL_BLOCK_MAP) && len == 0)
 		return -EINVAL;
 
 	/* Return error if mode is not supported */
@@ -273,6 +277,17 @@  int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	    (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE)))
 		return -EINVAL;
 
+	/*
+	 * Seal block map should only be used exclusively, and with
+	 * the IMMUTABLE capability.
+	 */
+	if (mode & FALLOC_FL_SEAL_BLOCK_MAP) {
+		if (mode & ~FALLOC_FL_SEAL_BLOCK_MAP)
+			return -EINVAL;
+		if (!capable(CAP_LINUX_IMMUTABLE))
+			return -EPERM;
+	}
+
 	if (!(file->f_mode & FMODE_WRITE))
 		return -EBADF;
 
@@ -292,9 +307,14 @@  int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 		return -ETXTBSY;
 
 	/*
-	 * We cannot allow any allocation changes on an iomap immutable file
+	 * We cannot allow any allocation changes on an iomap immutable
+	 * file, however if the operation is FALLOC_FL_SEAL_BLOCK_MAP,
+	 * call down to ->fallocate() to determine if the operation is
+	 * allowed. ->fallocate() may either clear the flag when @len is
+	 * zero, or validate that the requested operation is already the
+	 * current state of the file.
 	 */
-	if (IS_IOMAP_IMMUTABLE(inode))
+	if (IS_IOMAP_IMMUTABLE(inode) && (!(mode & FALLOC_FL_SEAL_BLOCK_MAP)))
 		return -ETXTBSY;
 
 	/*
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 93e955262d07..c4fc79a0704f 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1387,6 +1387,92 @@  xfs_zero_file_space(
 
 }
 
+int
+xfs_seal_file_space(
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	xfs_off_t		len)
+{
+	struct inode		*inode = VFS_I(ip);
+	struct address_space	*mapping = inode->i_mapping;
+	int			error = 0;
+
+	if (offset)
+		return -EINVAL;
+
+	i_mmap_lock_read(mapping);
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	if (len == 0) {
+		/*
+		 * Clear the immutable flag provided there are no active
+		 * mappings. The active mapping check prevents an
+		 * application that is assuming a static block map, for
+		 * DAX or peer-to-peer DMA, from having this state
+		 * silently change behind its back.
+		 */
+		if (RB_EMPTY_ROOT(&mapping->i_mmap))
+			inode->i_flags &= ~S_IOMAP_IMMUTABLE;
+		else
+			error = -EBUSY;
+	} else if (IS_IOMAP_IMMUTABLE(inode)) {
+		if (len == i_size_read(inode)) {
+			/*
+			 * The file is already in the correct state,
+			 * bail out without error below.
+			 */
+			len = 0;
+		} else {
+			/* too late to allocate more space */
+			error = -ETXTBSY;
+		}
+	} else {
+		if (len < i_size_read(inode)) {
+			/*
+			 * Since S_IOMAP_IMMUTABLE is inode global it
+			 * does not make sense to fallocate(immutable)
+			 * on a sub-range of the file.
+			 */
+			error = -EINVAL;
+		} else if (!RB_EMPTY_ROOT(&mapping->i_mmap)) {
+			/*
+			 * It's not strictly required to prevent setting
+			 * immutable while a file is already mapped, but
+			 * we do it for simplicity and symmetry with the
+			 * S_IOMAP_IMMUTABLE disable case.
+			 */
+			error = -EBUSY;
+		} else
+			inode->i_flags |= S_IOMAP_IMMUTABLE;
+	}
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	i_mmap_unlock_read(mapping);
+
+	if (error || len == 0)
+		return error;
+
+	/*
+	 * From here, the immutable flag is already set, so new
+	 * operations that would change the block map are prevented by
+	 * upper layer code paths. We can proceed to unshare and
+	 * allocate zeroed / written extents.
+	 */
+	error = xfs_reflink_unshare(ip, offset, len);
+	if (error)
+		goto err;
+
+	error = xfs_alloc_file_space(ip, offset, len,
+			XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO);
+	if (error)
+		goto err;
+
+	return 0;
+err:
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	inode->i_flags &= ~S_IOMAP_IMMUTABLE;
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+}
+
 /*
  * @next_fsb will keep track of the extent currently undergoing shift.
  * @stop_fsb will keep track of the extent at which we have to stop.
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 0cede1043571..5115a32a2483 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -60,6 +60,8 @@  int	xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
 				xfs_off_t len);
 int	xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
 				xfs_off_t len);
+int	xfs_seal_file_space(struct xfs_inode *, xfs_off_t offset,
+				xfs_off_t len);
 
 /* EOF block manipulation functions */
 bool	xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index c4893e226fd8..e21121530a90 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -739,7 +739,8 @@  xfs_file_write_iter(
 #define	XFS_FALLOC_FL_SUPPORTED						\
 		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
 		 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |	\
-		 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
+		 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE |	\
+		 FALLOC_FL_SEAL_BLOCK_MAP)
 
 STATIC long
 xfs_file_fallocate(
@@ -834,9 +835,14 @@  xfs_file_fallocate(
 				error = xfs_reflink_unshare(ip, offset, len);
 				if (error)
 					goto out_unlock;
-			}
-			error = xfs_alloc_file_space(ip, offset, len,
-						     XFS_BMAPI_PREALLOC);
+
+				error = xfs_alloc_file_space(ip, offset, len,
+						XFS_BMAPI_PREALLOC);
+			} else if (mode & FALLOC_FL_SEAL_BLOCK_MAP) {
+				error = xfs_seal_file_space(ip, offset, len);
+			} else
+				error = xfs_alloc_file_space(ip, offset, len,
+						XFS_BMAPI_PREALLOC);
 		}
 		if (error)
 			goto out_unlock;
diff --git a/include/linux/falloc.h b/include/linux/falloc.h
index 7494dc67c66f..48546c6fbec7 100644
--- a/include/linux/falloc.h
+++ b/include/linux/falloc.h
@@ -26,6 +26,7 @@  struct space_resv {
 					 FALLOC_FL_COLLAPSE_RANGE |	\
 					 FALLOC_FL_ZERO_RANGE |		\
 					 FALLOC_FL_INSERT_RANGE |	\
-					 FALLOC_FL_UNSHARE_RANGE)
+					 FALLOC_FL_UNSHARE_RANGE |	\
+					 FALLOC_FL_SEAL_BLOCK_MAP)
 
 #endif /* _FALLOC_H_ */
diff --git a/include/uapi/linux/falloc.h b/include/uapi/linux/falloc.h
index b075f601919b..629c9b20e49b 100644
--- a/include/uapi/linux/falloc.h
+++ b/include/uapi/linux/falloc.h
@@ -76,4 +76,23 @@ 
  */
 #define FALLOC_FL_UNSHARE_RANGE		0x40
 
+/*
+ * FALLOC_FL_SEAL_BLOCK_MAP is used to seal (make immutable) all of the
+ * file logical-to-physical extent offset mappings in the file. The
+ * purpose is to allow an application to assume that there are no holes
+ * or shared extents in the file and that the metadata needed to find
+ * all the physical extents of the file is stable and can never be
+ * dirtied.
+ *
+ * The immutable property is in effect for the entire inode, so the
+ * range for this operation must start at offset 0 and len must be
+ * greater than or equal to the current size of the file. If greater,
+ * this operation allocates, unshares, hole fills, and seals in one
+ * atomic step. If len is zero then the immutable state is cleared for
+ * the inode.
+ *
+ * This flag implies FALLOC_FL_UNSHARE_RANGE and as such cannot be used
+ * with the punch, zero, collapse, or insert range modes.
+ */
+#define FALLOC_FL_SEAL_BLOCK_MAP	0x80
 #endif /* _UAPI_FALLOC_H_ */