diff mbox

[v2,2/5] fs, xfs: introduce FALLOC_FL_SEAL_BLOCK_MAP

Message ID 150181369754.32119.7390149918955188408.stgit@dwillia2-desk3.amr.corp.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Dan Williams Aug. 4, 2017, 2:28 a.m. UTC
>From falloc.h:

    FALLOC_FL_SEAL_BLOCK_MAP is used to seal (make immutable) all of the
    file logical-to-physical extent offset mappings in the file. The
    purpose is to allow an application to assume that there are no holes
    or shared extents in the file and that the metadata needed to find
    all the physical extents of the file is stable and can never be
    dirtied.

For now this patch only permits setting the in-memory state of
S_IOMAP_IMMMUTABLE. Support for clearing and persisting the state is
saved for later patches.

The implementation is careful to not allow the immutable state to change
while any process might have any established mappings. It reuses the
existing xfs_reflink_unshare() and xfs_alloc_file_space() to unshare
extents and fill all holes in the file. It then holds XFS_ILOCK_EXCL
while it validates the file is in the proper state and sets
S_IOMAP_IMMUTABLE.

Cc: Jan Kara <jack@suse.cz>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Suggested-by: Dave Chinner <david@fromorbit.com>
Suggested-by: "Darrick J. Wong" <darrick.wong@oracle.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/open.c                   |   11 +++++
 fs/xfs/xfs_bmap_util.c      |  101 +++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_bmap_util.h      |    2 +
 fs/xfs/xfs_file.c           |   14 ++++--
 include/linux/falloc.h      |    3 +
 include/uapi/linux/falloc.h |   19 ++++++++
 6 files changed, 145 insertions(+), 5 deletions(-)

Comments

Darrick J. Wong Aug. 4, 2017, 7:46 p.m. UTC | #1
On Thu, Aug 03, 2017 at 07:28:17PM -0700, Dan Williams wrote:
> >From falloc.h:
> 
>     FALLOC_FL_SEAL_BLOCK_MAP is used to seal (make immutable) all of the
>     file logical-to-physical extent offset mappings in the file. The
>     purpose is to allow an application to assume that there are no holes
>     or shared extents in the file and that the metadata needed to find
>     all the physical extents of the file is stable and can never be
>     dirtied.
> 
> For now this patch only permits setting the in-memory state of
> S_IOMAP_IMMMUTABLE. Support for clearing and persisting the state is
> saved for later patches.
> 
> The implementation is careful to not allow the immutable state to change
> while any process might have any established mappings. It reuses the
> existing xfs_reflink_unshare() and xfs_alloc_file_space() to unshare
> extents and fill all holes in the file. It then holds XFS_ILOCK_EXCL
> while it validates the file is in the proper state and sets
> S_IOMAP_IMMUTABLE.
> 
> Cc: Jan Kara <jack@suse.cz>
> Cc: Jeff Moyer <jmoyer@redhat.com>
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> Suggested-by: Dave Chinner <david@fromorbit.com>
> Suggested-by: "Darrick J. Wong" <darrick.wong@oracle.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---
>  fs/open.c                   |   11 +++++
>  fs/xfs/xfs_bmap_util.c      |  101 +++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_bmap_util.h      |    2 +
>  fs/xfs/xfs_file.c           |   14 ++++--
>  include/linux/falloc.h      |    3 +
>  include/uapi/linux/falloc.h |   19 ++++++++
>  6 files changed, 145 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/open.c b/fs/open.c
> index 7395860d7164..e3aae59785ae 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -273,6 +273,17 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
>  	    (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE)))
>  		return -EINVAL;
>  
> +	/*
> +	 * Seal block map operation should only be used exclusively, and
> +	 * with the IMMUTABLE capability.
> +	 */
> +	if (mode & FALLOC_FL_SEAL_BLOCK_MAP) {
> +		if (!capable(CAP_LINUX_IMMUTABLE))
> +			return -EPERM;
> +		if (mode & ~FALLOC_FL_SEAL_BLOCK_MAP)
> +			return -EINVAL;
> +	}
> +
>  	if (!(file->f_mode & FMODE_WRITE))
>  		return -EBADF;
>  
> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> index fe0f8f7f4bb7..46d8eb9e19fc 100644
> --- a/fs/xfs/xfs_bmap_util.c
> +++ b/fs/xfs/xfs_bmap_util.c
> @@ -1393,6 +1393,107 @@ xfs_zero_file_space(
>  
>  }
>  
> +/* Return 1 if hole detected, 0 if not, and < 0 if fail to determine */
> +STATIC int
> +xfs_file_has_holes(
> +	struct xfs_inode	*ip)
> +{
> +	struct xfs_mount	*mp = ip->i_mount;
> +	struct xfs_bmbt_irec	*map;
> +	const int		map_size = 10;	/* constrain memory overhead */
> +	int			i, nmaps;
> +	int			error = 0;
> +	xfs_fileoff_t		lblkno = 0;
> +	xfs_filblks_t		maxlblkcnt;
> +
> +	map = kmem_alloc(map_size * sizeof(*map), KM_SLEEP);

Sleeping with an inode fully locked and (eventually) a running
transaction?  Yikes.

Just allocate one xfs_bmbt_irec on the stack and pass in nmaps=1.

This method might fit better in libxfs/xfs_bmap.c where it'll be
able to scan the extent list more quickly with the iext helpers.

> +
> +	maxlblkcnt = XFS_B_TO_FSB(mp, i_size_read(VFS_I(ip)));
> +	do {
> +		nmaps = map_size;
> +		error = xfs_bmapi_read(ip, lblkno, maxlblkcnt - lblkno,
> +				       map, &nmaps, 0);
> +		if (error)
> +			break;
> +
> +		ASSERT(nmaps <= map_size);
> +		for (i = 0; i < nmaps; i++) {
> +			lblkno += map[i].br_blockcount;
> +			if (map[i].br_startblock == HOLESTARTBLOCK) {

I think we also need to check for unwritten extents here, because a
write to an unwritten block requires some zeroing and a mapping metdata
update.

> +				error = 1;
> +				break;
> +			}
> +		}
> +	} while (nmaps > 0 && error == 0);
> +
> +	kmem_free(map);
> +	return error;
> +}
> +
> +int
> +xfs_seal_file_space(
> +	struct xfs_inode	*ip,
> +	xfs_off_t		offset,
> +	xfs_off_t		len)
> +{
> +	struct inode		*inode = VFS_I(ip);
> +	struct address_space	*mapping = inode->i_mapping;
> +	int			error;
> +
> +	ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));

The IOLOCK must be held here too.

> +
> +	if (offset)
> +		return -EINVAL;
> +
> +	error = xfs_reflink_unshare(ip, offset, len);
> +	if (error)
> +		return error;
> +
> +	error = xfs_alloc_file_space(ip, offset, len,
> +			XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO);
> +	if (error)
> +		return error;
> +
> +	xfs_ilock(ip, XFS_ILOCK_EXCL);
> +	/*
> +	 * Either the size changed after we performed allocation /
> +	 * unsharing, or the request was too small to begin with.
> +	 */
> +	error = -EINVAL;
> +	if (len < i_size_read(inode))
> +		goto out_unlock;
> +
> +	/*
> +	 * Allow DAX path to assume that the state of S_IOMAP_IMMUTABLE
> +	 * will never change while any mapping is established.
> +	 */
> +	error = -EBUSY;
> +	if (mapping_mapped(mapping))
> +		goto out_unlock;
> +
> +	/* Did we race someone attempting to share extents? */
> +	if (xfs_is_reflink_inode(ip))
> +		goto out_unlock;
> +
> +	/* Did we race a hole punch? */
> +	error = xfs_file_has_holes(ip);
> +	if (error == 1) {
> +		error = -EBUSY;
> +		goto out_unlock;
> +	}
> +
> +	/* Abort on an error reading the block map */
> +	if (error < 0)
> +		goto out_unlock;
> +
> +	inode->i_flags |= S_IOMAP_IMMUTABLE;
> +
> +out_unlock:
> +	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> +
> +	return error;
> +}
> +
>  /*
>   * @next_fsb will keep track of the extent currently undergoing shift.
>   * @stop_fsb will keep track of the extent at which we have to stop.
> diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
> index 0cede1043571..5115a32a2483 100644
> --- a/fs/xfs/xfs_bmap_util.h
> +++ b/fs/xfs/xfs_bmap_util.h
> @@ -60,6 +60,8 @@ int	xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
>  				xfs_off_t len);
>  int	xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
>  				xfs_off_t len);
> +int	xfs_seal_file_space(struct xfs_inode *, xfs_off_t offset,
> +				xfs_off_t len);
>  
>  /* EOF block manipulation functions */
>  bool	xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index c4893e226fd8..e21121530a90 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -739,7 +739,8 @@ xfs_file_write_iter(
>  #define	XFS_FALLOC_FL_SUPPORTED						\
>  		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
>  		 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |	\
> -		 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
> +		 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE |	\
> +		 FALLOC_FL_SEAL_BLOCK_MAP)
>  
>  STATIC long
>  xfs_file_fallocate(
> @@ -834,9 +835,14 @@ xfs_file_fallocate(
>  				error = xfs_reflink_unshare(ip, offset, len);
>  				if (error)
>  					goto out_unlock;
> -			}
> -			error = xfs_alloc_file_space(ip, offset, len,
> -						     XFS_BMAPI_PREALLOC);
> +
> +				error = xfs_alloc_file_space(ip, offset, len,
> +						XFS_BMAPI_PREALLOC);
> +			} else if (mode & FALLOC_FL_SEAL_BLOCK_MAP) {
> +				error = xfs_seal_file_space(ip, offset, len);
> +			} else
> +				error = xfs_alloc_file_space(ip, offset, len,
> +						XFS_BMAPI_PREALLOC);
>  		}
>  		if (error)
>  			goto out_unlock;
> diff --git a/include/linux/falloc.h b/include/linux/falloc.h
> index 7494dc67c66f..48546c6fbec7 100644
> --- a/include/linux/falloc.h
> +++ b/include/linux/falloc.h
> @@ -26,6 +26,7 @@ struct space_resv {
>  					 FALLOC_FL_COLLAPSE_RANGE |	\
>  					 FALLOC_FL_ZERO_RANGE |		\
>  					 FALLOC_FL_INSERT_RANGE |	\
> -					 FALLOC_FL_UNSHARE_RANGE)
> +					 FALLOC_FL_UNSHARE_RANGE |	\
> +					 FALLOC_FL_SEAL_BLOCK_MAP)
>  
>  #endif /* _FALLOC_H_ */
> diff --git a/include/uapi/linux/falloc.h b/include/uapi/linux/falloc.h
> index b075f601919b..39076975bf6f 100644
> --- a/include/uapi/linux/falloc.h
> +++ b/include/uapi/linux/falloc.h
> @@ -76,4 +76,23 @@
>   */
>  #define FALLOC_FL_UNSHARE_RANGE		0x40
>  
> +/*
> + * FALLOC_FL_SEAL_BLOCK_MAP is used to seal (make immutable) all of the
> + * file logical-to-physical extent offset mappings in the file. The
> + * purpose is to allow an application to assume that there are no holes
> + * or shared extents in the file and that the metadata needed to find
> + * all the physical extents of the file is stable and can never be
> + * dirtied.
> + *
> + * The immutable property is in effect for the entire inode, so the
> + * range for this operation must start at offset 0 and len must be
> + * greater than or equal to the current size of the file. If greater,
> + * this operation allocates, unshares, hole fills, and seals in one

'allocates' is the same as 'hole fills', I think.

This converts unwritten extents to zeroed written extents too, correct?

> + * atomic step. If len is zero then the immutable state is cleared for
> + * the inode.

It's cleared if len == 0?  I thought that was what FL_UNSEAL is for?

> + * This flag implies FALLOC_FL_UNSHARE_RANGE and as such cannot be used
> + * with the punch, zero, collapse, or insert range modes.
> + */
> +#define FALLOC_FL_SEAL_BLOCK_MAP	0x080
>  #endif /* _UAPI_FALLOC_H_ */
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dan Williams Aug. 4, 2017, 7:52 p.m. UTC | #2
On Fri, Aug 4, 2017 at 12:46 PM, Darrick J. Wong
<darrick.wong@oracle.com> wrote:
> On Thu, Aug 03, 2017 at 07:28:17PM -0700, Dan Williams wrote:
>> >From falloc.h:
>>
>>     FALLOC_FL_SEAL_BLOCK_MAP is used to seal (make immutable) all of the
>>     file logical-to-physical extent offset mappings in the file. The
>>     purpose is to allow an application to assume that there are no holes
>>     or shared extents in the file and that the metadata needed to find
>>     all the physical extents of the file is stable and can never be
>>     dirtied.
>>
>> For now this patch only permits setting the in-memory state of
>> S_IOMAP_IMMMUTABLE. Support for clearing and persisting the state is
>> saved for later patches.
>>
>> The implementation is careful to not allow the immutable state to change
>> while any process might have any established mappings. It reuses the
>> existing xfs_reflink_unshare() and xfs_alloc_file_space() to unshare
>> extents and fill all holes in the file. It then holds XFS_ILOCK_EXCL
>> while it validates the file is in the proper state and sets
>> S_IOMAP_IMMUTABLE.
>>
>> Cc: Jan Kara <jack@suse.cz>
>> Cc: Jeff Moyer <jmoyer@redhat.com>
>> Cc: Christoph Hellwig <hch@lst.de>
>> Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
>> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
>> Suggested-by: Dave Chinner <david@fromorbit.com>
>> Suggested-by: "Darrick J. Wong" <darrick.wong@oracle.com>
>> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>> ---
>>  fs/open.c                   |   11 +++++
>>  fs/xfs/xfs_bmap_util.c      |  101 +++++++++++++++++++++++++++++++++++++++++++
>>  fs/xfs/xfs_bmap_util.h      |    2 +
>>  fs/xfs/xfs_file.c           |   14 ++++--
>>  include/linux/falloc.h      |    3 +
>>  include/uapi/linux/falloc.h |   19 ++++++++
>>  6 files changed, 145 insertions(+), 5 deletions(-)
>>
>> diff --git a/fs/open.c b/fs/open.c
>> index 7395860d7164..e3aae59785ae 100644
>> --- a/fs/open.c
>> +++ b/fs/open.c
>> @@ -273,6 +273,17 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
>>           (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE)))
>>               return -EINVAL;
>>
>> +     /*
>> +      * Seal block map operation should only be used exclusively, and
>> +      * with the IMMUTABLE capability.
>> +      */
>> +     if (mode & FALLOC_FL_SEAL_BLOCK_MAP) {
>> +             if (!capable(CAP_LINUX_IMMUTABLE))
>> +                     return -EPERM;
>> +             if (mode & ~FALLOC_FL_SEAL_BLOCK_MAP)
>> +                     return -EINVAL;
>> +     }
>> +
>>       if (!(file->f_mode & FMODE_WRITE))
>>               return -EBADF;
>>
>> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
>> index fe0f8f7f4bb7..46d8eb9e19fc 100644
>> --- a/fs/xfs/xfs_bmap_util.c
>> +++ b/fs/xfs/xfs_bmap_util.c
>> @@ -1393,6 +1393,107 @@ xfs_zero_file_space(
>>
>>  }
>>
>> +/* Return 1 if hole detected, 0 if not, and < 0 if fail to determine */
>> +STATIC int
>> +xfs_file_has_holes(
>> +     struct xfs_inode        *ip)
>> +{
>> +     struct xfs_mount        *mp = ip->i_mount;
>> +     struct xfs_bmbt_irec    *map;
>> +     const int               map_size = 10;  /* constrain memory overhead */
>> +     int                     i, nmaps;
>> +     int                     error = 0;
>> +     xfs_fileoff_t           lblkno = 0;
>> +     xfs_filblks_t           maxlblkcnt;
>> +
>> +     map = kmem_alloc(map_size * sizeof(*map), KM_SLEEP);
>
> Sleeping with an inode fully locked and (eventually) a running
> transaction?  Yikes.
>
> Just allocate one xfs_bmbt_irec on the stack and pass in nmaps=1.
>
> This method might fit better in libxfs/xfs_bmap.c where it'll be
> able to scan the extent list more quickly with the iext helpers.

Ok, I'll take a look.

>
>> +
>> +     maxlblkcnt = XFS_B_TO_FSB(mp, i_size_read(VFS_I(ip)));
>> +     do {
>> +             nmaps = map_size;
>> +             error = xfs_bmapi_read(ip, lblkno, maxlblkcnt - lblkno,
>> +                                    map, &nmaps, 0);
>> +             if (error)
>> +                     break;
>> +
>> +             ASSERT(nmaps <= map_size);
>> +             for (i = 0; i < nmaps; i++) {
>> +                     lblkno += map[i].br_blockcount;
>> +                     if (map[i].br_startblock == HOLESTARTBLOCK) {
>
> I think we also need to check for unwritten extents here, because a
> write to an unwritten block requires some zeroing and a mapping metdata
> update.

Will do.

>
>> +                             error = 1;
>> +                             break;
>> +                     }
>> +             }
>> +     } while (nmaps > 0 && error == 0);
>> +
>> +     kmem_free(map);
>> +     return error;
>> +}
>> +
>> +int
>> +xfs_seal_file_space(
>> +     struct xfs_inode        *ip,
>> +     xfs_off_t               offset,
>> +     xfs_off_t               len)
>> +{
>> +     struct inode            *inode = VFS_I(ip);
>> +     struct address_space    *mapping = inode->i_mapping;
>> +     int                     error;
>> +
>> +     ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
>
> The IOLOCK must be held here too.

Ok, I can add that. I had this here for the mapping_mapped() check.

>
>> +
>> +     if (offset)
>> +             return -EINVAL;
>> +
>> +     error = xfs_reflink_unshare(ip, offset, len);
>> +     if (error)
>> +             return error;
>> +
>> +     error = xfs_alloc_file_space(ip, offset, len,
>> +                     XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO);
>> +     if (error)
>> +             return error;
>> +
>> +     xfs_ilock(ip, XFS_ILOCK_EXCL);
>> +     /*
>> +      * Either the size changed after we performed allocation /
>> +      * unsharing, or the request was too small to begin with.
>> +      */
>> +     error = -EINVAL;
>> +     if (len < i_size_read(inode))
>> +             goto out_unlock;
>> +
>> +     /*
>> +      * Allow DAX path to assume that the state of S_IOMAP_IMMUTABLE
>> +      * will never change while any mapping is established.
>> +      */
>> +     error = -EBUSY;
>> +     if (mapping_mapped(mapping))
>> +             goto out_unlock;
>> +
>> +     /* Did we race someone attempting to share extents? */
>> +     if (xfs_is_reflink_inode(ip))
>> +             goto out_unlock;
>> +
>> +     /* Did we race a hole punch? */
>> +     error = xfs_file_has_holes(ip);
>> +     if (error == 1) {
>> +             error = -EBUSY;
>> +             goto out_unlock;
>> +     }
>> +
>> +     /* Abort on an error reading the block map */
>> +     if (error < 0)
>> +             goto out_unlock;
>> +
>> +     inode->i_flags |= S_IOMAP_IMMUTABLE;
>> +
>> +out_unlock:
>> +     xfs_iunlock(ip, XFS_ILOCK_EXCL);
>> +
>> +     return error;
>> +}
>> +
>>  /*
>>   * @next_fsb will keep track of the extent currently undergoing shift.
>>   * @stop_fsb will keep track of the extent at which we have to stop.
>> diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
>> index 0cede1043571..5115a32a2483 100644
>> --- a/fs/xfs/xfs_bmap_util.h
>> +++ b/fs/xfs/xfs_bmap_util.h
>> @@ -60,6 +60,8 @@ int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
>>                               xfs_off_t len);
>>  int  xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
>>                               xfs_off_t len);
>> +int  xfs_seal_file_space(struct xfs_inode *, xfs_off_t offset,
>> +                             xfs_off_t len);
>>
>>  /* EOF block manipulation functions */
>>  bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
>> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
>> index c4893e226fd8..e21121530a90 100644
>> --- a/fs/xfs/xfs_file.c
>> +++ b/fs/xfs/xfs_file.c
>> @@ -739,7 +739,8 @@ xfs_file_write_iter(
>>  #define      XFS_FALLOC_FL_SUPPORTED                                         \
>>               (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |           \
>>                FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |      \
>> -              FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
>> +              FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE |     \
>> +              FALLOC_FL_SEAL_BLOCK_MAP)
>>
>>  STATIC long
>>  xfs_file_fallocate(
>> @@ -834,9 +835,14 @@ xfs_file_fallocate(
>>                               error = xfs_reflink_unshare(ip, offset, len);
>>                               if (error)
>>                                       goto out_unlock;
>> -                     }
>> -                     error = xfs_alloc_file_space(ip, offset, len,
>> -                                                  XFS_BMAPI_PREALLOC);
>> +
>> +                             error = xfs_alloc_file_space(ip, offset, len,
>> +                                             XFS_BMAPI_PREALLOC);
>> +                     } else if (mode & FALLOC_FL_SEAL_BLOCK_MAP) {
>> +                             error = xfs_seal_file_space(ip, offset, len);
>> +                     } else
>> +                             error = xfs_alloc_file_space(ip, offset, len,
>> +                                             XFS_BMAPI_PREALLOC);
>>               }
>>               if (error)
>>                       goto out_unlock;
>> diff --git a/include/linux/falloc.h b/include/linux/falloc.h
>> index 7494dc67c66f..48546c6fbec7 100644
>> --- a/include/linux/falloc.h
>> +++ b/include/linux/falloc.h
>> @@ -26,6 +26,7 @@ struct space_resv {
>>                                        FALLOC_FL_COLLAPSE_RANGE |     \
>>                                        FALLOC_FL_ZERO_RANGE |         \
>>                                        FALLOC_FL_INSERT_RANGE |       \
>> -                                      FALLOC_FL_UNSHARE_RANGE)
>> +                                      FALLOC_FL_UNSHARE_RANGE |      \
>> +                                      FALLOC_FL_SEAL_BLOCK_MAP)
>>
>>  #endif /* _FALLOC_H_ */
>> diff --git a/include/uapi/linux/falloc.h b/include/uapi/linux/falloc.h
>> index b075f601919b..39076975bf6f 100644
>> --- a/include/uapi/linux/falloc.h
>> +++ b/include/uapi/linux/falloc.h
>> @@ -76,4 +76,23 @@
>>   */
>>  #define FALLOC_FL_UNSHARE_RANGE              0x40
>>
>> +/*
>> + * FALLOC_FL_SEAL_BLOCK_MAP is used to seal (make immutable) all of the
>> + * file logical-to-physical extent offset mappings in the file. The
>> + * purpose is to allow an application to assume that there are no holes
>> + * or shared extents in the file and that the metadata needed to find
>> + * all the physical extents of the file is stable and can never be
>> + * dirtied.
>> + *
>> + * The immutable property is in effect for the entire inode, so the
>> + * range for this operation must start at offset 0 and len must be
>> + * greater than or equal to the current size of the file. If greater,
>> + * this operation allocates, unshares, hole fills, and seals in one
>
> 'allocates' is the same as 'hole fills', I think.
>
> This converts unwritten extents to zeroed written extents too, correct?

Yes, I'll add that and also include that in the man page patch that
I'm working on...

>
>> + * atomic step. If len is zero then the immutable state is cleared for
>> + * the inode.
>
> It's cleared if len == 0?  I thought that was what FL_UNSEAL is for?

Whoops, stale holdover from v1.

>> + * This flag implies FALLOC_FL_UNSHARE_RANGE and as such cannot be used
>> + * with the punch, zero, collapse, or insert range modes.
>> + */
>> +#define FALLOC_FL_SEAL_BLOCK_MAP     0x080
>>  #endif /* _UAPI_FALLOC_H_ */

Thanks Darrick!
Dave Chinner Aug. 4, 2017, 11:31 p.m. UTC | #3
On Thu, Aug 03, 2017 at 07:28:17PM -0700, Dan Williams wrote:
> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> index fe0f8f7f4bb7..46d8eb9e19fc 100644
> --- a/fs/xfs/xfs_bmap_util.c
> +++ b/fs/xfs/xfs_bmap_util.c
> @@ -1393,6 +1393,107 @@ xfs_zero_file_space(
>  
>  }
>  
> +/* Return 1 if hole detected, 0 if not, and < 0 if fail to determine */
> +STATIC int
> +xfs_file_has_holes(
> +	struct xfs_inode	*ip)
> +{

Why do we need this function?

We've just run xfs_alloc_file_space() across the entire range we
are sealing, so we've already guaranteed that it won't have holes
in it.

Cheers,

Dave.
Dan Williams Aug. 4, 2017, 11:43 p.m. UTC | #4
On Fri, Aug 4, 2017 at 4:31 PM, Dave Chinner <david@fromorbit.com> wrote:
> On Thu, Aug 03, 2017 at 07:28:17PM -0700, Dan Williams wrote:
>> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
>> index fe0f8f7f4bb7..46d8eb9e19fc 100644
>> --- a/fs/xfs/xfs_bmap_util.c
>> +++ b/fs/xfs/xfs_bmap_util.c
>> @@ -1393,6 +1393,107 @@ xfs_zero_file_space(
>>
>>  }
>>
>> +/* Return 1 if hole detected, 0 if not, and < 0 if fail to determine */
>> +STATIC int
>> +xfs_file_has_holes(
>> +     struct xfs_inode        *ip)
>> +{
>
> Why do we need this function?
>
> We've just run xfs_alloc_file_space() across the entire range we
> are sealing, so we've already guaranteed that it won't have holes
> in it.

I'm sure this is due to my ignorance of the scope of XFS_IOLOCK_EXCL
vs XFS_ILOCK_EXCL. I had assumed that since we drop and retake
XFS_ILOCK_EXCL that we need to re-validate the block map before
setting S_IOMAP_IMMUTABLE.
Dave Chinner Aug. 5, 2017, 12:04 a.m. UTC | #5
On Fri, Aug 04, 2017 at 04:43:50PM -0700, Dan Williams wrote:
> On Fri, Aug 4, 2017 at 4:31 PM, Dave Chinner <david@fromorbit.com> wrote:
> > On Thu, Aug 03, 2017 at 07:28:17PM -0700, Dan Williams wrote:
> >> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> >> index fe0f8f7f4bb7..46d8eb9e19fc 100644
> >> --- a/fs/xfs/xfs_bmap_util.c
> >> +++ b/fs/xfs/xfs_bmap_util.c
> >> @@ -1393,6 +1393,107 @@ xfs_zero_file_space(
> >>
> >>  }
> >>
> >> +/* Return 1 if hole detected, 0 if not, and < 0 if fail to determine */
> >> +STATIC int
> >> +xfs_file_has_holes(
> >> +     struct xfs_inode        *ip)
> >> +{
> >
> > Why do we need this function?
> >
> > We've just run xfs_alloc_file_space() across the entire range we
> > are sealing, so we've already guaranteed that it won't have holes
> > in it.
> 
> I'm sure this is due to my ignorance of the scope of XFS_IOLOCK_EXCL
> vs XFS_ILOCK_EXCL. I had assumed that since we drop and retake
> XFS_ILOCK_EXCL that we need to re-validate the block map before
> setting S_IOMAP_IMMUTABLE.

THe ILOCK is there to protect the inode metadata when there is
concurrent access through the IO/MMAP lock paths.  However, if we
hold the IOLOCK_EXCL and the MMAPLOCK_EXCL, then nothing can get
through the IO interfaces to modify the data in the file.  This is
required because APIs that directly modify the extent map (e.g.
fallocate, truncate, etc) have to lock out the IO path to ensure
there are no IOs in flight across the range we are manipulating.

Holding these locks also locks out other APIs that modify the extent
map and so effectively nothing else can be accessing or modifying
the extent map while a fallocate or truncate operation is in
progress.

Cheers,

Dave.
diff mbox

Patch

diff --git a/fs/open.c b/fs/open.c
index 7395860d7164..e3aae59785ae 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -273,6 +273,17 @@  int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	    (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE)))
 		return -EINVAL;
 
+	/*
+	 * Seal block map operation should only be used exclusively, and
+	 * with the IMMUTABLE capability.
+	 */
+	if (mode & FALLOC_FL_SEAL_BLOCK_MAP) {
+		if (!capable(CAP_LINUX_IMMUTABLE))
+			return -EPERM;
+		if (mode & ~FALLOC_FL_SEAL_BLOCK_MAP)
+			return -EINVAL;
+	}
+
 	if (!(file->f_mode & FMODE_WRITE))
 		return -EBADF;
 
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index fe0f8f7f4bb7..46d8eb9e19fc 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1393,6 +1393,107 @@  xfs_zero_file_space(
 
 }
 
+/* Return 1 if hole detected, 0 if not, and < 0 if fail to determine */
+STATIC int
+xfs_file_has_holes(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_bmbt_irec	*map;
+	const int		map_size = 10;	/* constrain memory overhead */
+	int			i, nmaps;
+	int			error = 0;
+	xfs_fileoff_t		lblkno = 0;
+	xfs_filblks_t		maxlblkcnt;
+
+	map = kmem_alloc(map_size * sizeof(*map), KM_SLEEP);
+
+	maxlblkcnt = XFS_B_TO_FSB(mp, i_size_read(VFS_I(ip)));
+	do {
+		nmaps = map_size;
+		error = xfs_bmapi_read(ip, lblkno, maxlblkcnt - lblkno,
+				       map, &nmaps, 0);
+		if (error)
+			break;
+
+		ASSERT(nmaps <= map_size);
+		for (i = 0; i < nmaps; i++) {
+			lblkno += map[i].br_blockcount;
+			if (map[i].br_startblock == HOLESTARTBLOCK) {
+				error = 1;
+				break;
+			}
+		}
+	} while (nmaps > 0 && error == 0);
+
+	kmem_free(map);
+	return error;
+}
+
+int
+xfs_seal_file_space(
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	xfs_off_t		len)
+{
+	struct inode		*inode = VFS_I(ip);
+	struct address_space	*mapping = inode->i_mapping;
+	int			error;
+
+	ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
+
+	if (offset)
+		return -EINVAL;
+
+	error = xfs_reflink_unshare(ip, offset, len);
+	if (error)
+		return error;
+
+	error = xfs_alloc_file_space(ip, offset, len,
+			XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO);
+	if (error)
+		return error;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	/*
+	 * Either the size changed after we performed allocation /
+	 * unsharing, or the request was too small to begin with.
+	 */
+	error = -EINVAL;
+	if (len < i_size_read(inode))
+		goto out_unlock;
+
+	/*
+	 * Allow DAX path to assume that the state of S_IOMAP_IMMUTABLE
+	 * will never change while any mapping is established.
+	 */
+	error = -EBUSY;
+	if (mapping_mapped(mapping))
+		goto out_unlock;
+
+	/* Did we race someone attempting to share extents? */
+	if (xfs_is_reflink_inode(ip))
+		goto out_unlock;
+
+	/* Did we race a hole punch? */
+	error = xfs_file_has_holes(ip);
+	if (error == 1) {
+		error = -EBUSY;
+		goto out_unlock;
+	}
+
+	/* Abort on an error reading the block map */
+	if (error < 0)
+		goto out_unlock;
+
+	inode->i_flags |= S_IOMAP_IMMUTABLE;
+
+out_unlock:
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	return error;
+}
+
 /*
  * @next_fsb will keep track of the extent currently undergoing shift.
  * @stop_fsb will keep track of the extent at which we have to stop.
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 0cede1043571..5115a32a2483 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -60,6 +60,8 @@  int	xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
 				xfs_off_t len);
 int	xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
 				xfs_off_t len);
+int	xfs_seal_file_space(struct xfs_inode *, xfs_off_t offset,
+				xfs_off_t len);
 
 /* EOF block manipulation functions */
 bool	xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index c4893e226fd8..e21121530a90 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -739,7 +739,8 @@  xfs_file_write_iter(
 #define	XFS_FALLOC_FL_SUPPORTED						\
 		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
 		 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |	\
-		 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
+		 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE |	\
+		 FALLOC_FL_SEAL_BLOCK_MAP)
 
 STATIC long
 xfs_file_fallocate(
@@ -834,9 +835,14 @@  xfs_file_fallocate(
 				error = xfs_reflink_unshare(ip, offset, len);
 				if (error)
 					goto out_unlock;
-			}
-			error = xfs_alloc_file_space(ip, offset, len,
-						     XFS_BMAPI_PREALLOC);
+
+				error = xfs_alloc_file_space(ip, offset, len,
+						XFS_BMAPI_PREALLOC);
+			} else if (mode & FALLOC_FL_SEAL_BLOCK_MAP) {
+				error = xfs_seal_file_space(ip, offset, len);
+			} else
+				error = xfs_alloc_file_space(ip, offset, len,
+						XFS_BMAPI_PREALLOC);
 		}
 		if (error)
 			goto out_unlock;
diff --git a/include/linux/falloc.h b/include/linux/falloc.h
index 7494dc67c66f..48546c6fbec7 100644
--- a/include/linux/falloc.h
+++ b/include/linux/falloc.h
@@ -26,6 +26,7 @@  struct space_resv {
 					 FALLOC_FL_COLLAPSE_RANGE |	\
 					 FALLOC_FL_ZERO_RANGE |		\
 					 FALLOC_FL_INSERT_RANGE |	\
-					 FALLOC_FL_UNSHARE_RANGE)
+					 FALLOC_FL_UNSHARE_RANGE |	\
+					 FALLOC_FL_SEAL_BLOCK_MAP)
 
 #endif /* _FALLOC_H_ */
diff --git a/include/uapi/linux/falloc.h b/include/uapi/linux/falloc.h
index b075f601919b..39076975bf6f 100644
--- a/include/uapi/linux/falloc.h
+++ b/include/uapi/linux/falloc.h
@@ -76,4 +76,23 @@ 
  */
 #define FALLOC_FL_UNSHARE_RANGE		0x40
 
+/*
+ * FALLOC_FL_SEAL_BLOCK_MAP is used to seal (make immutable) all of the
+ * file logical-to-physical extent offset mappings in the file. The
+ * purpose is to allow an application to assume that there are no holes
+ * or shared extents in the file and that the metadata needed to find
+ * all the physical extents of the file is stable and can never be
+ * dirtied.
+ *
+ * The immutable property is in effect for the entire inode, so the
+ * range for this operation must start at offset 0 and len must be
+ * greater than or equal to the current size of the file. If greater,
+ * this operation allocates, unshares, hole fills, and seals in one
+ * atomic step. If len is zero then the immutable state is cleared for
+ * the inode.
+ *
+ * This flag implies FALLOC_FL_UNSHARE_RANGE and as such cannot be used
+ * with the punch, zero, collapse, or insert range modes.
+ */
+#define FALLOC_FL_SEAL_BLOCK_MAP	0x080
 #endif /* _UAPI_FALLOC_H_ */