diff mbox series

[2/5] iomap: use accelerated zeroing on a block device to zero a file range

Message ID 163192865577.417973.11122330974455662098.stgit@magnolia (mailing list archive)
State New, archived
Headers show
Series vfs: enable userspace to reset damaged file storage | expand

Commit Message

Darrick J. Wong Sept. 18, 2021, 1:30 a.m. UTC
From: Darrick J. Wong <djwong@kernel.org>

Create a function that ensures that the storage backing part of a file
contains zeroes and will not trip over old media errors if the contents
are re-read.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/iomap/direct-io.c  |   75 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/iomap.h |    3 ++
 2 files changed, 78 insertions(+)

Comments

Ritesh Harjani Sept. 18, 2021, 4:55 p.m. UTC | #1
On 21/09/17 06:30PM, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
>
> Create a function that ensures that the storage backing part of a file
> contains zeroes and will not trip over old media errors if the contents
> are re-read.
>
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> ---
>  fs/iomap/direct-io.c  |   75 +++++++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/iomap.h |    3 ++
>  2 files changed, 78 insertions(+)
>
>
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index 4ecd255e0511..48826a49f976 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -652,3 +652,78 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  	return iomap_dio_complete(dio);
>  }
>  EXPORT_SYMBOL_GPL(iomap_dio_rw);
> +
> +static loff_t
> +iomap_zeroinit_iter(struct iomap_iter *iter)
> +{
> +	struct iomap *iomap = &iter->iomap;
> +	const struct iomap *srcmap = iomap_iter_srcmap(iter);
> +	const u64 start = iomap->addr + iter->pos - iomap->offset;
> +	const u64 nr_bytes = iomap_length(iter);
> +	sector_t sector = start >> SECTOR_SHIFT;
> +	sector_t nr_sectors = nr_bytes >> SECTOR_SHIFT;
> +	int ret;
> +
> +	if (!iomap->bdev)
> +		return -ECANCELED;
> +
> +	/* The physical extent must be sector-aligned for block layer APIs. */
> +	if ((start | nr_bytes) & (SECTOR_SIZE - 1))
> +		return -EINVAL;
> +
> +	/* Must be able to zero storage directly without fs intervention. */
> +	if (iomap->flags & IOMAP_F_SHARED)
> +		return -ECANCELED;
> +	if (srcmap != iomap)
> +		return -ECANCELED;
> +
> +	switch (iomap->type) {
> +	case IOMAP_MAPPED:
> +		ret = blkdev_issue_zeroout(iomap->bdev, sector, nr_sectors,
> +				GFP_KERNEL, 0);
> +		if (ret)
> +			return ret;
> +		fallthrough;
> +	case IOMAP_UNWRITTEN:
> +		return nr_bytes;
> +	}
> +
> +	/* Reject holes, inline data, or delalloc extents. */
> +	return -ECANCELED;

Same comment here as in patch-1 which implements dax_zeroinit_iter().

-ritesh

> +}
> +
> +/*
> + * Use a storage device's accelerated zero-writing command to ensure the media
> + * are ready to accept read and write commands.  FSDAX is not supported.
> + *
> + * The range arguments must be aligned to sector size.  The file must be backed
> + * by a block device.  The extents returned must not require copy on write (or
> + * any other mapping interventions from the filesystem) and must be contiguous.
> + * @done will be set to true if the reset succeeded.
> + *
> + * Returns 0 if the zero initialization succeeded, -ECANCELED if the storage
> + * mappings do not support zero initialization, -EOPNOTSUPP if the device does
> + * not support it, or the usual negative errno.
> + */
> +int
> +iomap_zeroout_range(struct inode *inode, loff_t pos, u64 len,
> +		   const struct iomap_ops *ops)
> +{
> +	struct iomap_iter iter = {
> +		.inode		= inode,
> +		.pos		= pos,
> +		.len		= len,
> +		.flags		= IOMAP_REPORT,
> +	};
> +	int ret;
> +
> +	if (IS_DAX(inode))
> +		return -EINVAL;
> +	if (pos + len > i_size_read(inode))
> +		return -EINVAL;
> +
> +	while ((ret = iomap_iter(&iter, ops)) > 0)
> +		iter.processed = iomap_zeroinit_iter(&iter);
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(iomap_zeroout_range);
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 24f8489583ca..f4b9c6698388 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -339,6 +339,9 @@ struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  ssize_t iomap_dio_complete(struct iomap_dio *dio);
>  int iomap_dio_iopoll(struct kiocb *kiocb, bool spin);
>
> +int iomap_zeroout_range(struct inode *inode, loff_t pos, u64 len,
> +		const struct iomap_ops *ops);
> +
>  #ifdef CONFIG_SWAP
>  struct file;
>  struct swap_info_struct;
>
Christoph Hellwig Sept. 21, 2021, 8:29 a.m. UTC | #2
On Fri, Sep 17, 2021 at 06:30:55PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
> 
> Create a function that ensures that the storage backing part of a file
> contains zeroes and will not trip over old media errors if the contents
> are re-read.

I don't think this has anything to do with direct I/O, so I'd rather
not have it clutter direct-io.c.  Also do we really want to wait
synchronously for every bio instead of batching them up?  Especially
as a simple bio_chain is probably all that is needed.
Dave Chinner Sept. 21, 2021, 10:33 p.m. UTC | #3
On Fri, Sep 17, 2021 at 06:30:55PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
> 
> Create a function that ensures that the storage backing part of a file
> contains zeroes and will not trip over old media errors if the contents
> are re-read.
> 
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> ---
>  fs/iomap/direct-io.c  |   75 +++++++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/iomap.h |    3 ++
>  2 files changed, 78 insertions(+)
> 
> 
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index 4ecd255e0511..48826a49f976 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -652,3 +652,78 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  	return iomap_dio_complete(dio);
>  }
>  EXPORT_SYMBOL_GPL(iomap_dio_rw);
> +
> +static loff_t
> +iomap_zeroinit_iter(struct iomap_iter *iter)
> +{
> +	struct iomap *iomap = &iter->iomap;
> +	const struct iomap *srcmap = iomap_iter_srcmap(iter);
> +	const u64 start = iomap->addr + iter->pos - iomap->offset;
> +	const u64 nr_bytes = iomap_length(iter);
> +	sector_t sector = start >> SECTOR_SHIFT;
> +	sector_t nr_sectors = nr_bytes >> SECTOR_SHIFT;
> +	int ret;
> +
> +	if (!iomap->bdev)
> +		return -ECANCELED;
> +
> +	/* The physical extent must be sector-aligned for block layer APIs. */
> +	if ((start | nr_bytes) & (SECTOR_SIZE - 1))
> +		return -EINVAL;
> +
> +	/* Must be able to zero storage directly without fs intervention. */
> +	if (iomap->flags & IOMAP_F_SHARED)
> +		return -ECANCELED;
> +	if (srcmap != iomap)
> +		return -ECANCELED;
> +
> +	switch (iomap->type) {
> +	case IOMAP_MAPPED:
> +		ret = blkdev_issue_zeroout(iomap->bdev, sector, nr_sectors,
> +				GFP_KERNEL, 0);

Pretty sure this needs to use BLKDEV_ZERO_NOUNMAP.

The whole point of this is having zeroed space allocated ready for
write on return, so having the hardware optimise away the physical
storage zeroing by punching a hole in it's backing store and then
potentially getting ENOSPC on the next write to this range would be
.... suboptimal.

Cheers,

Dave.
Darrick J. Wong Sept. 22, 2021, 6:53 p.m. UTC | #4
On Tue, Sep 21, 2021 at 09:29:25AM +0100, Christoph Hellwig wrote:
> On Fri, Sep 17, 2021 at 06:30:55PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <djwong@kernel.org>
> > 
> > Create a function that ensures that the storage backing part of a file
> > contains zeroes and will not trip over old media errors if the contents
> > are re-read.
> 
> I don't think this has anything to do with direct I/O, so I'd rather
> not have it clutter direct-io.c.  Also do we really want to wait
> synchronously for every bio instead of batching them up?  Especially
> as a simple bio_chain is probably all that is needed.

__blkdev_issue_zeroout looks appropriate for chaining.  I'll move the
zeroout routine into a new lowlevel.c file, since this isn't buffered io
either.

--D
Darrick J. Wong Sept. 22, 2021, 6:54 p.m. UTC | #5
On Wed, Sep 22, 2021 at 08:33:43AM +1000, Dave Chinner wrote:
> On Fri, Sep 17, 2021 at 06:30:55PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <djwong@kernel.org>
> > 
> > Create a function that ensures that the storage backing part of a file
> > contains zeroes and will not trip over old media errors if the contents
> > are re-read.
> > 
> > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > ---
> >  fs/iomap/direct-io.c  |   75 +++++++++++++++++++++++++++++++++++++++++++++++++
> >  include/linux/iomap.h |    3 ++
> >  2 files changed, 78 insertions(+)
> > 
> > 
> > diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> > index 4ecd255e0511..48826a49f976 100644
> > --- a/fs/iomap/direct-io.c
> > +++ b/fs/iomap/direct-io.c
> > @@ -652,3 +652,78 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
> >  	return iomap_dio_complete(dio);
> >  }
> >  EXPORT_SYMBOL_GPL(iomap_dio_rw);
> > +
> > +static loff_t
> > +iomap_zeroinit_iter(struct iomap_iter *iter)
> > +{
> > +	struct iomap *iomap = &iter->iomap;
> > +	const struct iomap *srcmap = iomap_iter_srcmap(iter);
> > +	const u64 start = iomap->addr + iter->pos - iomap->offset;
> > +	const u64 nr_bytes = iomap_length(iter);
> > +	sector_t sector = start >> SECTOR_SHIFT;
> > +	sector_t nr_sectors = nr_bytes >> SECTOR_SHIFT;
> > +	int ret;
> > +
> > +	if (!iomap->bdev)
> > +		return -ECANCELED;
> > +
> > +	/* The physical extent must be sector-aligned for block layer APIs. */
> > +	if ((start | nr_bytes) & (SECTOR_SIZE - 1))
> > +		return -EINVAL;
> > +
> > +	/* Must be able to zero storage directly without fs intervention. */
> > +	if (iomap->flags & IOMAP_F_SHARED)
> > +		return -ECANCELED;
> > +	if (srcmap != iomap)
> > +		return -ECANCELED;
> > +
> > +	switch (iomap->type) {
> > +	case IOMAP_MAPPED:
> > +		ret = blkdev_issue_zeroout(iomap->bdev, sector, nr_sectors,
> > +				GFP_KERNEL, 0);
> 
> Pretty sure this needs to use BLKDEV_ZERO_NOUNMAP.

Oops, good catch!

--D

> The whole point of this is having zeroed space allocated ready for
> write on return, so having the hardware optimise away the physical
> storage zeroing by punching a hole in it's backing store and then
> potentially getting ENOSPC on the next write to this range would be
> .... suboptimal.
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
diff mbox series

Patch

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 4ecd255e0511..48826a49f976 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -652,3 +652,78 @@  iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	return iomap_dio_complete(dio);
 }
 EXPORT_SYMBOL_GPL(iomap_dio_rw);
+
+static loff_t
+iomap_zeroinit_iter(struct iomap_iter *iter)
+{
+	struct iomap *iomap = &iter->iomap;
+	const struct iomap *srcmap = iomap_iter_srcmap(iter);
+	const u64 start = iomap->addr + iter->pos - iomap->offset;
+	const u64 nr_bytes = iomap_length(iter);
+	sector_t sector = start >> SECTOR_SHIFT;
+	sector_t nr_sectors = nr_bytes >> SECTOR_SHIFT;
+	int ret;
+
+	if (!iomap->bdev)
+		return -ECANCELED;
+
+	/* The physical extent must be sector-aligned for block layer APIs. */
+	if ((start | nr_bytes) & (SECTOR_SIZE - 1))
+		return -EINVAL;
+
+	/* Must be able to zero storage directly without fs intervention. */
+	if (iomap->flags & IOMAP_F_SHARED)
+		return -ECANCELED;
+	if (srcmap != iomap)
+		return -ECANCELED;
+
+	switch (iomap->type) {
+	case IOMAP_MAPPED:
+		ret = blkdev_issue_zeroout(iomap->bdev, sector, nr_sectors,
+				GFP_KERNEL, 0);
+		if (ret)
+			return ret;
+		fallthrough;
+	case IOMAP_UNWRITTEN:
+		return nr_bytes;
+	}
+
+	/* Reject holes, inline data, or delalloc extents. */
+	return -ECANCELED;
+}
+
+/*
+ * Use a storage device's accelerated zero-writing command to ensure the media
+ * are ready to accept read and write commands.  FSDAX is not supported.
+ *
+ * The range arguments must be aligned to sector size.  The file must be backed
+ * by a block device.  The extents returned must not require copy on write (or
+ * any other mapping interventions from the filesystem) and must be contiguous.
+ * @done will be set to true if the reset succeeded.
+ *
+ * Returns 0 if the zero initialization succeeded, -ECANCELED if the storage
+ * mappings do not support zero initialization, -EOPNOTSUPP if the device does
+ * not support it, or the usual negative errno.
+ */
+int
+iomap_zeroout_range(struct inode *inode, loff_t pos, u64 len,
+		   const struct iomap_ops *ops)
+{
+	struct iomap_iter iter = {
+		.inode		= inode,
+		.pos		= pos,
+		.len		= len,
+		.flags		= IOMAP_REPORT,
+	};
+	int ret;
+
+	if (IS_DAX(inode))
+		return -EINVAL;
+	if (pos + len > i_size_read(inode))
+		return -EINVAL;
+
+	while ((ret = iomap_iter(&iter, ops)) > 0)
+		iter.processed = iomap_zeroinit_iter(&iter);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iomap_zeroout_range);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 24f8489583ca..f4b9c6698388 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -339,6 +339,9 @@  struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 ssize_t iomap_dio_complete(struct iomap_dio *dio);
 int iomap_dio_iopoll(struct kiocb *kiocb, bool spin);
 
+int iomap_zeroout_range(struct inode *inode, loff_t pos, u64 len,
+		const struct iomap_ops *ops);
+
 #ifdef CONFIG_SWAP
 struct file;
 struct swap_info_struct;