diff mbox

[v7,2/2] Return bytes transferred for partial direct I/O

Message ID 20180208185948.26827-2-rgoldwyn@suse.de (mailing list archive)
State New, archived
Headers show

Commit Message

Goldwyn Rodrigues Feb. 8, 2018, 6:59 p.m. UTC
From: Goldwyn Rodrigues <rgoldwyn@suse.com>

In case direct I/O encounters an error midway, it returns the error.
Instead it should be returning the number of bytes transferred so far.

Test case for filesystems (with ENOSPC):
1. Create an almost full filesystem
2. Create a file, say /mnt/lastfile, until the filesystem is full.
3. Direct write() with count > sizeof /mnt/lastfile.

Result: write() returns -ENOSPC. However, file content has data written
in step 3.

Added a sysctl entry: dio_short_writes which is on by default. This is
to support applications which expect either an error or the bytes submitted
as a return value for the write calls.

This fixes fstest generic/472.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 Documentation/sysctl/fs.txt | 14 ++++++++++++++
 fs/block_dev.c              |  2 +-
 fs/direct-io.c              |  7 +++++--
 fs/iomap.c                  | 23 ++++++++++++-----------
 include/linux/fs.h          |  1 +
 kernel/sysctl.c             |  9 +++++++++
 6 files changed, 42 insertions(+), 14 deletions(-)

Changes since v1:
 - incorporated iomap and block devices

Changes since v2:
 - realized that file size was not increasing when performing a (partial)
   direct I/O because end_io function was receiving the error instead of
   size. Fixed.

Changes since v3:
 - [hch] initialize transferred with dio->size and use transferred instead
   of dio->size.

Changes since v4:
 - Refreshed to v4.14

Changes since v5:
 - Added /proc/sys/fs/dio_short_writes (default 1) to guard older applications
   which expect write(fd, buf, count) returns either count or error.

Changes since v6:
 - Corrected documentation
 - Re-ordered patch

Comments

Darrick J. Wong March 8, 2018, 12:53 a.m. UTC | #1
On Thu, Feb 08, 2018 at 12:59:48PM -0600, Goldwyn Rodrigues wrote:
> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
> 
> In case direct I/O encounters an error midway, it returns the error.
> Instead it should be returning the number of bytes transferred so far.
> 
> Test case for filesystems (with ENOSPC):
> 1. Create an almost full filesystem
> 2. Create a file, say /mnt/lastfile, until the filesystem is full.
> 3. Direct write() with count > sizeof /mnt/lastfile.
> 
> Result: write() returns -ENOSPC. However, file content has data written
> in step 3.
> 
> Added a sysctl entry: dio_short_writes which is on by default. This is
> to support applications which expect either and error or the bytes submitted
> as a return value for the write calls.
> 
> This fixes fstest generic/472.
> 
> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
> ---
>  Documentation/sysctl/fs.txt | 14 ++++++++++++++
>  fs/block_dev.c              |  2 +-
>  fs/direct-io.c              |  7 +++++--
>  fs/iomap.c                  | 23 ++++++++++++-----------
>  include/linux/fs.h          |  1 +
>  kernel/sysctl.c             |  9 +++++++++
>  6 files changed, 42 insertions(+), 14 deletions(-)
> 
> Changes since v1:
>  - incorporated iomap and block devices
> 
> Changes since v2:
>  - realized that file size was not increasing when performing a (partial)
>    direct I/O because end_io function was receiving the error instead of
>    size. Fixed.
> 
> Changes since v3:
>  - [hch] initialize transferred with dio->size and use transferred instead
>    of dio->size.
> 
> Changes since v4:
>  - Refreshed to v4.14
> 
> Changes since v5:
>  - Added /proc/sys/fs/dio_short_writes (default 1) to guard older applications
>    which expect write(fd, buf, count) returns either count or error.
> 
> Changes since v6:
>  - Corrected documentation
>  - Re-ordered patch
> 
> diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt
> index 6c00c1e2743f..21582f675985 100644
> --- a/Documentation/sysctl/fs.txt
> +++ b/Documentation/sysctl/fs.txt
> @@ -22,6 +22,7 @@ Currently, these files are in /proc/sys/fs:
>  - aio-max-nr
>  - aio-nr
>  - dentry-state
> +- dio_short_writes
>  - dquot-max
>  - dquot-nr
>  - file-max
> @@ -76,6 +77,19 @@ dcache isn't pruned yet.
>  
>  ==============================================================
>  
> +dio_short_writes:
> +
> +In case Direct I/O encounters a transient error, it returns
> +the error code, even if it has performed part of the write.
> +This flag, if on (default), will return the number of bytes written
> +so far, as the write(2) semantics are. However, some older applications
> +still consider a direct write as an error if all of the I/O
> +submitted is not complete. I.e. write(file, count, buf) != count.
> +This option can be disabled on systems in order to support
> +existing applications which do not expect short writes.
> +
> +==============================================================
> +
>  dquot-max & dquot-nr:
>  
>  The file dquot-max shows the maximum number of cached disk
> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index 4a181fcb5175..49d94360bb51 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -409,7 +409,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
>  
>  	if (!ret)
>  		ret = blk_status_to_errno(dio->bio.bi_status);
> -	if (likely(!ret))
> +	if (likely(dio->size))
>  		ret = dio->size;
>  
>  	bio_put(&dio->bio);
> diff --git a/fs/direct-io.c b/fs/direct-io.c
> index 3aafb3343a65..9bd15be64c25 100644
> --- a/fs/direct-io.c
> +++ b/fs/direct-io.c
> @@ -151,6 +151,7 @@ struct dio {
>  } ____cacheline_aligned_in_smp;
>  
>  static struct kmem_cache *dio_cache __read_mostly;
> +unsigned int sysctl_dio_short_writes = 1;
>  
>  /*
>   * How many pages are in the queue?
> @@ -262,7 +263,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
>  		ret = dio->page_errors;
>  	if (ret == 0)
>  		ret = dio->io_error;
> -	if (ret == 0)
> +	if (!sysctl_dio_short_writes && (ret == 0))
>  		ret = transferred;
>  
>  	if (dio->end_io) {
> @@ -310,7 +311,9 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
>  	}
>  
>  	kmem_cache_free(dio_cache, dio);
> -	return ret;
> +	if (!sysctl_dio_short_writes)
> +		return ret;
> +	return transferred ? transferred : ret;
>  }
>  
>  static void dio_aio_complete_work(struct work_struct *work)
> diff --git a/fs/iomap.c b/fs/iomap.c
> index 47d29ccffaef..a8d6908dc0de 100644
> --- a/fs/iomap.c
> +++ b/fs/iomap.c
> @@ -716,23 +716,24 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
>  	struct kiocb *iocb = dio->iocb;
>  	struct inode *inode = file_inode(iocb->ki_filp);
>  	loff_t offset = iocb->ki_pos;
> -	ssize_t ret;
> +	ssize_t err;
> +	ssize_t transferred = dio->size;

I'm sorry to bring this up again, but there's something not quite right
with this.  Every time iomap_dio_actor creates a bio, it increments
dio->size by bio->bi_iter.bi_size before calling submit_bio.  dio->size is
the 'partial' size returned to the caller if there's an error, which
means that if we write a single 2MB bio and it fails, we still get a
partial result of 2MB, not zero.

Analysis of generic/250 bears this out:

total 40960
drwxr-xr-x 2 root root       19 Mar  7 15:59 .
drwxr-xr-x 3 root root       22 Mar  7 15:59 ..
-rw------- 1 root root 41943040 Mar  7 15:59 file2
Filesystem type is: 58465342
File size of /opt/test-250/file2 is 41943040 (10240 blocks of 4096
bytes)
 ext:     logical_offset:        physical_offset: length:   expected:
flags:
   0:        0..     511:         24..       535:    512:
   1:      512..    2047:        536..      2071:   1536: unwritten
   2:     2048..    2048:       2072..      2072:      1:
   3:     2049..    6249:       2073..      6273:   4201: unwritten
   4:     6250..   10239:       6416..     10405:   3990:       6274:
last,unwritten,eof
/opt/test-250/file2: 2 extents found
0000000  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00
         \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0
*
0032768  69  69  69  69  69  69  69  69  69  69  69  69  69  69  69  69
          i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Note that we wrote 0x69 to the disk prior to mkfs so that if any
unwritten extents were incorrectly converted to real extents we'd detect
it immediately.  This is evidence that we're exposing stale disk
contents.

*
2097152  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00
         \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0
*
8388608  63  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00
          c  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0
8388624  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00
         \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0
*
41943040

I think there's a more serious problem here too.  Let's say userspace
asks for a 4MB dio write and the dio write itself splits into four 1MB
write bios.  bio 0, 2, and 3 return quickly, but bio 1 fails slowly,
which means we successfully wrote 0-1M and 2M-3M, but since we can't
communicate a vector back to userspace the best we can do is return
1048576.

I think this is going to need better state tracking of exactly /what/
succeeded before we can return partial writes to userspace.  This could
be as simple as recording the iomap offset with each bio issued and
reducing dio->size to min(dio->size, bio->iomap->offset) if
bio->bi_status is set in iomap_dio_bio_end_io.

--D

>  
>  	if (dio->end_io) {
> -		ret = dio->end_io(iocb,
> -				dio->error ? dio->error : dio->size,
> -				dio->flags);
> +		err = dio->end_io(iocb,
> +				  (transferred && sysctl_dio_short_writes) ?
> +						transferred : dio->error,
> +				  dio->flags);
>  	} else {
> -		ret = dio->error;
> +		err = dio->error;
>  	}
>  
> -	if (likely(!ret)) {
> -		ret = dio->size;
> +	if (likely(transferred)) {
>  		/* check for short read */
> -		if (offset + ret > dio->i_size &&
> +		if (offset + transferred > dio->i_size &&
>  		    !(dio->flags & IOMAP_DIO_WRITE))
> -			ret = dio->i_size - offset;
> -		iocb->ki_pos += ret;
> +			transferred = dio->i_size - offset;
> +		iocb->ki_pos += transferred;
>  	}
>  
>  	/*
> @@ -759,7 +760,7 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
>  	inode_dio_end(file_inode(iocb->ki_filp));
>  	kfree(dio);
>  
> -	return ret;
> +	return (transferred && sysctl_dio_short_writes) ? transferred : err;
>  }
>  
>  static void iomap_dio_complete_work(struct work_struct *work)
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 511fbaabf624..a25652e5ae1b 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1469,6 +1469,7 @@ static inline void i_gid_write(struct inode *inode, gid_t gid)
>  }
>  
>  extern struct timespec current_time(struct inode *inode);
> +extern unsigned int sysctl_dio_short_writes;
>  
>  /*
>   * Snapshotting support.
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 557d46728577..362a9c3156f1 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -1844,6 +1844,15 @@ static struct ctl_table fs_table[] = {
>  		.proc_handler	= proc_dointvec_minmax,
>  		.extra1		= &one,
>  	},
> +	{
> +		.procname	= "dio_short_writes",
> +		.data		= &sysctl_dio_short_writes,
> +		.maxlen		= sizeof(int),
> +		.mode		= 0600,
> +		.proc_handler	= proc_dointvec_minmax,
> +		.extra1		= &zero,
> +		.extra2		= &one,
> +	},
>  	{ }
>  };
>  
> -- 
> 2.16.1
>
Goldwyn Rodrigues March 8, 2018, 3:35 p.m. UTC | #2
On 03/07/2018 06:53 PM, Darrick J. Wong wrote:
> On Thu, Feb 08, 2018 at 12:59:48PM -0600, Goldwyn Rodrigues wrote:
>> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
>>
>> In case direct I/O encounters an error midway, it returns the error.
>> Instead it should be returning the number of bytes transferred so far.
>>
>> Test case for filesystems (with ENOSPC):
>> 1. Create an almost full filesystem
>> 2. Create a file, say /mnt/lastfile, until the filesystem is full.
>> 3. Direct write() with count > sizeof /mnt/lastfile.
>>
>> Result: write() returns -ENOSPC. However, file content has data written
>> in step 3.
>>
>> Added a sysctl entry: dio_short_writes which is on by default. This is
>> to support applications which expect either and error or the bytes submitted
>> as a return value for the write calls.
>>
>> This fixes fstest generic/472.
>>
>> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
>> ---
>>  Documentation/sysctl/fs.txt | 14 ++++++++++++++
>>  fs/block_dev.c              |  2 +-
>>  fs/direct-io.c              |  7 +++++--
>>  fs/iomap.c                  | 23 ++++++++++++-----------
>>  include/linux/fs.h          |  1 +
>>  kernel/sysctl.c             |  9 +++++++++
>>  6 files changed, 42 insertions(+), 14 deletions(-)
>>
>> Changes since v1:
>>  - incorporated iomap and block devices
>>
>> Changes since v2:
>>  - realized that file size was not increasing when performing a (partial)
>>    direct I/O because end_io function was receiving the error instead of
>>    size. Fixed.
>>
>> Changes since v3:
>>  - [hch] initialize transferred with dio->size and use transferred instead
>>    of dio->size.
>>
>> Changes since v4:
>>  - Refreshed to v4.14
>>
>> Changes since v5:
>>  - Added /proc/sys/fs/dio_short_writes (default 1) to guard older applications
>>    which expect write(fd, buf, count) returns either count or error.
>>
>> Changes since v6:
>>  - Corrected documentation
>>  - Re-ordered patch
>>
>> diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt
>> index 6c00c1e2743f..21582f675985 100644
>> --- a/Documentation/sysctl/fs.txt
>> +++ b/Documentation/sysctl/fs.txt
>> @@ -22,6 +22,7 @@ Currently, these files are in /proc/sys/fs:
>>  - aio-max-nr
>>  - aio-nr
>>  - dentry-state
>> +- dio_short_writes
>>  - dquot-max
>>  - dquot-nr
>>  - file-max
>> @@ -76,6 +77,19 @@ dcache isn't pruned yet.
>>  
>>  ==============================================================
>>  
>> +dio_short_writes:
>> +
>> +In case Direct I/O encounters a transient error, it returns
>> +the error code, even if it has performed part of the write.
>> +This flag, if on (default), will return the number of bytes written
>> +so far, as the write(2) semantics are. However, some older applications
>> +still consider a direct write as an error if all of the I/O
>> +submitted is not complete. I.e. write(file, count, buf) != count.
>> +This option can be disabled on systems in order to support
>> +existing applications which do not expect short writes.
>> +
>> +==============================================================
>> +
>>  dquot-max & dquot-nr:
>>  
>>  The file dquot-max shows the maximum number of cached disk
>> diff --git a/fs/block_dev.c b/fs/block_dev.c
>> index 4a181fcb5175..49d94360bb51 100644
>> --- a/fs/block_dev.c
>> +++ b/fs/block_dev.c
>> @@ -409,7 +409,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
>>  
>>  	if (!ret)
>>  		ret = blk_status_to_errno(dio->bio.bi_status);
>> -	if (likely(!ret))
>> +	if (likely(dio->size))
>>  		ret = dio->size;
>>  
>>  	bio_put(&dio->bio);
>> diff --git a/fs/direct-io.c b/fs/direct-io.c
>> index 3aafb3343a65..9bd15be64c25 100644
>> --- a/fs/direct-io.c
>> +++ b/fs/direct-io.c
>> @@ -151,6 +151,7 @@ struct dio {
>>  } ____cacheline_aligned_in_smp;
>>  
>>  static struct kmem_cache *dio_cache __read_mostly;
>> +unsigned int sysctl_dio_short_writes = 1;
>>  
>>  /*
>>   * How many pages are in the queue?
>> @@ -262,7 +263,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
>>  		ret = dio->page_errors;
>>  	if (ret == 0)
>>  		ret = dio->io_error;
>> -	if (ret == 0)
>> +	if (!sysctl_dio_short_writes && (ret == 0))
>>  		ret = transferred;
>>  
>>  	if (dio->end_io) {
>> @@ -310,7 +311,9 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
>>  	}
>>  
>>  	kmem_cache_free(dio_cache, dio);
>> -	return ret;
>> +	if (!sysctl_dio_short_writes)
>> +		return ret;
>> +	return transferred ? transferred : ret;
>>  }
>>  
>>  static void dio_aio_complete_work(struct work_struct *work)
>> diff --git a/fs/iomap.c b/fs/iomap.c
>> index 47d29ccffaef..a8d6908dc0de 100644
>> --- a/fs/iomap.c
>> +++ b/fs/iomap.c
>> @@ -716,23 +716,24 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
>>  	struct kiocb *iocb = dio->iocb;
>>  	struct inode *inode = file_inode(iocb->ki_filp);
>>  	loff_t offset = iocb->ki_pos;
>> -	ssize_t ret;
>> +	ssize_t err;
>> +	ssize_t transferred = dio->size;
> 
> I'm sorry to bring this up again, but there's something not quite right
> with this.  Every time iomap_dio_actor create a bio, it increments
> dio->size by bio->bi_iter.bi_size before calling submit_bio.  dio->size is
> the 'partial' size returned to the caller if there's an error, which
> means that if we write a single 2MB bio and it fails, we still get a
> partial result of 2MB, not zero.
> 
> Analysis of generic/250 bears this out:
> 
> total 40960
> drwxr-xr-x 2 root root       19 Mar  7 15:59 .
> drwxr-xr-x 3 root root       22 Mar  7 15:59 ..
> -rw------- 1 root root 41943040 Mar  7 15:59 file2
> Filesystem type is: 58465342
> File size of /opt/test-250/file2 is 41943040 (10240 blocks of 4096
> ytes)
>  ext:     logical_offset:        physical_offset: length:   expected:
> lags:
>    0:        0..     511:         24..       535:    512:
>    1:      512..    2047:        536..      2071:   1536: unwritten
>    2:     2048..    2048:       2072..      2072:      1:
>    3:     2049..    6249:       2073..      6273:   4201: unwritten
>    4:     6250..   10239:       6416..     10405:   3990:       6274:
> last,unwritten,eof
> /opt/test-250/file2: 2 extents found
> 0000000  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00
>          \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0
> *
> 0032768  69  69  69  69  69  69  69  69  69  69  69  69  69  69  69  69
>           i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i
>          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> Note that we wrote 0x69 to the disk prior to mkfs so that if any
> unwritten extents were incorrectly converted to real extents we'd detect
> it immediately.  This is evidence that we're exposing stale disk
> contents.
> 
> *
> 2097152  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00
>          \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0
> *
> 8388608  63  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00
>           c  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0
> 8388624  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00
>          \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0
> *
> 41943040
> 
> I think there's a more serious problem here too.  Let's say userspace
> asks for a 4MB dio write and the dio write itself splits into four 1MB
> write bios.  bio 0, 2, and 3 return quickly, but bio 1 fails slowly,
> which means we successfully wrote 0-1M and 2M-3M, but since we can't
> communicate a vector back to userspace the best we can do is return
> 1048576.

Yes, this is a known problem and the only solution I have been told is
to document it. But in the light of what you have expressed earlier, yes
this patch does not make sense. An error in the direct I/O means that
the data in the range may be inconsistent/garbage.

> 
> I think this is going to need better state tracking of exactly /what/
> succeeded before we can return partial writes to userspace.  This could
> be as simple as recording the iomap offset with each bio issued and
> reducing dio->size to min(dio->size, bio->iomap->offset) if
> bio->bi_status is set in iomap_dio_bio_end_io.
> 
What about the rest of the data? Should the user assume that the *rest*
of the data (count - ret) is inconsistent in case of a short write?
Darrick J. Wong March 8, 2018, 5:29 p.m. UTC | #3
On Thu, Mar 08, 2018 at 09:35:48AM -0600, Goldwyn Rodrigues wrote:
> 
> 
> On 03/07/2018 06:53 PM, Darrick J. Wong wrote:
> > On Thu, Feb 08, 2018 at 12:59:48PM -0600, Goldwyn Rodrigues wrote:
> >> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
> >>
> >> In case direct I/O encounters an error midway, it returns the error.
> >> Instead it should be returning the number of bytes transferred so far.
> >>
> >> Test case for filesystems (with ENOSPC):
> >> 1. Create an almost full filesystem
> >> 2. Create a file, say /mnt/lastfile, until the filesystem is full.
> >> 3. Direct write() with count > sizeof /mnt/lastfile.
> >>
> >> Result: write() returns -ENOSPC. However, file content has data written
> >> in step 3.
> >>
> >> Added a sysctl entry: dio_short_writes which is on by default. This is
> >> to support applications which expect either and error or the bytes submitted
> >> as a return value for the write calls.
> >>
> >> This fixes fstest generic/472.
> >>
> >> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
> >> ---
> >>  Documentation/sysctl/fs.txt | 14 ++++++++++++++
> >>  fs/block_dev.c              |  2 +-
> >>  fs/direct-io.c              |  7 +++++--
> >>  fs/iomap.c                  | 23 ++++++++++++-----------
> >>  include/linux/fs.h          |  1 +
> >>  kernel/sysctl.c             |  9 +++++++++
> >>  6 files changed, 42 insertions(+), 14 deletions(-)
> >>
> >> Changes since v1:
> >>  - incorporated iomap and block devices
> >>
> >> Changes since v2:
> >>  - realized that file size was not increasing when performing a (partial)
> >>    direct I/O because end_io function was receiving the error instead of
> >>    size. Fixed.
> >>
> >> Changes since v3:
> >>  - [hch] initialize transferred with dio->size and use transferred instead
> >>    of dio->size.
> >>
> >> Changes since v4:
> >>  - Refreshed to v4.14
> >>
> >> Changes since v5:
> >>  - Added /proc/sys/fs/dio_short_writes (default 1) to guard older applications
> >>    which expect write(fd, buf, count) returns either count or error.
> >>
> >> Changes since v6:
> >>  - Corrected documentation
> >>  - Re-ordered patch
> >>
> >> diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt
> >> index 6c00c1e2743f..21582f675985 100644
> >> --- a/Documentation/sysctl/fs.txt
> >> +++ b/Documentation/sysctl/fs.txt
> >> @@ -22,6 +22,7 @@ Currently, these files are in /proc/sys/fs:
> >>  - aio-max-nr
> >>  - aio-nr
> >>  - dentry-state
> >> +- dio_short_writes
> >>  - dquot-max
> >>  - dquot-nr
> >>  - file-max
> >> @@ -76,6 +77,19 @@ dcache isn't pruned yet.
> >>  
> >>  ==============================================================
> >>  
> >> +dio_short_writes:
> >> +
> >> +In case Direct I/O encounters a transient error, it returns
> >> +the error code, even if it has performed part of the write.
> >> +This flag, if on (default), will return the number of bytes written
> >> +so far, as the write(2) semantics are. However, some older applications
> >> +still consider a direct write as an error if all of the I/O
> >> +submitted is not complete. I.e. write(file, count, buf) != count.
> >> +This option can be disabled on systems in order to support
> >> +existing applications which do not expect short writes.
> >> +
> >> +==============================================================
> >> +
> >>  dquot-max & dquot-nr:
> >>  
> >>  The file dquot-max shows the maximum number of cached disk
> >> diff --git a/fs/block_dev.c b/fs/block_dev.c
> >> index 4a181fcb5175..49d94360bb51 100644
> >> --- a/fs/block_dev.c
> >> +++ b/fs/block_dev.c
> >> @@ -409,7 +409,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
> >>  
> >>  	if (!ret)
> >>  		ret = blk_status_to_errno(dio->bio.bi_status);
> >> -	if (likely(!ret))
> >> +	if (likely(dio->size))
> >>  		ret = dio->size;
> >>  
> >>  	bio_put(&dio->bio);
> >> diff --git a/fs/direct-io.c b/fs/direct-io.c
> >> index 3aafb3343a65..9bd15be64c25 100644
> >> --- a/fs/direct-io.c
> >> +++ b/fs/direct-io.c
> >> @@ -151,6 +151,7 @@ struct dio {
> >>  } ____cacheline_aligned_in_smp;
> >>  
> >>  static struct kmem_cache *dio_cache __read_mostly;
> >> +unsigned int sysctl_dio_short_writes = 1;
> >>  
> >>  /*
> >>   * How many pages are in the queue?
> >> @@ -262,7 +263,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
> >>  		ret = dio->page_errors;
> >>  	if (ret == 0)
> >>  		ret = dio->io_error;
> >> -	if (ret == 0)
> >> +	if (!sysctl_dio_short_writes && (ret == 0))
> >>  		ret = transferred;
> >>  
> >>  	if (dio->end_io) {
> >> @@ -310,7 +311,9 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
> >>  	}
> >>  
> >>  	kmem_cache_free(dio_cache, dio);
> >> -	return ret;
> >> +	if (!sysctl_dio_short_writes)
> >> +		return ret;
> >> +	return transferred ? transferred : ret;
> >>  }
> >>  
> >>  static void dio_aio_complete_work(struct work_struct *work)
> >> diff --git a/fs/iomap.c b/fs/iomap.c
> >> index 47d29ccffaef..a8d6908dc0de 100644
> >> --- a/fs/iomap.c
> >> +++ b/fs/iomap.c
> >> @@ -716,23 +716,24 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
> >>  	struct kiocb *iocb = dio->iocb;
> >>  	struct inode *inode = file_inode(iocb->ki_filp);
> >>  	loff_t offset = iocb->ki_pos;
> >> -	ssize_t ret;
> >> +	ssize_t err;
> >> +	ssize_t transferred = dio->size;
> > 
> > I'm sorry to bring this up again, but there's something not quite right
> > with this.  Every time iomap_dio_actor create a bio, it increments
> > dio->size by bio->bi_iter.bi_size before calling submit_bio.  dio->size is
> > the 'partial' size returned to the caller if there's an error, which
> > means that if we write a single 2MB bio and it fails, we still get a
> > partial result of 2MB, not zero.
> > 
> > Analysis of generic/250 bears this out:
> > 
> > total 40960
> > drwxr-xr-x 2 root root       19 Mar  7 15:59 .
> > drwxr-xr-x 3 root root       22 Mar  7 15:59 ..
> > -rw------- 1 root root 41943040 Mar  7 15:59 file2
> > Filesystem type is: 58465342
> > File size of /opt/test-250/file2 is 41943040 (10240 blocks of 4096
> > ytes)
> >  ext:     logical_offset:        physical_offset: length:   expected:
> > lags:
> >    0:        0..     511:         24..       535:    512:
> >    1:      512..    2047:        536..      2071:   1536: unwritten
> >    2:     2048..    2048:       2072..      2072:      1:
> >    3:     2049..    6249:       2073..      6273:   4201: unwritten
> >    4:     6250..   10239:       6416..     10405:   3990:       6274:
> > last,unwritten,eof
> > /opt/test-250/file2: 2 extents found
> > 0000000  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00
> >          \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0
> > *
> > 0032768  69  69  69  69  69  69  69  69  69  69  69  69  69  69  69  69
> >           i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i
> >          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> > Note that we wrote 0x69 to the disk prior to mkfs so that if any
> > unwritten extents were incorrectly converted to real extents we'd detect
> > it immediately.  This is evidence that we're exposing stale disk
> > contents.
> > 
> > *
> > 2097152  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00
> >          \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0
> > *
> > 8388608  63  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00
> >           c  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0
> > 8388624  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00
> >          \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0
> > *
> > 41943040
> > 
> > I think there's a more serious problem here too.  Let's say userspace
> > asks for a 4MB dio write and the dio write itself splits into four 1MB
> > write bios.  bio 0, 2, and 3 return quickly, but bio 1 fails slowly,
> > which means we successfully wrote 0-1M and 2M-3M, but since we can't
> > communicate a vector back to userspace the best we can do is return
> > 1048576.
> 
> Yes, this is a known problem and the only solution I have been told is
> to document it.

<nod> I think I'm comfortable returning the offset of the first failing
io as a lower bound for "bytes written".  Presumably the file pointer
will be advanced by that amount and the operation retried from there, if
the program cares to do so.

> But it the light of what you have expressed earlier, yes
> this patch does not make sense. An error in the direct I/O means that
> the data in the range may be inconsistent/garbage.
> 
> > 
> > I think this is going to need better state tracking of exactly /what/
> > succeeded before we can return partial writes to userspace.  This could
> > be as simple as recording the iomap offset with each bio issued and
> > reducing dio->size to min(dio->size, bio->iomap->offset) if
> > bio->bi_status is set in iomap_dio_bio_end_io.
> > 
> What about the rest of the data? Should the user assume that the *rest*
> of the data (count - ret) is inconsistent in case of a short write?

Well, a narrow reading of the spec would be that we only made a
statement about the range (offset, offset + ret), so who cares what
happened past that? :P  But it does break the model that "write says it
wrote X bytes, so everything past X remains as it was before"... but
this is directio where everything is upside down. :P

--D

> 
> -- 
> Goldwyn
Theodore Ts'o May 20, 2018, 1:29 a.m. UTC | #4
On Thu, Feb 08, 2018 at 12:59:48PM -0600, Goldwyn Rodrigues wrote:
> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
> 
> In case direct I/O encounters an error midway, it returns the error.
> Instead it should be returning the number of bytes transferred so far.
> 
> Test case for filesystems (with ENOSPC):
> 1. Create an almost full filesystem
> 2. Create a file, say /mnt/lastfile, until the filesystem is full.
> 3. Direct write() with count > sizeof /mnt/lastfile.
> 
> Result: write() returns -ENOSPC. However, file content has data written
> in step 3.
> 
> Added a sysctl entry: dio_short_writes which is on by default. This is
> to support applications which expect either and error or the bytes submitted
> as a return value for the write calls.
> 
> This fixes fstest generic/472.
> 
> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>

Hi,

I was wondering if you could give an update regarding what's up with
this patch?

There doesn't seem to be any movement on this patch in a while, and so
I still have xfstests generic/472 suppressed in {kvm,gce}-xfstests.

Thanks,

						- Ted
diff mbox

Patch

diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt
index 6c00c1e2743f..21582f675985 100644
--- a/Documentation/sysctl/fs.txt
+++ b/Documentation/sysctl/fs.txt
@@ -22,6 +22,7 @@  Currently, these files are in /proc/sys/fs:
 - aio-max-nr
 - aio-nr
 - dentry-state
+- dio_short_writes
 - dquot-max
 - dquot-nr
 - file-max
@@ -76,6 +77,19 @@  dcache isn't pruned yet.
 
 ==============================================================
 
+dio_short_writes:
+
+In case Direct I/O encounters a transient error, it returns
+the error code, even if it has performed part of the write.
+This flag, if on (default), will return the number of bytes written
+so far, as the write(2) semantics are. However, some older applications
+still consider a direct write as an error if all of the I/O
+submitted is not complete. I.e. write(fd, buf, count) != count.
+This option can be disabled on systems in order to support
+existing applications which do not expect short writes.
+
+==============================================================
+
 dquot-max & dquot-nr:
 
 The file dquot-max shows the maximum number of cached disk
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 4a181fcb5175..49d94360bb51 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -409,7 +409,7 @@  __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 
 	if (!ret)
 		ret = blk_status_to_errno(dio->bio.bi_status);
-	if (likely(!ret))
+	if (likely(dio->size))
 		ret = dio->size;
 
 	bio_put(&dio->bio);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 3aafb3343a65..9bd15be64c25 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -151,6 +151,7 @@  struct dio {
 } ____cacheline_aligned_in_smp;
 
 static struct kmem_cache *dio_cache __read_mostly;
+unsigned int sysctl_dio_short_writes = 1;
 
 /*
  * How many pages are in the queue?
@@ -262,7 +263,7 @@  static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
 		ret = dio->page_errors;
 	if (ret == 0)
 		ret = dio->io_error;
-	if (ret == 0)
+	if (!sysctl_dio_short_writes && (ret == 0))
 		ret = transferred;
 
 	if (dio->end_io) {
@@ -310,7 +311,9 @@  static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
 	}
 
 	kmem_cache_free(dio_cache, dio);
-	return ret;
+	if (!sysctl_dio_short_writes)
+		return ret;
+	return transferred ? transferred : ret;
 }
 
 static void dio_aio_complete_work(struct work_struct *work)
diff --git a/fs/iomap.c b/fs/iomap.c
index 47d29ccffaef..a8d6908dc0de 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -716,23 +716,24 @@  static ssize_t iomap_dio_complete(struct iomap_dio *dio)
 	struct kiocb *iocb = dio->iocb;
 	struct inode *inode = file_inode(iocb->ki_filp);
 	loff_t offset = iocb->ki_pos;
-	ssize_t ret;
+	ssize_t err;
+	ssize_t transferred = dio->size;
 
 	if (dio->end_io) {
-		ret = dio->end_io(iocb,
-				dio->error ? dio->error : dio->size,
-				dio->flags);
+		err = dio->end_io(iocb,
+				  (transferred && sysctl_dio_short_writes) ?
+						transferred : dio->error,
+				  dio->flags);
 	} else {
-		ret = dio->error;
+		err = dio->error;
 	}
 
-	if (likely(!ret)) {
-		ret = dio->size;
+	if (likely(transferred)) {
 		/* check for short read */
-		if (offset + ret > dio->i_size &&
+		if (offset + transferred > dio->i_size &&
 		    !(dio->flags & IOMAP_DIO_WRITE))
-			ret = dio->i_size - offset;
-		iocb->ki_pos += ret;
+			transferred = dio->i_size - offset;
+		iocb->ki_pos += transferred;
 	}
 
 	/*
@@ -759,7 +760,7 @@  static ssize_t iomap_dio_complete(struct iomap_dio *dio)
 	inode_dio_end(file_inode(iocb->ki_filp));
 	kfree(dio);
 
-	return ret;
+	return (transferred && sysctl_dio_short_writes) ? transferred : err;
 }
 
 static void iomap_dio_complete_work(struct work_struct *work)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 511fbaabf624..a25652e5ae1b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1469,6 +1469,7 @@  static inline void i_gid_write(struct inode *inode, gid_t gid)
 }
 
 extern struct timespec current_time(struct inode *inode);
+extern unsigned int sysctl_dio_short_writes;
 
 /*
  * Snapshotting support.
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 557d46728577..362a9c3156f1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1844,6 +1844,15 @@  static struct ctl_table fs_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &one,
 	},
+	{
+		.procname	= "dio_short_writes",
+		.data		= &sysctl_dio_short_writes,
+		.maxlen		= sizeof(int),
+		.mode		= 0600,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
 	{ }
 };