[5/5] iomap: support RWF_UNCACHED for buffered writes
diff mbox series

Message ID 20191211152943.2933-6-axboe@kernel.dk
State New
Headers show
Series
  • Support for RWF_UNCACHED
Related show

Commit Message

Jens Axboe Dec. 11, 2019, 3:29 p.m. UTC
This adds support for RWF_UNCACHED for file systems using iomap to
perform buffered writes. We use the generic infrastructure for this,
by tracking pages we created and calling write_drop_cached_pages()
to issue writeback and prune those pages.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/iomap/apply.c       | 24 ++++++++++++++++++++++++
 fs/iomap/buffered-io.c | 37 +++++++++++++++++++++++++++++--------
 include/linux/iomap.h  |  5 +++++
 3 files changed, 58 insertions(+), 8 deletions(-)

Comments

Matthew Wilcox Dec. 11, 2019, 5:19 p.m. UTC | #1
On Wed, Dec 11, 2019 at 08:29:43AM -0700, Jens Axboe wrote:
> @@ -670,9 +675,14 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
>  		iomap_read_inline_data(inode, page, srcmap);
>  	else if (iomap->flags & IOMAP_F_BUFFER_HEAD)
>  		status = __block_write_begin_int(page, pos, len, NULL, srcmap);
> -	else
> -		status = __iomap_write_begin(inode, pos, len, flags, page,
> +	else {
> +		unsigned wb_flags = 0;
> +
> +		if (flags & IOMAP_UNCACHED)
> +			wb_flags = IOMAP_WRITE_F_UNCACHED;
> +		status = __iomap_write_begin(inode, pos, len, wb_flags, page,
>  				srcmap);

I think if you do an uncached write to a currently shared extent, you
just lost the IOMAP_WRITE_F_UNSHARE flag?
Jens Axboe Dec. 11, 2019, 6:05 p.m. UTC | #2
On 12/11/19 10:19 AM, Matthew Wilcox wrote:
> On Wed, Dec 11, 2019 at 08:29:43AM -0700, Jens Axboe wrote:
>> @@ -670,9 +675,14 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
>>  		iomap_read_inline_data(inode, page, srcmap);
>>  	else if (iomap->flags & IOMAP_F_BUFFER_HEAD)
>>  		status = __block_write_begin_int(page, pos, len, NULL, srcmap);
>> -	else
>> -		status = __iomap_write_begin(inode, pos, len, flags, page,
>> +	else {
>> +		unsigned wb_flags = 0;
>> +
>> +		if (flags & IOMAP_UNCACHED)
>> +			wb_flags = IOMAP_WRITE_F_UNCACHED;
>> +		status = __iomap_write_begin(inode, pos, len, wb_flags, page,
>>  				srcmap);
> 
> I think if you do an uncached write to a currently shared extent, you
> just lost the IOMAP_WRITE_F_UNSHARE flag?

Oops good catch, I'll fix that up.
Dave Chinner Dec. 12, 2019, 10:34 p.m. UTC | #3
On Wed, Dec 11, 2019 at 08:29:43AM -0700, Jens Axboe wrote:
> This adds support for RWF_UNCACHED for file systems using iomap to
> perform buffered writes. We use the generic infrastructure for this,
> by tracking pages we created and calling write_drop_cached_pages()
> to issue writeback and prune those pages.
> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  fs/iomap/apply.c       | 24 ++++++++++++++++++++++++
>  fs/iomap/buffered-io.c | 37 +++++++++++++++++++++++++++++--------
>  include/linux/iomap.h  |  5 +++++
>  3 files changed, 58 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/iomap/apply.c b/fs/iomap/apply.c
> index 562536da8a13..966826ad4bb9 100644
> --- a/fs/iomap/apply.c
> +++ b/fs/iomap/apply.c
> @@ -90,5 +90,29 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
>  				     flags, &iomap);
>  	}
>  
> +	if (written && (flags & IOMAP_UNCACHED)) {
> +		struct address_space *mapping = inode->i_mapping;
> +
> +		end = pos + written;
> +		ret = filemap_write_and_wait_range(mapping, pos, end);
> +		if (ret)
> +			goto out;
> +
> +		/*
> +		 * No pages were created for this range, we're done
> +		 */
> +		if (!(iomap.flags & IOMAP_F_PAGE_CREATE))
> +			goto out;
> +
> +		/*
> +		 * Try to invalidate cache pages for the range we just wrote.
> +		 * We don't care if invalidation fails as the write has still
> +		 * worked and leaving clean uptodate pages in the page cache
> +		 * isn't a corruption vector for uncached IO.
> +		 */
> +		invalidate_inode_pages2_range(mapping,
> +				pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
> +	}
> +out:
>  	return written ? written : ret;
>  }

Just a thought on further optimisation for this for XFS.
IOMAP_UNCACHED is being passed into the filesystem ->iomap_begin
methods by iomap_apply().  Hence the filesystems know that it is
an uncached IO that is being done, and we can tailor allocation
strategies to suit the fact that the data is going to be written
immediately.

In this case, XFS needs to treat it the same way it treats direct
IO. That is, we do immediate unwritten extent allocation rather than
delayed allocation. This will reduce the allocation overhead and
will optimise for immediate IO locality rather than optimise for
delayed allocation.

This should just be a relatively simple change to
xfs_file_iomap_begin() along the lines of:

-	if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && !(flags & IOMAP_DIRECT) &&
-			!IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
+	if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) &&
+	    !(flags & (IOMAP_DIRECT | IOMAP_UNCACHED)) &&
+	    !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
		/* Reserve delalloc blocks for regular writeback. */
		return xfs_file_iomap_begin_delay(inode, offset, length, flags,
				iomap);
	}

so that it avoids delayed allocation for uncached IO...

Cheers,

Dave.
Jens Axboe Dec. 13, 2019, 12:54 a.m. UTC | #4
On 12/12/19 3:34 PM, Dave Chinner wrote:
> On Wed, Dec 11, 2019 at 08:29:43AM -0700, Jens Axboe wrote:
>> This adds support for RWF_UNCACHED for file systems using iomap to
>> perform buffered writes. We use the generic infrastructure for this,
>> by tracking pages we created and calling write_drop_cached_pages()
>> to issue writeback and prune those pages.
>>
>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>> ---
>>  fs/iomap/apply.c       | 24 ++++++++++++++++++++++++
>>  fs/iomap/buffered-io.c | 37 +++++++++++++++++++++++++++++--------
>>  include/linux/iomap.h  |  5 +++++
>>  3 files changed, 58 insertions(+), 8 deletions(-)
>>
>> diff --git a/fs/iomap/apply.c b/fs/iomap/apply.c
>> index 562536da8a13..966826ad4bb9 100644
>> --- a/fs/iomap/apply.c
>> +++ b/fs/iomap/apply.c
>> @@ -90,5 +90,29 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
>>  				     flags, &iomap);
>>  	}
>>  
>> +	if (written && (flags & IOMAP_UNCACHED)) {
>> +		struct address_space *mapping = inode->i_mapping;
>> +
>> +		end = pos + written;
>> +		ret = filemap_write_and_wait_range(mapping, pos, end);
>> +		if (ret)
>> +			goto out;
>> +
>> +		/*
>> +		 * No pages were created for this range, we're done
>> +		 */
>> +		if (!(iomap.flags & IOMAP_F_PAGE_CREATE))
>> +			goto out;
>> +
>> +		/*
>> +		 * Try to invalidate cache pages for the range we just wrote.
>> +		 * We don't care if invalidation fails as the write has still
>> +		 * worked and leaving clean uptodate pages in the page cache
>> +		 * isn't a corruption vector for uncached IO.
>> +		 */
>> +		invalidate_inode_pages2_range(mapping,
>> +				pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
>> +	}
>> +out:
>>  	return written ? written : ret;
>>  }
> 
> Just a thought on further optimisation for this for XFS.
> IOMAP_UNCACHED is being passed into the filesystem ->iomap_begin
> methods by iomap_apply().  Hence the filesystems know that it is
> an uncached IO that is being done, and we can tailor allocation
> strategies to suit the fact that the data is going to be written
> immediately.
> 
> In this case, XFS needs to treat it the same way it treats direct
> IO. That is, we do immediate unwritten extent allocation rather than
> delayed allocation. This will reduce the allocation overhead and
> will optimise for immediate IO locality rather than optimise for
> delayed allocation.
> 
> This should just be a relatively simple change to
> xfs_file_iomap_begin() along the lines of:
> 
> -	if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && !(flags & IOMAP_DIRECT) &&
> -			!IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
> +	if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) &&
> +	    !(flags & (IOMAP_DIRECT | IOMAP_UNCACHED)) &&
> +	    !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
> 		/* Reserve delalloc blocks for regular writeback. */
> 		return xfs_file_iomap_begin_delay(inode, offset, length, flags,
> 				iomap);
> 	}
> 
> so that it avoids delayed allocation for uncached IO...

That's very handy! Thanks, I'll add that to the next version. Just out
of curiosity, would you prefer this as a separate patch, or just bundle
it with the iomap buffered RWF_UNCACHED patch? I'm assuming the latter,
and I'll just mention it in the changelog.
Jens Axboe Dec. 13, 2019, 12:57 a.m. UTC | #5
On 12/12/19 5:54 PM, Jens Axboe wrote:
> On 12/12/19 3:34 PM, Dave Chinner wrote:
>> On Wed, Dec 11, 2019 at 08:29:43AM -0700, Jens Axboe wrote:
>>> This adds support for RWF_UNCACHED for file systems using iomap to
>>> perform buffered writes. We use the generic infrastructure for this,
>>> by tracking pages we created and calling write_drop_cached_pages()
>>> to issue writeback and prune those pages.
>>>
>>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>> ---
>>>  fs/iomap/apply.c       | 24 ++++++++++++++++++++++++
>>>  fs/iomap/buffered-io.c | 37 +++++++++++++++++++++++++++++--------
>>>  include/linux/iomap.h  |  5 +++++
>>>  3 files changed, 58 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/fs/iomap/apply.c b/fs/iomap/apply.c
>>> index 562536da8a13..966826ad4bb9 100644
>>> --- a/fs/iomap/apply.c
>>> +++ b/fs/iomap/apply.c
>>> @@ -90,5 +90,29 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
>>>  				     flags, &iomap);
>>>  	}
>>>  
>>> +	if (written && (flags & IOMAP_UNCACHED)) {
>>> +		struct address_space *mapping = inode->i_mapping;
>>> +
>>> +		end = pos + written;
>>> +		ret = filemap_write_and_wait_range(mapping, pos, end);
>>> +		if (ret)
>>> +			goto out;
>>> +
>>> +		/*
>>> +		 * No pages were created for this range, we're done
>>> +		 */
>>> +		if (!(iomap.flags & IOMAP_F_PAGE_CREATE))
>>> +			goto out;
>>> +
>>> +		/*
>>> +		 * Try to invalidate cache pages for the range we just wrote.
>>> +		 * We don't care if invalidation fails as the write has still
>>> +		 * worked and leaving clean uptodate pages in the page cache
>>> +		 * isn't a corruption vector for uncached IO.
>>> +		 */
>>> +		invalidate_inode_pages2_range(mapping,
>>> +				pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
>>> +	}
>>> +out:
>>>  	return written ? written : ret;
>>>  }
>>
>> Just a thought on further optimisation for this for XFS.
>> IOMAP_UNCACHED is being passed into the filesystem ->iomap_begin
>> methods by iomap_apply().  Hence the filesystems know that it is
>> an uncached IO that is being done, and we can tailor allocation
>> strategies to suit the fact that the data is going to be written
>> immediately.
>>
>> In this case, XFS needs to treat it the same way it treats direct
>> IO. That is, we do immediate unwritten extent allocation rather than
>> delayed allocation. This will reduce the allocation overhead and
>> will optimise for immediate IO locality rather than optimise for
>> delayed allocation.
>>
>> This should just be a relatively simple change to
>> xfs_file_iomap_begin() along the lines of:
>>
>> -	if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && !(flags & IOMAP_DIRECT) &&
>> -			!IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
>> +	if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) &&
>> +	    !(flags & (IOMAP_DIRECT | IOMAP_UNCACHED)) &&
>> +	    !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
>> 		/* Reserve delalloc blocks for regular writeback. */
>> 		return xfs_file_iomap_begin_delay(inode, offset, length, flags,
>> 				iomap);
>> 	}
>>
>> so that it avoids delayed allocation for uncached IO...
> 
> That's very handy! Thanks, I'll add that to the next version. Just out
> of curiosity, would you prefer this as a separate patch, or just bundle
> it with the iomap buffered RWF_UNCACHED patch? I'm assuming the latter,
> and I'll just mention it in the changelog.

OK, since it's in XFS, it'd be a separate patch. The code you quote seems
to be something out-of-tree?
Dave Chinner Dec. 16, 2019, 4:17 a.m. UTC | #6
On Thu, Dec 12, 2019 at 05:57:57PM -0700, Jens Axboe wrote:
> On 12/12/19 5:54 PM, Jens Axboe wrote:
> > On 12/12/19 3:34 PM, Dave Chinner wrote:
> >> On Wed, Dec 11, 2019 at 08:29:43AM -0700, Jens Axboe wrote:
> >>> This adds support for RWF_UNCACHED for file systems using iomap to
> >>> perform buffered writes. We use the generic infrastructure for this,
> >>> by tracking pages we created and calling write_drop_cached_pages()
> >>> to issue writeback and prune those pages.
> >>>
> >>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> >>> ---
> >>>  fs/iomap/apply.c       | 24 ++++++++++++++++++++++++
> >>>  fs/iomap/buffered-io.c | 37 +++++++++++++++++++++++++++++--------
> >>>  include/linux/iomap.h  |  5 +++++
> >>>  3 files changed, 58 insertions(+), 8 deletions(-)
> >>>
> >>> diff --git a/fs/iomap/apply.c b/fs/iomap/apply.c
> >>> index 562536da8a13..966826ad4bb9 100644
> >>> --- a/fs/iomap/apply.c
> >>> +++ b/fs/iomap/apply.c
> >>> @@ -90,5 +90,29 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
> >>>  				     flags, &iomap);
> >>>  	}
> >>>  
> >>> +	if (written && (flags & IOMAP_UNCACHED)) {
> >>> +		struct address_space *mapping = inode->i_mapping;
> >>> +
> >>> +		end = pos + written;
> >>> +		ret = filemap_write_and_wait_range(mapping, pos, end);
> >>> +		if (ret)
> >>> +			goto out;
> >>> +
> >>> +		/*
> >>> +		 * No pages were created for this range, we're done
> >>> +		 */
> >>> +		if (!(iomap.flags & IOMAP_F_PAGE_CREATE))
> >>> +			goto out;
> >>> +
> >>> +		/*
> >>> +		 * Try to invalidate cache pages for the range we just wrote.
> >>> +		 * We don't care if invalidation fails as the write has still
> >>> +		 * worked and leaving clean uptodate pages in the page cache
> >>> +		 * isn't a corruption vector for uncached IO.
> >>> +		 */
> >>> +		invalidate_inode_pages2_range(mapping,
> >>> +				pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
> >>> +	}
> >>> +out:
> >>>  	return written ? written : ret;
> >>>  }
> >>
> >> Just a thought on further optimisation for this for XFS.
> >> IOMAP_UNCACHED is being passed into the filesystem ->iomap_begin
> >> methods by iomap_apply().  Hence the filesystems know that it is
> >> an uncached IO that is being done, and we can tailor allocation
> >> strategies to suit the fact that the data is going to be written
> >> immediately.
> >>
> >> In this case, XFS needs to treat it the same way it treats direct
> >> IO. That is, we do immediate unwritten extent allocation rather than
> >> delayed allocation. This will reduce the allocation overhead and
> >> will optimise for immediate IO locality rather than optimise for
> >> delayed allocation.
> >>
> >> This should just be a relatively simple change to
> >> xfs_file_iomap_begin() along the lines of:
> >>
> >> -	if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && !(flags & IOMAP_DIRECT) &&
> >> -			!IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
> >> +	if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) &&
> >> +	    !(flags & (IOMAP_DIRECT | IOMAP_UNCACHED)) &&
> >> +	    !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
> >> 		/* Reserve delalloc blocks for regular writeback. */
> >> 		return xfs_file_iomap_begin_delay(inode, offset, length, flags,
> >> 				iomap);
> >> 	}
> >>
> >> so that it avoids delayed allocation for uncached IO...
> > 
> > That's very handy! Thanks, I'll add that to the next version. Just out
> > of curiosity, would you prefer this as a separate patch, or just bundle
> > it with the iomap buffered RWF_UNCACHED patch? I'm assuming the latter,
> > and I'll just mention it in the changelog.
> 
> OK, since it's in XFS, it'd be a separate patch.

*nod*

> The code you quote seems
> to be something out-of-tree?

Ah, I quoted the code in the 5.4 release branch, not the 5.5-rc1
tree. I'd forgotten that the xfs_file_iomap_begin() got massively
refactored in the 5.5 merge and I hadn't updated my cscope trees. SO
I'm guessing you want to go looking for the
xfs_buffered_write_iomap_begin() and add another case to this
initial branch:

        /* we can't use delayed allocations when using extent size hints */
        if (xfs_get_extsz_hint(ip))
                return xfs_direct_write_iomap_begin(inode, offset, count,
                                flags, iomap, srcmap);

To make the buffered write IO go down the direct IO allocation path...

Cheers,

Dave.
Jens Axboe Dec. 17, 2019, 2:31 p.m. UTC | #7
On 12/15/19 9:17 PM, Dave Chinner wrote:
> On Thu, Dec 12, 2019 at 05:57:57PM -0700, Jens Axboe wrote:
>> On 12/12/19 5:54 PM, Jens Axboe wrote:
>>> On 12/12/19 3:34 PM, Dave Chinner wrote:
>>>> On Wed, Dec 11, 2019 at 08:29:43AM -0700, Jens Axboe wrote:
>>>>> This adds support for RWF_UNCACHED for file systems using iomap to
>>>>> perform buffered writes. We use the generic infrastructure for this,
>>>>> by tracking pages we created and calling write_drop_cached_pages()
>>>>> to issue writeback and prune those pages.
>>>>>
>>>>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>>>> ---
>>>>>  fs/iomap/apply.c       | 24 ++++++++++++++++++++++++
>>>>>  fs/iomap/buffered-io.c | 37 +++++++++++++++++++++++++++++--------
>>>>>  include/linux/iomap.h  |  5 +++++
>>>>>  3 files changed, 58 insertions(+), 8 deletions(-)
>>>>>
>>>>> diff --git a/fs/iomap/apply.c b/fs/iomap/apply.c
>>>>> index 562536da8a13..966826ad4bb9 100644
>>>>> --- a/fs/iomap/apply.c
>>>>> +++ b/fs/iomap/apply.c
>>>>> @@ -90,5 +90,29 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
>>>>>  				     flags, &iomap);
>>>>>  	}
>>>>>  
>>>>> +	if (written && (flags & IOMAP_UNCACHED)) {
>>>>> +		struct address_space *mapping = inode->i_mapping;
>>>>> +
>>>>> +		end = pos + written;
>>>>> +		ret = filemap_write_and_wait_range(mapping, pos, end);
>>>>> +		if (ret)
>>>>> +			goto out;
>>>>> +
>>>>> +		/*
>>>>> +		 * No pages were created for this range, we're done
>>>>> +		 */
>>>>> +		if (!(iomap.flags & IOMAP_F_PAGE_CREATE))
>>>>> +			goto out;
>>>>> +
>>>>> +		/*
>>>>> +		 * Try to invalidate cache pages for the range we just wrote.
>>>>> +		 * We don't care if invalidation fails as the write has still
>>>>> +		 * worked and leaving clean uptodate pages in the page cache
>>>>> +		 * isn't a corruption vector for uncached IO.
>>>>> +		 */
>>>>> +		invalidate_inode_pages2_range(mapping,
>>>>> +				pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
>>>>> +	}
>>>>> +out:
>>>>>  	return written ? written : ret;
>>>>>  }
>>>>
>>>> Just a thought on further optimisation for this for XFS.
>>>> IOMAP_UNCACHED is being passed into the filesystem ->iomap_begin
>>>> methods by iomap_apply().  Hence the filesystems know that it is
>>>> an uncached IO that is being done, and we can tailor allocation
>>>> strategies to suit the fact that the data is going to be written
>>>> immediately.
>>>>
>>>> In this case, XFS needs to treat it the same way it treats direct
>>>> IO. That is, we do immediate unwritten extent allocation rather than
>>>> delayed allocation. This will reduce the allocation overhead and
>>>> will optimise for immediate IO locality rather than optimise for
>>>> delayed allocation.
>>>>
>>>> This should just be a relatively simple change to
>>>> xfs_file_iomap_begin() along the lines of:
>>>>
>>>> -	if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && !(flags & IOMAP_DIRECT) &&
>>>> -			!IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
>>>> +	if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) &&
>>>> +	    !(flags & (IOMAP_DIRECT | IOMAP_UNCACHED)) &&
>>>> +	    !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
>>>> 		/* Reserve delalloc blocks for regular writeback. */
>>>> 		return xfs_file_iomap_begin_delay(inode, offset, length, flags,
>>>> 				iomap);
>>>> 	}
>>>>
>>>> so that it avoids delayed allocation for uncached IO...
>>>
>>> That's very handy! Thanks, I'll add that to the next version. Just out
>>> of curiosity, would you prefer this as a separate patch, or just bundle
>>> it with the iomap buffered RWF_UNCACHED patch? I'm assuming the latter,
>>> and I'll just mention it in the changelog.
>>
>> OK, since it's in XFS, it'd be a separate patch.
> 
> *nod*
> 
>> The code you quote seems
>> to be something out-of-tree?
> 
> Ah, I quoted the code in the 5.4 release branch, not the 5.5-rc1
> tree. I'd forgotten that the xfs_file_iomap_begin() got massively
> refactored in the 5.5 merge and I hadn't updated my cscope trees. SO
> I'm guessing you want to go looking for the
> xfs_buffered_write_iomap_begin() and add another case to this
> initial branch:
> 
>         /* we can't use delayed allocations when using extent size hints */
>         if (xfs_get_extsz_hint(ip))
>                 return xfs_direct_write_iomap_begin(inode, offset, count,
>                                 flags, iomap, srcmap);
> 
> To make the buffered write IO go down the direct IO allocation path...

Makes it even simpler! Something like this:


commit 1783722cd4b7088a3c004462c7ae610b8e42b720
Author: Jens Axboe <axboe@kernel.dk>
Date:   Tue Dec 17 07:30:04 2019 -0700

    xfs: don't do delayed allocations for uncached buffered writes
    
    This data is going to be written immediately, so don't bother trying
    to do delayed allocation for it.
    
    Suggested-by: Dave Chinner <david@fromorbit.com>
    Signed-off-by: Jens Axboe <axboe@kernel.dk>

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 28e2d1f37267..d0cd4a05d59f 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -847,8 +847,11 @@ xfs_buffered_write_iomap_begin(
 	int			allocfork = XFS_DATA_FORK;
 	int			error = 0;
 
-	/* we can't use delayed allocations when using extent size hints */
-	if (xfs_get_extsz_hint(ip))
+	/*
+	 * Don't do delayed allocations when using extent size hints, or
+	 * if we were asked to do uncached buffered writes.
+	 */
+	if (xfs_get_extsz_hint(ip) || (flags & IOMAP_UNCACHED))
 		return xfs_direct_write_iomap_begin(inode, offset, count,
 				flags, iomap, srcmap);
Dave Chinner Dec. 18, 2019, 12:49 a.m. UTC | #8
On Tue, Dec 17, 2019 at 07:31:51AM -0700, Jens Axboe wrote:
> On 12/15/19 9:17 PM, Dave Chinner wrote:
> > On Thu, Dec 12, 2019 at 05:57:57PM -0700, Jens Axboe wrote:
> >> On 12/12/19 5:54 PM, Jens Axboe wrote:
> >>> On 12/12/19 3:34 PM, Dave Chinner wrote:
> >>>> Just a thought on further optimisation for this for XFS.
> >>>> IOMAP_UNCACHED is being passed into the filesystem ->iomap_begin
> >>>> methods by iomap_apply().  Hence the filesystems know that it is
> >>>> an uncached IO that is being done, and we can tailor allocation
> >>>> strategies to suit the fact that the data is going to be written
> >>>> immediately.
> >>>>
> >>>> In this case, XFS needs to treat it the same way it treats direct
> >>>> IO. That is, we do immediate unwritten extent allocation rather than
> >>>> delayed allocation. This will reduce the allocation overhead and
> >>>> will optimise for immediate IO locality rather than optimise for
> >>>> delayed allocation.
> >>>>
> >>>> This should just be a relatively simple change to
> >>>> xfs_file_iomap_begin() along the lines of:
> >>>>
> >>>> -	if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && !(flags & IOMAP_DIRECT) &&
> >>>> -			!IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
> >>>> +	if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) &&
> >>>> +	    !(flags & (IOMAP_DIRECT | IOMAP_UNCACHED)) &&
> >>>> +	    !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
> >>>> 		/* Reserve delalloc blocks for regular writeback. */
> >>>> 		return xfs_file_iomap_begin_delay(inode, offset, length, flags,
> >>>> 				iomap);
> >>>> 	}
> >>>>
> >>>> so that it avoids delayed allocation for uncached IO...
> >>>
> >>> That's very handy! Thanks, I'll add that to the next version. Just out
> >>> of curiosity, would you prefer this as a separate patch, or just bundle
> >>> it with the iomap buffered RWF_UNCACHED patch? I'm assuming the latter,
> >>> and I'll just mention it in the changelog.
> >>
> >> OK, since it's in XFS, it'd be a separate patch.
> > 
> > *nod*
> > 
> >> The code you quote seems
> >> to be something out-of-tree?
> > 
> > Ah, I quoted the code in the 5.4 release branch, not the 5.5-rc1
> > tree. I'd forgotten that the xfs_file_iomap_begin() got massively
> > refactored in the 5.5 merge and I hadn't updated my cscope trees. SO
> > I'm guessing you want to go looking for the
> > xfs_buffered_write_iomap_begin() and add another case to this
> > initial branch:
> > 
> >         /* we can't use delayed allocations when using extent size hints */
> >         if (xfs_get_extsz_hint(ip))
> >                 return xfs_direct_write_iomap_begin(inode, offset, count,
> >                                 flags, iomap, srcmap);
> > 
> > To make the buffered write IO go down the direct IO allocation path...
> 
> Makes it even simpler! Something like this:
> 
> 
> commit 1783722cd4b7088a3c004462c7ae610b8e42b720
> Author: Jens Axboe <axboe@kernel.dk>
> Date:   Tue Dec 17 07:30:04 2019 -0700
> 
>     xfs: don't do delayed allocations for uncached buffered writes
>     
>     This data is going to be written immediately, so don't bother trying
>     to do delayed allocation for it.
>     
>     Suggested-by: Dave Chinner <david@fromorbit.com>
>     Signed-off-by: Jens Axboe <axboe@kernel.dk>
> 
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 28e2d1f37267..d0cd4a05d59f 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -847,8 +847,11 @@ xfs_buffered_write_iomap_begin(
>  	int			allocfork = XFS_DATA_FORK;
>  	int			error = 0;
>  
> -	/* we can't use delayed allocations when using extent size hints */
> -	if (xfs_get_extsz_hint(ip))
> +	/*
> +	 * Don't do delayed allocations when using extent size hints, or
> +	 * if we were asked to do uncached buffered writes.
> +	 */
> +	if (xfs_get_extsz_hint(ip) || (flags & IOMAP_UNCACHED))
>  		return xfs_direct_write_iomap_begin(inode, offset, count,
>  				flags, iomap, srcmap);
>  

Yup, that's pretty much what I was thinking. :)

Cheers,

Dave.
Jens Axboe Dec. 18, 2019, 1:01 a.m. UTC | #9
On 12/17/19 5:49 PM, Dave Chinner wrote:
> On Tue, Dec 17, 2019 at 07:31:51AM -0700, Jens Axboe wrote:
>> On 12/15/19 9:17 PM, Dave Chinner wrote:
>>> On Thu, Dec 12, 2019 at 05:57:57PM -0700, Jens Axboe wrote:
>>>> On 12/12/19 5:54 PM, Jens Axboe wrote:
>>>>> On 12/12/19 3:34 PM, Dave Chinner wrote:
>>>>>> Just a thought on further optimisation for this for XFS.
>>>>>> IOMAP_UNCACHED is being passed into the filesystem ->iomap_begin
>>>>>> methods by iomap_apply().  Hence the filesystems know that it is
>>>>>> an uncached IO that is being done, and we can tailor allocation
>>>>>> strategies to suit the fact that the data is going to be written
>>>>>> immediately.
>>>>>>
>>>>>> In this case, XFS needs to treat it the same way it treats direct
>>>>>> IO. That is, we do immediate unwritten extent allocation rather than
>>>>>> delayed allocation. This will reduce the allocation overhead and
>>>>>> will optimise for immediate IO locality rather than optimise for
>>>>>> delayed allocation.
>>>>>>
>>>>>> This should just be a relatively simple change to
>>>>>> xfs_file_iomap_begin() along the lines of:
>>>>>>
>>>>>> -	if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && !(flags & IOMAP_DIRECT) &&
>>>>>> -			!IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
>>>>>> +	if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) &&
>>>>>> +	    !(flags & (IOMAP_DIRECT | IOMAP_UNCACHED)) &&
>>>>>> +	    !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
>>>>>> 		/* Reserve delalloc blocks for regular writeback. */
>>>>>> 		return xfs_file_iomap_begin_delay(inode, offset, length, flags,
>>>>>> 				iomap);
>>>>>> 	}
>>>>>>
>>>>>> so that it avoids delayed allocation for uncached IO...
>>>>>
>>>>> That's very handy! Thanks, I'll add that to the next version. Just out
>>>>> of curiosity, would you prefer this as a separate patch, or just bundle
>>>>> it with the iomap buffered RWF_UNCACHED patch? I'm assuming the latter,
>>>>> and I'll just mention it in the changelog.
>>>>
>>>> OK, since it's in XFS, it'd be a separate patch.
>>>
>>> *nod*
>>>
>>>> The code you quote seems
>>>> to be something out-of-tree?
>>>
>>> Ah, I quoted the code in the 5.4 release branch, not the 5.5-rc1
>>> tree. I'd forgotten that the xfs_file_iomap_begin() got massively
>>> refactored in the 5.5 merge and I hadn't updated my cscope trees. SO
>>> I'm guessing you want to go looking for the
>>> xfs_buffered_write_iomap_begin() and add another case to this
>>> initial branch:
>>>
>>>         /* we can't use delayed allocations when using extent size hints */
>>>         if (xfs_get_extsz_hint(ip))
>>>                 return xfs_direct_write_iomap_begin(inode, offset, count,
>>>                                 flags, iomap, srcmap);
>>>
>>> To make the buffered write IO go down the direct IO allocation path...
>>
>> Makes it even simpler! Something like this:
>>
>>
>> commit 1783722cd4b7088a3c004462c7ae610b8e42b720
>> Author: Jens Axboe <axboe@kernel.dk>
>> Date:   Tue Dec 17 07:30:04 2019 -0700
>>
>>     xfs: don't do delayed allocations for uncached buffered writes
>>     
>>     This data is going to be written immediately, so don't bother trying
>>     to do delayed allocation for it.
>>     
>>     Suggested-by: Dave Chinner <david@fromorbit.com>
>>     Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>
>> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
>> index 28e2d1f37267..d0cd4a05d59f 100644
>> --- a/fs/xfs/xfs_iomap.c
>> +++ b/fs/xfs/xfs_iomap.c
>> @@ -847,8 +847,11 @@ xfs_buffered_write_iomap_begin(
>>  	int			allocfork = XFS_DATA_FORK;
>>  	int			error = 0;
>>  
>> -	/* we can't use delayed allocations when using extent size hints */
>> -	if (xfs_get_extsz_hint(ip))
>> +	/*
>> +	 * Don't do delayed allocations when using extent size hints, or
>> +	 * if we were asked to do uncached buffered writes.
>> +	 */
>> +	if (xfs_get_extsz_hint(ip) || (flags & IOMAP_UNCACHED))
>>  		return xfs_direct_write_iomap_begin(inode, offset, count,
>>  				flags, iomap, srcmap);
>>  
> 
> Yup, that's pretty much what I was thinking. :)

Perfect, thanks for checking!

Patch
diff mbox series

diff --git a/fs/iomap/apply.c b/fs/iomap/apply.c
index 562536da8a13..966826ad4bb9 100644
--- a/fs/iomap/apply.c
+++ b/fs/iomap/apply.c
@@ -90,5 +90,29 @@  iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
 				     flags, &iomap);
 	}
 
+	if (written && (flags & IOMAP_UNCACHED)) {
+		struct address_space *mapping = inode->i_mapping;
+
+		end = pos + written;
+		ret = filemap_write_and_wait_range(mapping, pos, end);
+		if (ret)
+			goto out;
+
+		/*
+		 * No pages were created for this range, we're done
+		 */
+		if (!(iomap.flags & IOMAP_F_PAGE_CREATE))
+			goto out;
+
+		/*
+		 * Try to invalidate cache pages for the range we just wrote.
+		 * We don't care if invalidation fails as the write has still
+		 * worked and leaving clean uptodate pages in the page cache
+		 * isn't a corruption vector for uncached IO.
+		 */
+		invalidate_inode_pages2_range(mapping,
+				pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
+	}
+out:
 	return written ? written : ret;
 }
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 9b5b770ca4c7..09440f114506 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -566,6 +566,7 @@  EXPORT_SYMBOL_GPL(iomap_migrate_page);
 
 enum {
 	IOMAP_WRITE_F_UNSHARE		= (1 << 0),
+	IOMAP_WRITE_F_UNCACHED		= (1 << 1),
 };
 
 static void
@@ -643,6 +644,7 @@  iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
 		struct page **pagep, struct iomap *iomap, struct iomap *srcmap)
 {
 	const struct iomap_page_ops *page_ops = iomap->page_ops;
+	unsigned aop_flags;
 	struct page *page;
 	int status = 0;
 
@@ -659,8 +661,11 @@  iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
 			return status;
 	}
 
+	aop_flags = AOP_FLAG_NOFS;
+	if (flags & IOMAP_UNCACHED)
+		aop_flags |= AOP_FLAG_UNCACHED;
 	page = grab_cache_page_write_begin(inode->i_mapping, pos >> PAGE_SHIFT,
-			AOP_FLAG_NOFS);
+						aop_flags);
 	if (!page) {
 		status = -ENOMEM;
 		goto out_no_page;
@@ -670,9 +675,14 @@  iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
 		iomap_read_inline_data(inode, page, srcmap);
 	else if (iomap->flags & IOMAP_F_BUFFER_HEAD)
 		status = __block_write_begin_int(page, pos, len, NULL, srcmap);
-	else
-		status = __iomap_write_begin(inode, pos, len, flags, page,
+	else {
+		unsigned wb_flags = 0;
+
+		if (flags & IOMAP_UNCACHED)
+			wb_flags = IOMAP_WRITE_F_UNCACHED;
+		status = __iomap_write_begin(inode, pos, len, wb_flags, page,
 				srcmap);
+	}
 
 	if (unlikely(status))
 		goto out_unlock;
@@ -832,10 +842,17 @@  iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 			break;
 		}
 
-		status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap,
-				srcmap);
-		if (unlikely(status))
+retry:
+		status = iomap_write_begin(inode, pos, bytes, flags, &page,
+						iomap, srcmap);
+		if (unlikely(status)) {
+			if (status == -ENOMEM && (flags & IOMAP_UNCACHED)) {
+				iomap->flags |= IOMAP_F_PAGE_CREATE;
+				flags &= ~IOMAP_UNCACHED;
+				goto retry;
+			}
 			break;
+		}
 
 		if (mapping_writably_mapped(inode->i_mapping))
 			flush_dcache_page(page);
@@ -882,10 +899,14 @@  iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
 {
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
 	loff_t pos = iocb->ki_pos, ret = 0, written = 0;
+	unsigned flags = IOMAP_WRITE;
+
+	if (iocb->ki_flags & IOCB_UNCACHED)
+		flags |= IOMAP_UNCACHED;
 
 	while (iov_iter_count(iter)) {
-		ret = iomap_apply(inode, pos, iov_iter_count(iter),
-				IOMAP_WRITE, ops, iter, iomap_write_actor);
+		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags,
+					ops, iter, iomap_write_actor);
 		if (ret <= 0)
 			break;
 		pos += ret;
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 61fcaa3904d4..b5b5cf781eea 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -48,12 +48,16 @@  struct vm_fault;
  *
  * IOMAP_F_BUFFER_HEAD indicates that the file system requires the use of
  * buffer heads for this mapping.
+ *
+ * IOMAP_F_PAGE_CREATE indicates that pages had to be allocated to satisfy
+ * this operation.
  */
 #define IOMAP_F_NEW		0x01
 #define IOMAP_F_DIRTY		0x02
 #define IOMAP_F_SHARED		0x04
 #define IOMAP_F_MERGED		0x08
 #define IOMAP_F_BUFFER_HEAD	0x10
+#define IOMAP_F_PAGE_CREATE	0x20
 
 /*
  * Flags set by the core iomap code during operations:
@@ -121,6 +125,7 @@  struct iomap_page_ops {
 #define IOMAP_FAULT		(1 << 3) /* mapping for page fault */
 #define IOMAP_DIRECT		(1 << 4) /* direct I/O */
 #define IOMAP_NOWAIT		(1 << 5) /* do not block */
+#define IOMAP_UNCACHED		(1 << 6)
 
 struct iomap_ops {
 	/*