diff mbox

[v5] fs: Fix page cache inconsistency when mixing buffered and AIO DIO

Message ID 1500463692-4982-1-git-send-email-lczerner@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Lukas Czerner July 19, 2017, 11:28 a.m. UTC
Currently when mixing buffered reads and asynchronous direct writes it
is possible to end up with the situation where we have stale data in the
page cache while the new data is already written to disk. This is
permanent until the affected pages are flushed away. Despite the fact
that mixing buffered and direct IO is ill-advised it does pose a threat
to data integrity, is unexpected and should be fixed.

Fix this by deferring completion of asynchronous direct writes to a
process context in the case that there are mapped pages to be found in
the inode. Later before the completion in dio_complete() invalidate
the pages in question. This ensures that after the completion the pages
in the written area are either unmapped, or populated with up-to-date
data. Also do the same for the iomap case which uses
iomap_dio_complete() instead.

This has a side effect of deferring the completion to a process context
for every AIO DIO that happens on an inode that has pages mapped. However
since the consensus is that this is ill-advised practice the performance
implication should not be a problem.

This was based on a proposal from Jeff Moyer, thanks!

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
---
v2: Remove leftover ret variable from invalidate call in iomap_dio_complete
v3: Do not invalidate in case of error. Add some comments
v4: Remove unnecessary variable, remove unnecessary inner braces
v5: Style changes

 fs/direct-io.c | 39 ++++++++++++++++++++++++++++++++++-----
 fs/iomap.c     |  7 +++++++
 2 files changed, 41 insertions(+), 5 deletions(-)

Comments

Jan Kara July 19, 2017, 11:37 a.m. UTC | #1
On Wed 19-07-17 13:28:12, Lukas Czerner wrote:
> Currently when mixing buffered reads and asynchronous direct writes it
> is possible to end up with the situation where we have stale data in the
> page cache while the new data is already written to disk. This is
> permanent until the affected pages are flushed away. Despite the fact
> that mixing buffered and direct IO is ill-advised it does pose a thread
> for a data integrity, is unexpected and should be fixed.
> 
> Fix this by deferring completion of asynchronous direct writes to a
> process context in the case that there are mapped pages to be found in
> the inode. Later before the completion in dio_complete() invalidate
> the pages in question. This ensures that after the completion the pages
> in the written area are either unmapped, or populated with up-to-date
> data. Also do the same for the iomap case which uses
> iomap_dio_complete() instead.
> 
> This has a side effect of deferring the completion to a process context
> for every AIO DIO that happens on inode that has pages mapped. However
> since the consensus is that this is ill-advised practice the performance
> implication should not be a problem.
> 
> This was based on proposal from Jeff Moyer, thanks!
> 
> Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> Cc: Jeff Moyer <jmoyer@redhat.com>

You forgot to add my Reviewed-by tag. So feel free to add it now:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
> v2: Remove leftover ret variable from invalidate call in iomap_dio_complete
> v3: Do not invalidate in case of error. Add some coments
> v4: Remove unnecessary variable, remove unnecessary inner braces
> v5: Style changes
> 
>  fs/direct-io.c | 39 ++++++++++++++++++++++++++++++++++-----
>  fs/iomap.c     |  7 +++++++
>  2 files changed, 41 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/direct-io.c b/fs/direct-io.c
> index 08cf278..0d1befd 100644
> --- a/fs/direct-io.c
> +++ b/fs/direct-io.c
> @@ -258,6 +258,13 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
>  	if (ret == 0)
>  		ret = transferred;
>  
> +	if (ret > 0 && dio->op == REQ_OP_WRITE &&
> +	    dio->inode->i_mapping->nrpages) {
> +		invalidate_inode_pages2_range(dio->inode->i_mapping,
> +					offset >> PAGE_SHIFT,
> +					(offset + ret - 1) >> PAGE_SHIFT);
> +	}
> +
>  	if (dio->end_io) {
>  		int err;
>  
> @@ -304,6 +311,7 @@ static void dio_bio_end_aio(struct bio *bio)
>  	struct dio *dio = bio->bi_private;
>  	unsigned long remaining;
>  	unsigned long flags;
> +	bool defer_completion = false;
>  
>  	/* cleanup the bio */
>  	dio_bio_complete(dio, bio);
> @@ -315,7 +323,19 @@ static void dio_bio_end_aio(struct bio *bio)
>  	spin_unlock_irqrestore(&dio->bio_lock, flags);
>  
>  	if (remaining == 0) {
> -		if (dio->result && dio->defer_completion) {
> +		/*
> +		 * Defer completion when defer_completion is set or
> +		 * when the inode has pages mapped and this is AIO write.
> +		 * We need to invalidate those pages because there is a
> +		 * chance they contain stale data in the case buffered IO
> +		 * went in between AIO submission and completion into the
> +		 * same region.
> +		 */
> +		if (dio->result)
> +			defer_completion = dio->defer_completion ||
> +					   (dio->op == REQ_OP_WRITE &&
> +					    dio->inode->i_mapping->nrpages);
> +		if (defer_completion) {
>  			INIT_WORK(&dio->complete_work, dio_aio_complete_work);
>  			queue_work(dio->inode->i_sb->s_dio_done_wq,
>  				   &dio->complete_work);
> @@ -1210,10 +1230,19 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
>  	 * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
>  	 * so that we can call ->fsync.
>  	 */
> -	if (dio->is_async && iov_iter_rw(iter) == WRITE &&
> -	    ((iocb->ki_filp->f_flags & O_DSYNC) ||
> -	     IS_SYNC(iocb->ki_filp->f_mapping->host))) {
> -		retval = dio_set_defer_completion(dio);
> +	if (dio->is_async && iov_iter_rw(iter) == WRITE) {
> +		retval = 0;
> +		if ((iocb->ki_filp->f_flags & O_DSYNC) ||
> +		    IS_SYNC(iocb->ki_filp->f_mapping->host))
> +			retval = dio_set_defer_completion(dio);
> +		else if (!dio->inode->i_sb->s_dio_done_wq) {
> +			/*
> +			 * In case of AIO write racing with buffered read we
> +			 * need to defer completion. We can't decide this now,
> +			 * however the workqueue needs to be initialized here.
> +			 */
> +			retval = sb_init_dio_done_wq(dio->inode->i_sb);
> +		}
>  		if (retval) {
>  			/*
>  			 * We grab i_mutex only for reads so we don't have
> diff --git a/fs/iomap.c b/fs/iomap.c
> index 1732228..144512e 100644
> --- a/fs/iomap.c
> +++ b/fs/iomap.c
> @@ -713,8 +713,15 @@ struct iomap_dio {
>  static ssize_t iomap_dio_complete(struct iomap_dio *dio)
>  {
>  	struct kiocb *iocb = dio->iocb;
> +	struct inode *inode = file_inode(iocb->ki_filp);
>  	ssize_t ret;
>  
> +	if (!dio->error &&
> +	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages)
> +		invalidate_inode_pages2_range(inode->i_mapping,
> +				iocb->ki_pos >> PAGE_SHIFT,
> +				(iocb->ki_pos + dio->size - 1) >> PAGE_SHIFT);
> +
>  	if (dio->end_io) {
>  		ret = dio->end_io(iocb,
>  				dio->error ? dio->error : dio->size,
> -- 
> 2.7.5
>
Jeff Moyer July 19, 2017, 12:17 p.m. UTC | #2
Lukas Czerner <lczerner@redhat.com> writes:

> Currently when mixing buffered reads and asynchronous direct writes it
> is possible to end up with the situation where we have stale data in the
> page cache while the new data is already written to disk. This is
> permanent until the affected pages are flushed away. Despite the fact
> that mixing buffered and direct IO is ill-advised it does pose a thread
> for a data integrity, is unexpected and should be fixed.
>
> Fix this by deferring completion of asynchronous direct writes to a
> process context in the case that there are mapped pages to be found in
> the inode. Later before the completion in dio_complete() invalidate
> the pages in question. This ensures that after the completion the pages
> in the written area are either unmapped, or populated with up-to-date
> data. Also do the same for the iomap case which uses
> iomap_dio_complete() instead.
>
> This has a side effect of deferring the completion to a process context
> for every AIO DIO that happens on inode that has pages mapped. However
> since the consensus is that this is ill-advised practice the performance
> implication should not be a problem.
>
> This was based on proposal from Jeff Moyer, thanks!
>
> Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> Cc: Jeff Moyer <jmoyer@redhat.com>

Looks good, Lukas!  Thanks!

Reviewed-by: Jeff Moyer <jmoyer@redhat.com>


> ---
> v2: Remove leftover ret variable from invalidate call in iomap_dio_complete
> v3: Do not invalidate in case of error. Add some coments
> v4: Remove unnecessary variable, remove unnecessary inner braces
> v5: Style changes
>
>  fs/direct-io.c | 39 ++++++++++++++++++++++++++++++++++-----
>  fs/iomap.c     |  7 +++++++
>  2 files changed, 41 insertions(+), 5 deletions(-)
>
> diff --git a/fs/direct-io.c b/fs/direct-io.c
> index 08cf278..0d1befd 100644
> --- a/fs/direct-io.c
> +++ b/fs/direct-io.c
> @@ -258,6 +258,13 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
>  	if (ret == 0)
>  		ret = transferred;
>  
> +	if (ret > 0 && dio->op == REQ_OP_WRITE &&
> +	    dio->inode->i_mapping->nrpages) {
> +		invalidate_inode_pages2_range(dio->inode->i_mapping,
> +					offset >> PAGE_SHIFT,
> +					(offset + ret - 1) >> PAGE_SHIFT);
> +	}
> +
>  	if (dio->end_io) {
>  		int err;
>  
> @@ -304,6 +311,7 @@ static void dio_bio_end_aio(struct bio *bio)
>  	struct dio *dio = bio->bi_private;
>  	unsigned long remaining;
>  	unsigned long flags;
> +	bool defer_completion = false;
>  
>  	/* cleanup the bio */
>  	dio_bio_complete(dio, bio);
> @@ -315,7 +323,19 @@ static void dio_bio_end_aio(struct bio *bio)
>  	spin_unlock_irqrestore(&dio->bio_lock, flags);
>  
>  	if (remaining == 0) {
> -		if (dio->result && dio->defer_completion) {
> +		/*
> +		 * Defer completion when defer_completion is set or
> +		 * when the inode has pages mapped and this is AIO write.
> +		 * We need to invalidate those pages because there is a
> +		 * chance they contain stale data in the case buffered IO
> +		 * went in between AIO submission and completion into the
> +		 * same region.
> +		 */
> +		if (dio->result)
> +			defer_completion = dio->defer_completion ||
> +					   (dio->op == REQ_OP_WRITE &&
> +					    dio->inode->i_mapping->nrpages);
> +		if (defer_completion) {
>  			INIT_WORK(&dio->complete_work, dio_aio_complete_work);
>  			queue_work(dio->inode->i_sb->s_dio_done_wq,
>  				   &dio->complete_work);
> @@ -1210,10 +1230,19 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
>  	 * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
>  	 * so that we can call ->fsync.
>  	 */
> -	if (dio->is_async && iov_iter_rw(iter) == WRITE &&
> -	    ((iocb->ki_filp->f_flags & O_DSYNC) ||
> -	     IS_SYNC(iocb->ki_filp->f_mapping->host))) {
> -		retval = dio_set_defer_completion(dio);
> +	if (dio->is_async && iov_iter_rw(iter) == WRITE) {
> +		retval = 0;
> +		if ((iocb->ki_filp->f_flags & O_DSYNC) ||
> +		    IS_SYNC(iocb->ki_filp->f_mapping->host))
> +			retval = dio_set_defer_completion(dio);
> +		else if (!dio->inode->i_sb->s_dio_done_wq) {
> +			/*
> +			 * In case of AIO write racing with buffered read we
> +			 * need to defer completion. We can't decide this now,
> +			 * however the workqueue needs to be initialized here.
> +			 */
> +			retval = sb_init_dio_done_wq(dio->inode->i_sb);
> +		}
>  		if (retval) {
>  			/*
>  			 * We grab i_mutex only for reads so we don't have
> diff --git a/fs/iomap.c b/fs/iomap.c
> index 1732228..144512e 100644
> --- a/fs/iomap.c
> +++ b/fs/iomap.c
> @@ -713,8 +713,15 @@ struct iomap_dio {
>  static ssize_t iomap_dio_complete(struct iomap_dio *dio)
>  {
>  	struct kiocb *iocb = dio->iocb;
> +	struct inode *inode = file_inode(iocb->ki_filp);
>  	ssize_t ret;
>  
> +	if (!dio->error &&
> +	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages)
> +		invalidate_inode_pages2_range(inode->i_mapping,
> +				iocb->ki_pos >> PAGE_SHIFT,
> +				(iocb->ki_pos + dio->size - 1) >> PAGE_SHIFT);
> +
>  	if (dio->end_io) {
>  		ret = dio->end_io(iocb,
>  				dio->error ? dio->error : dio->size,
Jeff Moyer Aug. 3, 2017, 6:10 p.m. UTC | #3
Al, would you mind taking this in through your tree?  It's been reviewed
by myself and Jan in this mail thread.

Thanks!
Jeff

Lukas Czerner <lczerner@redhat.com> writes:

> Currently when mixing buffered reads and asynchronous direct writes it
> is possible to end up with the situation where we have stale data in the
> page cache while the new data is already written to disk. This is
> permanent until the affected pages are flushed away. Despite the fact
> that mixing buffered and direct IO is ill-advised it does pose a thread
> for a data integrity, is unexpected and should be fixed.
>
> Fix this by deferring completion of asynchronous direct writes to a
> process context in the case that there are mapped pages to be found in
> the inode. Later before the completion in dio_complete() invalidate
> the pages in question. This ensures that after the completion the pages
> in the written area are either unmapped, or populated with up-to-date
> data. Also do the same for the iomap case which uses
> iomap_dio_complete() instead.
>
> This has a side effect of deferring the completion to a process context
> for every AIO DIO that happens on inode that has pages mapped. However
> since the consensus is that this is ill-advised practice the performance
> implication should not be a problem.
>
> This was based on proposal from Jeff Moyer, thanks!
>
> Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> Cc: Jeff Moyer <jmoyer@redhat.com>
> ---
> v2: Remove leftover ret variable from invalidate call in iomap_dio_complete
> v3: Do not invalidate in case of error. Add some coments
> v4: Remove unnecessary variable, remove unnecessary inner braces
> v5: Style changes
>
>  fs/direct-io.c | 39 ++++++++++++++++++++++++++++++++++-----
>  fs/iomap.c     |  7 +++++++
>  2 files changed, 41 insertions(+), 5 deletions(-)
>
> diff --git a/fs/direct-io.c b/fs/direct-io.c
> index 08cf278..0d1befd 100644
> --- a/fs/direct-io.c
> +++ b/fs/direct-io.c
> @@ -258,6 +258,13 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
>  	if (ret == 0)
>  		ret = transferred;
>  
> +	if (ret > 0 && dio->op == REQ_OP_WRITE &&
> +	    dio->inode->i_mapping->nrpages) {
> +		invalidate_inode_pages2_range(dio->inode->i_mapping,
> +					offset >> PAGE_SHIFT,
> +					(offset + ret - 1) >> PAGE_SHIFT);
> +	}
> +
>  	if (dio->end_io) {
>  		int err;
>  
> @@ -304,6 +311,7 @@ static void dio_bio_end_aio(struct bio *bio)
>  	struct dio *dio = bio->bi_private;
>  	unsigned long remaining;
>  	unsigned long flags;
> +	bool defer_completion = false;
>  
>  	/* cleanup the bio */
>  	dio_bio_complete(dio, bio);
> @@ -315,7 +323,19 @@ static void dio_bio_end_aio(struct bio *bio)
>  	spin_unlock_irqrestore(&dio->bio_lock, flags);
>  
>  	if (remaining == 0) {
> -		if (dio->result && dio->defer_completion) {
> +		/*
> +		 * Defer completion when defer_completion is set or
> +		 * when the inode has pages mapped and this is AIO write.
> +		 * We need to invalidate those pages because there is a
> +		 * chance they contain stale data in the case buffered IO
> +		 * went in between AIO submission and completion into the
> +		 * same region.
> +		 */
> +		if (dio->result)
> +			defer_completion = dio->defer_completion ||
> +					   (dio->op == REQ_OP_WRITE &&
> +					    dio->inode->i_mapping->nrpages);
> +		if (defer_completion) {
>  			INIT_WORK(&dio->complete_work, dio_aio_complete_work);
>  			queue_work(dio->inode->i_sb->s_dio_done_wq,
>  				   &dio->complete_work);
> @@ -1210,10 +1230,19 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
>  	 * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
>  	 * so that we can call ->fsync.
>  	 */
> -	if (dio->is_async && iov_iter_rw(iter) == WRITE &&
> -	    ((iocb->ki_filp->f_flags & O_DSYNC) ||
> -	     IS_SYNC(iocb->ki_filp->f_mapping->host))) {
> -		retval = dio_set_defer_completion(dio);
> +	if (dio->is_async && iov_iter_rw(iter) == WRITE) {
> +		retval = 0;
> +		if ((iocb->ki_filp->f_flags & O_DSYNC) ||
> +		    IS_SYNC(iocb->ki_filp->f_mapping->host))
> +			retval = dio_set_defer_completion(dio);
> +		else if (!dio->inode->i_sb->s_dio_done_wq) {
> +			/*
> +			 * In case of AIO write racing with buffered read we
> +			 * need to defer completion. We can't decide this now,
> +			 * however the workqueue needs to be initialized here.
> +			 */
> +			retval = sb_init_dio_done_wq(dio->inode->i_sb);
> +		}
>  		if (retval) {
>  			/*
>  			 * We grab i_mutex only for reads so we don't have
> diff --git a/fs/iomap.c b/fs/iomap.c
> index 1732228..144512e 100644
> --- a/fs/iomap.c
> +++ b/fs/iomap.c
> @@ -713,8 +713,15 @@ struct iomap_dio {
>  static ssize_t iomap_dio_complete(struct iomap_dio *dio)
>  {
>  	struct kiocb *iocb = dio->iocb;
> +	struct inode *inode = file_inode(iocb->ki_filp);
>  	ssize_t ret;
>  
> +	if (!dio->error &&
> +	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages)
> +		invalidate_inode_pages2_range(inode->i_mapping,
> +				iocb->ki_pos >> PAGE_SHIFT,
> +				(iocb->ki_pos + dio->size - 1) >> PAGE_SHIFT);
> +
>  	if (dio->end_io) {
>  		ret = dio->end_io(iocb,
>  				dio->error ? dio->error : dio->size,
Dave Chinner Aug. 4, 2017, 10:09 a.m. UTC | #4
On Thu, Aug 03, 2017 at 02:10:47PM -0400, Jeff Moyer wrote:
> Al, would you mind taking this in through your tree?  It's been reviewed
> by myself and Jan in this mail thread.

Still needs more fixing, I think?

Sorry, this is the first time I've seen this patch....

> > diff --git a/fs/iomap.c b/fs/iomap.c
> > index 1732228..144512e 100644
> > --- a/fs/iomap.c
> > +++ b/fs/iomap.c
> > @@ -713,8 +713,15 @@ struct iomap_dio {
> >  static ssize_t iomap_dio_complete(struct iomap_dio *dio)
> >  {
> >  	struct kiocb *iocb = dio->iocb;
> > +	struct inode *inode = file_inode(iocb->ki_filp);
> >  	ssize_t ret;
> >  
> > +	if (!dio->error &&
> > +	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages)
> > +		invalidate_inode_pages2_range(inode->i_mapping,
> > +				iocb->ki_pos >> PAGE_SHIFT,
> > +				(iocb->ki_pos + dio->size - 1) >> PAGE_SHIFT);
> > +
> >  	if (dio->end_io) {
> >  		ret = dio->end_io(iocb,
> >  				dio->error ? dio->error : dio->size,
> 

This invalidation is already run in iomap_dio_rw() for the sync IO
case directly after the call to iomap_dio_complete().  It also has a
comment to explain exactly why the the invalidation is needed, and
it issues a warning to dmesg if the invalidation fails to indicate
the reason why the user is reporting data corruption to us. i.e.:

.....
        ret = iomap_dio_complete(dio);

        /*
         * Try again to invalidate clean pages which might have been cached by
         * non-direct readahead, or faulted in by get_user_pages() if the source
         * of the write was an mmap'ed region of the file we're writing.  Either
         * one is a pretty crazy thing to do, so we don't support it 100%.  If
         * this invalidation fails, tough, the write still worked...
         */
        if (iov_iter_rw(iter) == WRITE) {
                int err = invalidate_inode_pages2_range(mapping,
                                start >> PAGE_SHIFT, end >> PAGE_SHIFT);
                WARN_ON_ONCE(err);
        }

        return ret;

If we're going to replace this with an invalidation in
iomap_dio_complete() so it also handles the AIO path, then the
comment and warning on invalidation failure also need to be moved to
iomap_dio_complete() and the duplicate code removed from
iomap_dio_rw()...

Cheers,

Dave.
Jeff Moyer Aug. 7, 2017, 3:52 p.m. UTC | #5
Dave Chinner <david@fromorbit.com> writes:

> On Thu, Aug 03, 2017 at 02:10:47PM -0400, Jeff Moyer wrote:
>> Al, would you mind taking this in through your tree?  It's been reviewed
>> by myself and Jan in this mail thread.
>
> Still needs more fixing, I think?
>
> Sorry, this is the first time I've seen this patch....
>
>> > diff --git a/fs/iomap.c b/fs/iomap.c
>> > index 1732228..144512e 100644
>> > --- a/fs/iomap.c
>> > +++ b/fs/iomap.c
>> > @@ -713,8 +713,15 @@ struct iomap_dio {
>> >  static ssize_t iomap_dio_complete(struct iomap_dio *dio)
>> >  {
>> >  	struct kiocb *iocb = dio->iocb;
>> > +	struct inode *inode = file_inode(iocb->ki_filp);
>> >  	ssize_t ret;
>> >  
>> > +	if (!dio->error &&
>> > +	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages)
>> > +		invalidate_inode_pages2_range(inode->i_mapping,
>> > +				iocb->ki_pos >> PAGE_SHIFT,
>> > +				(iocb->ki_pos + dio->size - 1) >> PAGE_SHIFT);
>> > +
>> >  	if (dio->end_io) {
>> >  		ret = dio->end_io(iocb,
>> >  				dio->error ? dio->error : dio->size,
>> 
>
> This invalidation is already run in iomap_dio_rw() for the sync IO
> case directly after the call to iomap_dio_complete().  It also has a
> comment to explain exactly why the the invalidation is needed, and
> it issues a warning to dmesg if the invalidation fails to indicate
> the reason why the user is reporting data corruption to us. i.e.:
>
> .....
>         ret = iomap_dio_complete(dio);
>
>         /*
>          * Try again to invalidate clean pages which might have been cached by
>          * non-direct readahead, or faulted in by get_user_pages() if the source
>          * of the write was an mmap'ed region of the file we're writing.  Either
>          * one is a pretty crazy thing to do, so we don't support it 100%.  If
>          * this invalidation fails, tough, the write still worked...
>          */
>         if (iov_iter_rw(iter) == WRITE) {
>                 int err = invalidate_inode_pages2_range(mapping,
>                                 start >> PAGE_SHIFT, end >> PAGE_SHIFT);
>                 WARN_ON_ONCE(err);
>         }
>
>         return ret;
>
> If we're going to replace this with an invalidation in
> iomap_dio_complete() so it also handles the AIO path, then the
> comment and warning on invalidation failure also need to be moved to
> iomap_dio_complete() and the duplicate code removed from
> iomap_dio_rw()...

Yep, good catch.  Lukas, care to respin?

-Jeff
Lukas Czerner Aug. 8, 2017, 8:41 a.m. UTC | #6
On Mon, Aug 07, 2017 at 11:52:45AM -0400, Jeff Moyer wrote:
> Dave Chinner <david@fromorbit.com> writes:
> 
> > On Thu, Aug 03, 2017 at 02:10:47PM -0400, Jeff Moyer wrote:
> >> Al, would you mind taking this in through your tree?  It's been reviewed
> >> by myself and Jan in this mail thread.
> >
> > Still needs more fixing, I think?
> >
> > Sorry, this is the first time I've seen this patch....
> >
> >> > diff --git a/fs/iomap.c b/fs/iomap.c
> >> > index 1732228..144512e 100644
> >> > --- a/fs/iomap.c
> >> > +++ b/fs/iomap.c
> >> > @@ -713,8 +713,15 @@ struct iomap_dio {
> >> >  static ssize_t iomap_dio_complete(struct iomap_dio *dio)
> >> >  {
> >> >  	struct kiocb *iocb = dio->iocb;
> >> > +	struct inode *inode = file_inode(iocb->ki_filp);
> >> >  	ssize_t ret;
> >> >  
> >> > +	if (!dio->error &&
> >> > +	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages)
> >> > +		invalidate_inode_pages2_range(inode->i_mapping,
> >> > +				iocb->ki_pos >> PAGE_SHIFT,
> >> > +				(iocb->ki_pos + dio->size - 1) >> PAGE_SHIFT);
> >> > +
> >> >  	if (dio->end_io) {
> >> >  		ret = dio->end_io(iocb,
> >> >  				dio->error ? dio->error : dio->size,
> >> 
> >
> > This invalidation is already run in iomap_dio_rw() for the sync IO
> > case directly after the call to iomap_dio_complete().  It also has a
> > comment to explain exactly why the the invalidation is needed, and
> > it issues a warning to dmesg if the invalidation fails to indicate
> > the reason why the user is reporting data corruption to us. i.e.:
> >
> > .....
> >         ret = iomap_dio_complete(dio);
> >
> >         /*
> >          * Try again to invalidate clean pages which might have been cached by
> >          * non-direct readahead, or faulted in by get_user_pages() if the source
> >          * of the write was an mmap'ed region of the file we're writing.  Either
> >          * one is a pretty crazy thing to do, so we don't support it 100%.  If
> >          * this invalidation fails, tough, the write still worked...
> >          */
> >         if (iov_iter_rw(iter) == WRITE) {
> >                 int err = invalidate_inode_pages2_range(mapping,
> >                                 start >> PAGE_SHIFT, end >> PAGE_SHIFT);
> >                 WARN_ON_ONCE(err);
> >         }
> >
> >         return ret;
> >
> > If we're going to replace this with an invalidation in
> > iomap_dio_complete() so it also handles the AIO path, then the
> > comment and warning on invalidation failure also need to be moved to
> > iomap_dio_complete() and the duplicate code removed from
> > iomap_dio_rw()...
> 
> Yep, good catch.  Lukas, care to respin?

Of course, I'll respin.

-Lukas

> 
> -Jeff
diff mbox

Patch

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 08cf278..0d1befd 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -258,6 +258,13 @@  static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
 	if (ret == 0)
 		ret = transferred;
 
+	if (ret > 0 && dio->op == REQ_OP_WRITE &&
+	    dio->inode->i_mapping->nrpages) {
+		invalidate_inode_pages2_range(dio->inode->i_mapping,
+					offset >> PAGE_SHIFT,
+					(offset + ret - 1) >> PAGE_SHIFT);
+	}
+
 	if (dio->end_io) {
 		int err;
 
@@ -304,6 +311,7 @@  static void dio_bio_end_aio(struct bio *bio)
 	struct dio *dio = bio->bi_private;
 	unsigned long remaining;
 	unsigned long flags;
+	bool defer_completion = false;
 
 	/* cleanup the bio */
 	dio_bio_complete(dio, bio);
@@ -315,7 +323,19 @@  static void dio_bio_end_aio(struct bio *bio)
 	spin_unlock_irqrestore(&dio->bio_lock, flags);
 
 	if (remaining == 0) {
-		if (dio->result && dio->defer_completion) {
+		/*
+		 * Defer completion when defer_completion is set or
+		 * when the inode has pages mapped and this is AIO write.
+		 * We need to invalidate those pages because there is a
+		 * chance they contain stale data in the case buffered IO
+		 * went in between AIO submission and completion into the
+		 * same region.
+		 */
+		if (dio->result)
+			defer_completion = dio->defer_completion ||
+					   (dio->op == REQ_OP_WRITE &&
+					    dio->inode->i_mapping->nrpages);
+		if (defer_completion) {
 			INIT_WORK(&dio->complete_work, dio_aio_complete_work);
 			queue_work(dio->inode->i_sb->s_dio_done_wq,
 				   &dio->complete_work);
@@ -1210,10 +1230,19 @@  do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 	 * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
 	 * so that we can call ->fsync.
 	 */
-	if (dio->is_async && iov_iter_rw(iter) == WRITE &&
-	    ((iocb->ki_filp->f_flags & O_DSYNC) ||
-	     IS_SYNC(iocb->ki_filp->f_mapping->host))) {
-		retval = dio_set_defer_completion(dio);
+	if (dio->is_async && iov_iter_rw(iter) == WRITE) {
+		retval = 0;
+		if ((iocb->ki_filp->f_flags & O_DSYNC) ||
+		    IS_SYNC(iocb->ki_filp->f_mapping->host))
+			retval = dio_set_defer_completion(dio);
+		else if (!dio->inode->i_sb->s_dio_done_wq) {
+			/*
+			 * In case of AIO write racing with buffered read we
+			 * need to defer completion. We can't decide this now,
+			 * however the workqueue needs to be initialized here.
+			 */
+			retval = sb_init_dio_done_wq(dio->inode->i_sb);
+		}
 		if (retval) {
 			/*
 			 * We grab i_mutex only for reads so we don't have
diff --git a/fs/iomap.c b/fs/iomap.c
index 1732228..144512e 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -713,8 +713,15 @@  struct iomap_dio {
 static ssize_t iomap_dio_complete(struct iomap_dio *dio)
 {
 	struct kiocb *iocb = dio->iocb;
+	struct inode *inode = file_inode(iocb->ki_filp);
 	ssize_t ret;
 
+	if (!dio->error &&
+	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages)
+		invalidate_inode_pages2_range(inode->i_mapping,
+				iocb->ki_pos >> PAGE_SHIFT,
+				(iocb->ki_pos + dio->size - 1) >> PAGE_SHIFT);
+
 	if (dio->end_io) {
 		ret = dio->end_io(iocb,
 				dio->error ? dio->error : dio->size,