
[v2] fs: Fix page cache inconsistency when mixing buffered and AIO DIO

Message ID 1500046823-25256-1-git-send-email-lczerner@redhat.com (mailing list archive)
State New, archived

Commit Message

Lukas Czerner July 14, 2017, 3:40 p.m. UTC
Currently when mixing buffered reads and asynchronous direct writes it
is possible to end up in a situation where we have stale data in the
page cache while the new data is already written to disk. This persists
until the affected pages are flushed away. Despite the fact that mixing
buffered and direct IO is ill-advised, it does pose a threat to data
integrity, is unexpected and should be fixed.

Fix this by deferring completion of asynchronous direct writes to
process context whenever the inode has pages in its page cache. Then,
before completing in dio_complete(), invalidate the pages in question.
This ensures that after completion the pages in the written range are
either unmapped or populated with up-to-date data. Do the same for the
iomap case, which uses iomap_dio_complete() instead.

This has the side effect of deferring completion to process context for
every AIO DIO that happens on an inode with cached pages. However, since
the consensus is that mixing buffered and direct IO is an ill-advised
practice, the performance implications should not be a problem.

This was based on a proposal from Jeff Moyer, thanks!

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
---
v2: Remove leftover ret variable from invalidate call in iomap_dio_complete

 fs/direct-io.c | 31 ++++++++++++++++++++++++++-----
 fs/iomap.c     |  7 +++++++
 2 files changed, 33 insertions(+), 5 deletions(-)

Comments

Jan Kara July 17, 2017, 3:12 p.m. UTC | #1
On Fri 14-07-17 17:40:23, Lukas Czerner wrote:
> Currently when mixing buffered reads and asynchronous direct writes it
> is possible to end up with the situation where we have stale data in the
> page cache while the new data is already written to disk. This is
> permanent until the affected pages are flushed away. Despite the fact
> that mixing buffered and direct IO is ill-advised it does pose a thread
> for a data integrity, is unexpected and should be fixed.
> 
> Fix this by deferring completion of asynchronous direct writes to a
> process context in the case that there are mapped pages to be found in
> the inode. Later before the completion in dio_complete() invalidate
> the pages in question. This ensures that after the completion the pages
> in the written area are either unmapped, or populated with up-to-date
> data. Also do the same for the iomap case which uses
> iomap_dio_complete() instead.
> 
> This has a side effect of deferring the completion to a process context
> for every AIO DIO that happens on inode that has pages mapped. However
> since the consensus is that this is ill-advised practice the performance
> implication should not be a problem.
> 
> This was based on proposal from Jeff Moyer, thanks!
> 
> Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> Cc: Jeff Moyer <jmoyer@redhat.com>

OK, this looks like it could work. Some comments below.

>  fs/direct-io.c | 31 ++++++++++++++++++++++++++-----
>  fs/iomap.c     |  7 +++++++
>  2 files changed, 33 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/direct-io.c b/fs/direct-io.c
> index 08cf278..2db9ada 100644
> --- a/fs/direct-io.c
> +++ b/fs/direct-io.c
> @@ -258,6 +258,11 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
>  	if (ret == 0)
>  		ret = transferred;
>  
> +	if ((dio->op == REQ_OP_WRITE && dio->inode->i_mapping->nrpages))

Superfluous braces here... Also you should not call
invalidate_inode_pages2_range() in case of error I suppose.

> +		invalidate_inode_pages2_range(dio->inode->i_mapping,
> +					offset >> PAGE_SHIFT,
> +					(offset + ret - 1) >> PAGE_SHIFT);
> +
>  	if (dio->end_io) {
>  		int err;
>  
> @@ -304,6 +309,7 @@ static void dio_bio_end_aio(struct bio *bio)
>  	struct dio *dio = bio->bi_private;
>  	unsigned long remaining;
>  	unsigned long flags;
> +	bool defer_completion = false;
>  
>  	/* cleanup the bio */
>  	dio_bio_complete(dio, bio);
> @@ -315,7 +321,19 @@ static void dio_bio_end_aio(struct bio *bio)
>  	spin_unlock_irqrestore(&dio->bio_lock, flags);
>  
>  	if (remaining == 0) {
> -		if (dio->result && dio->defer_completion) {
> +		/*
> +		 * Defer completion when defer_completion is set or
> +		 * when the inode has pages mapped and this is AIO write.
> +		 * We need to invalidate those pages because there is a
> +		 * chance they contain stale data in the case buffered IO
> +		 * went in between AIO submission and completion into the
> +		 * same region.
> +		 */
> +		if (dio->result)
> +			defer_completion = dio->defer_completion ||
> +					   (dio->op == REQ_OP_WRITE &&
> +					    dio->inode->i_mapping->nrpages);
> +		if (defer_completion) {
>  			INIT_WORK(&dio->complete_work, dio_aio_complete_work);
>  			queue_work(dio->inode->i_sb->s_dio_done_wq,
>  				   &dio->complete_work);
> @@ -1210,10 +1228,13 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
>  	 * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
>  	 * so that we can call ->fsync.
>  	 */
> -	if (dio->is_async && iov_iter_rw(iter) == WRITE &&
> -	    ((iocb->ki_filp->f_flags & O_DSYNC) ||
> -	     IS_SYNC(iocb->ki_filp->f_mapping->host))) {
> -		retval = dio_set_defer_completion(dio);
> +	if (dio->is_async && iov_iter_rw(iter) == WRITE) {
> +		retval = 0;
> +		if ((iocb->ki_filp->f_flags & O_DSYNC) ||
> +		    IS_SYNC(iocb->ki_filp->f_mapping->host))
> +			retval = dio_set_defer_completion(dio);
> +		else if (!dio->inode->i_sb->s_dio_done_wq)
> +			retval = sb_init_dio_done_wq(dio->inode->i_sb);

Please add a comment explaining why sb_init_dio_done_wq() is needed here.
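
One possible wording, as a sketch only (the final comment is of course up
to the author):

        else if (!dio->inode->i_sb->s_dio_done_wq)
                /*
                 * In case of an AIO write racing with buffered reads we
                 * may end up deferring completion at dio_bio_end_aio()
                 * time so that stale page cache pages can be invalidated
                 * from process context; make sure the completion
                 * workqueue exists even without O_(D)SYNC.
                 */
                retval = sb_init_dio_done_wq(dio->inode->i_sb);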

>  		if (retval) {
>  			/*
>  			 * We grab i_mutex only for reads so we don't have
> diff --git a/fs/iomap.c b/fs/iomap.c
> index 1732228..3baeed2 100644
> --- a/fs/iomap.c
> +++ b/fs/iomap.c
> @@ -713,8 +713,15 @@ struct iomap_dio {
>  static ssize_t iomap_dio_complete(struct iomap_dio *dio)
>  {
>  	struct kiocb *iocb = dio->iocb;
> +	loff_t offset = iocb->ki_pos;
> +	struct inode *inode = file_inode(iocb->ki_filp);
>  	ssize_t ret;
>  
> +	if ((dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages)

Again I don't think you want to invalidate pages in case DIO failed with an
error...

> +		invalidate_inode_pages2_range(inode->i_mapping,
> +				offset >> PAGE_SHIFT,
> +				(offset + dio->size - 1) >> PAGE_SHIFT);
> +
>  	if (dio->end_io) {
>  		ret = dio->end_io(iocb,
>  				dio->error ? dio->error : dio->size,

								Honza
Lukas Czerner July 17, 2017, 3:28 p.m. UTC | #2
On Mon, Jul 17, 2017 at 05:12:28PM +0200, Jan Kara wrote:
> On Fri 14-07-17 17:40:23, Lukas Czerner wrote:
> > Currently when mixing buffered reads and asynchronous direct writes it
> > is possible to end up with the situation where we have stale data in the
> > page cache while the new data is already written to disk. This is
> > permanent until the affected pages are flushed away. Despite the fact
> > that mixing buffered and direct IO is ill-advised it does pose a thread
> > for a data integrity, is unexpected and should be fixed.
> > 
> > Fix this by deferring completion of asynchronous direct writes to a
> > process context in the case that there are mapped pages to be found in
> > the inode. Later before the completion in dio_complete() invalidate
> > the pages in question. This ensures that after the completion the pages
> > in the written area are either unmapped, or populated with up-to-date
> > data. Also do the same for the iomap case which uses
> > iomap_dio_complete() instead.
> > 
> > This has a side effect of deferring the completion to a process context
> > for every AIO DIO that happens on inode that has pages mapped. However
> > since the consensus is that this is ill-advised practice the performance
> > implication should not be a problem.
> > 
> > This was based on proposal from Jeff Moyer, thanks!
> > 
> > Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> > Cc: Jeff Moyer <jmoyer@redhat.com>
> 
> OK, this looks like it could work. Some comments below.
> 
> >  fs/direct-io.c | 31 ++++++++++++++++++++++++++-----
> >  fs/iomap.c     |  7 +++++++
> >  2 files changed, 33 insertions(+), 5 deletions(-)
> > 
> > diff --git a/fs/direct-io.c b/fs/direct-io.c
> > index 08cf278..2db9ada 100644
> > --- a/fs/direct-io.c
> > +++ b/fs/direct-io.c
> > @@ -258,6 +258,11 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
> >  	if (ret == 0)
> >  		ret = transferred;
> >  
> > +	if ((dio->op == REQ_OP_WRITE && dio->inode->i_mapping->nrpages))
> 
> Superfluous braces here... Also you should not call
> invalidate_inode_pages2_range() in case of error I suppose.

Sure, I'll fix the braces.

About the error case, is it not possible that some data has already been
written to the disk despite the error?

Thanks!
-Lukas

> 
> > +		invalidate_inode_pages2_range(dio->inode->i_mapping,
> > +					offset >> PAGE_SHIFT,
> > +					(offset + ret - 1) >> PAGE_SHIFT);
> > +
> >  	if (dio->end_io) {
> >  		int err;
> >  
> > @@ -304,6 +309,7 @@ static void dio_bio_end_aio(struct bio *bio)
> >  	struct dio *dio = bio->bi_private;
> >  	unsigned long remaining;
> >  	unsigned long flags;
> > +	bool defer_completion = false;
> >  
> >  	/* cleanup the bio */
> >  	dio_bio_complete(dio, bio);
> > @@ -315,7 +321,19 @@ static void dio_bio_end_aio(struct bio *bio)
> >  	spin_unlock_irqrestore(&dio->bio_lock, flags);
> >  
> >  	if (remaining == 0) {
> > -		if (dio->result && dio->defer_completion) {
> > +		/*
> > +		 * Defer completion when defer_completion is set or
> > +		 * when the inode has pages mapped and this is AIO write.
> > +		 * We need to invalidate those pages because there is a
> > +		 * chance they contain stale data in the case buffered IO
> > +		 * went in between AIO submission and completion into the
> > +		 * same region.
> > +		 */
> > +		if (dio->result)
> > +			defer_completion = dio->defer_completion ||
> > +					   (dio->op == REQ_OP_WRITE &&
> > +					    dio->inode->i_mapping->nrpages);
> > +		if (defer_completion) {
> >  			INIT_WORK(&dio->complete_work, dio_aio_complete_work);
> >  			queue_work(dio->inode->i_sb->s_dio_done_wq,
> >  				   &dio->complete_work);
> > @@ -1210,10 +1228,13 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
> >  	 * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
> >  	 * so that we can call ->fsync.
> >  	 */
> > -	if (dio->is_async && iov_iter_rw(iter) == WRITE &&
> > -	    ((iocb->ki_filp->f_flags & O_DSYNC) ||
> > -	     IS_SYNC(iocb->ki_filp->f_mapping->host))) {
> > -		retval = dio_set_defer_completion(dio);
> > +	if (dio->is_async && iov_iter_rw(iter) == WRITE) {
> > +		retval = 0;
> > +		if ((iocb->ki_filp->f_flags & O_DSYNC) ||
> > +		    IS_SYNC(iocb->ki_filp->f_mapping->host))
> > +			retval = dio_set_defer_completion(dio);
> > +		else if (!dio->inode->i_sb->s_dio_done_wq)
> > +			retval = sb_init_dio_done_wq(dio->inode->i_sb);
> 
> Please add a comment explaining why sb_init_dio_done_wq() is needed here.

ok, thanks.

> 
> >  		if (retval) {
> >  			/*
> >  			 * We grab i_mutex only for reads so we don't have
> > diff --git a/fs/iomap.c b/fs/iomap.c
> > index 1732228..3baeed2 100644
> > --- a/fs/iomap.c
> > +++ b/fs/iomap.c
> > @@ -713,8 +713,15 @@ struct iomap_dio {
> >  static ssize_t iomap_dio_complete(struct iomap_dio *dio)
> >  {
> >  	struct kiocb *iocb = dio->iocb;
> > +	loff_t offset = iocb->ki_pos;
> > +	struct inode *inode = file_inode(iocb->ki_filp);
> >  	ssize_t ret;
> >  
> > +	if ((dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages)
> 
> Again I don't think you want to invalidate pages in case DIO failed with an
> error...
> 
> > +		invalidate_inode_pages2_range(inode->i_mapping,
> > +				offset >> PAGE_SHIFT,
> > +				(offset + dio->size - 1) >> PAGE_SHIFT);
> > +
> >  	if (dio->end_io) {
> >  		ret = dio->end_io(iocb,
> >  				dio->error ? dio->error : dio->size,
> 
> 								Honza
> 
> -- 
> Jan Kara <jack@suse.com>
> SUSE Labs, CR
Jeff Moyer July 17, 2017, 3:39 p.m. UTC | #3
Lukas Czerner <lczerner@redhat.com> writes:

> About the error case, is it not possible that some data has already been
> writtent to the disk despite the error ?

Yes, it's possible.  However, that data is in an inconsistent state, so
it shouldn't be read, anyway.

Now, in the non-async path, we do the invalidation unconditionally, so I
could go either way on this.  I don't think it's going to matter for
performance or data integrity.

Cheers,
Jeff
Jan Kara July 17, 2017, 4:17 p.m. UTC | #4
On Mon 17-07-17 11:39:09, Jeff Moyer wrote:
> Lukas Czerner <lczerner@redhat.com> writes:
> 
> > About the error case, is it not possible that some data has already been
> > writtent to the disk despite the error ?
> 
> Yes, it's possible.  However, that data is in an inconsistent state, so
> it shouldn't be read, anyway.
> 
> Now, in the non-async path, we do the invalidation unconditionally, so I
> could go either way on this.  I don't think it's going to matter for
> performance or data integrity.

Well, at least 'ret' would be negative in the error case so arguments
passed to invalidate_inode_pages2_range() would be bogus if I'm reading the
code right...
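
For example, with offset == 0 and ret == -EIO, (offset + ret - 1) >>
PAGE_SHIFT evaluates to -1, which wraps to a huge pgoff_t in the 'end'
argument, so the call would effectively try to invalidate the whole
mapping.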

								Honza
Jeff Moyer July 17, 2017, 7:52 p.m. UTC | #5
Jan Kara <jack@suse.cz> writes:

> On Mon 17-07-17 11:39:09, Jeff Moyer wrote:
>> Lukas Czerner <lczerner@redhat.com> writes:
>> 
>> > About the error case, is it not possible that some data has already been
>> > writtent to the disk despite the error ?
>> 
>> Yes, it's possible.  However, that data is in an inconsistent state, so
>> it shouldn't be read, anyway.
>> 
>> Now, in the non-async path, we do the invalidation unconditionally, so I
>> could go either way on this.  I don't think it's going to matter for
>> performance or data integrity.
>
> Well, at least 'ret' would be negative in the error case so arguments
> passed to invalidate_inode_pages2_range() would be bogus if I'm reading the
> code right...

Ah, yes.  Sorry, I was commenting on the more general point.  You are
correct: in dio_complete(), ret could be set to dio->page_errors or
dio->io_error.  So yes, that needs to be checked.

-Jeff
Lukas Czerner July 18, 2017, 7:39 a.m. UTC | #6
On Mon, Jul 17, 2017 at 11:39:09AM -0400, Jeff Moyer wrote:
> Lukas Czerner <lczerner@redhat.com> writes:
> 
> > About the error case, is it not possible that some data has already been
> > writtent to the disk despite the error ?
> 
> Yes, it's possible.  However, that data is in an inconsistent state, so
> it shouldn't be read, anyway.

I think it can be read if we wrote into already allocated space.

> 
> Now, in the non-async path, we do the invalidation unconditionally, so I
> could go either way on this.  I don't think it's going to matter for
> performance or data integrity.

That's part of the reason why I did it unconditionally as well; however,
Jan is right that ret would be negative. The way to fix it would differ
depending on whether I am right about reading partially written data
from an AIO that failed. We still want to invalidate in that case.

-Lukas

> 
> Cheers,
> Jeff
Jan Kara July 18, 2017, 9:06 a.m. UTC | #7
On Tue 18-07-17 09:39:35, Lukas Czerner wrote:
> On Mon, Jul 17, 2017 at 11:39:09AM -0400, Jeff Moyer wrote:
> > Lukas Czerner <lczerner@redhat.com> writes:
> > 
> > > About the error case, is it not possible that some data has already been
> > > writtent to the disk despite the error ?
> > 
> > Yes, it's possible.  However, that data is in an inconsistent state, so
> > it shouldn't be read, anyway.
> 
> I think it can be read if we wrote into already allocated space.
> 
> > 
> > Now, in the non-async path, we do the invalidation unconditionally, so I
> > could go either way on this.  I don't think it's going to matter for
> > performance or data integrity.
> 
> That's part of the reason why I did it unconditionaly as well, however
> Jan is right that ret would be negative. The way to fix it would differ
> depending on whether I am right about reading partially written data
> from AIO that failed. We still want to invalidate in that case.

Frankly, I don't think it really matters, so I'd go for not invalidating
anything on error, just out of the philosophy: "There's something weird
going on, bail out as quickly as you can."

								Honza
Lukas Czerner July 18, 2017, 9:32 a.m. UTC | #8
On Tue, Jul 18, 2017 at 11:06:26AM +0200, Jan Kara wrote:
> On Tue 18-07-17 09:39:35, Lukas Czerner wrote:
> > On Mon, Jul 17, 2017 at 11:39:09AM -0400, Jeff Moyer wrote:
> > > Lukas Czerner <lczerner@redhat.com> writes:
> > > 
> > > > About the error case, is it not possible that some data has already been
> > > > writtent to the disk despite the error ?
> > > 
> > > Yes, it's possible.  However, that data is in an inconsistent state, so
> > > it shouldn't be read, anyway.
> > 
> > I think it can be read if we wrote into already allocated space.
> > 
> > > 
> > > Now, in the non-async path, we do the invalidation unconditionally, so I
> > > could go either way on this.  I don't think it's going to matter for
> > > performance or data integrity.
> > 
> > That's part of the reason why I did it unconditionaly as well, however
> > Jan is right that ret would be negative. The way to fix it would differ
> > depending on whether I am right about reading partially written data
> > from AIO that failed. We still want to invalidate in that case.
> 
> Frankly, I don't think it really matters so I'd go for not invalidating
> anything on error just out of philosophy: "There's something weird going
> on, bail out as quickly as you can."
> 
> 								Honza

Fair enough, thanks!
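
So for v3 the dio_complete() invalidation would be guarded roughly like
this (a sketch only; the final form may differ):

        if (ret > 0 && dio->op == REQ_OP_WRITE &&
            dio->inode->i_mapping->nrpages)
                invalidate_inode_pages2_range(dio->inode->i_mapping,
                                        offset >> PAGE_SHIFT,
                                        (offset + ret - 1) >> PAGE_SHIFT);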

-Lukas

> -- 
> Jan Kara <jack@suse.com>
> SUSE Labs, CR

Patch

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 08cf278..2db9ada 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -258,6 +258,11 @@  static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
 	if (ret == 0)
 		ret = transferred;
 
+	if ((dio->op == REQ_OP_WRITE && dio->inode->i_mapping->nrpages))
+		invalidate_inode_pages2_range(dio->inode->i_mapping,
+					offset >> PAGE_SHIFT,
+					(offset + ret - 1) >> PAGE_SHIFT);
+
 	if (dio->end_io) {
 		int err;
 
@@ -304,6 +309,7 @@  static void dio_bio_end_aio(struct bio *bio)
 	struct dio *dio = bio->bi_private;
 	unsigned long remaining;
 	unsigned long flags;
+	bool defer_completion = false;
 
 	/* cleanup the bio */
 	dio_bio_complete(dio, bio);
@@ -315,7 +321,19 @@  static void dio_bio_end_aio(struct bio *bio)
 	spin_unlock_irqrestore(&dio->bio_lock, flags);
 
 	if (remaining == 0) {
-		if (dio->result && dio->defer_completion) {
+		/*
+		 * Defer completion when defer_completion is set or
+		 * when the inode has pages mapped and this is AIO write.
+		 * We need to invalidate those pages because there is a
+		 * chance they contain stale data in the case buffered IO
+		 * went in between AIO submission and completion into the
+		 * same region.
+		 */
+		if (dio->result)
+			defer_completion = dio->defer_completion ||
+					   (dio->op == REQ_OP_WRITE &&
+					    dio->inode->i_mapping->nrpages);
+		if (defer_completion) {
 			INIT_WORK(&dio->complete_work, dio_aio_complete_work);
 			queue_work(dio->inode->i_sb->s_dio_done_wq,
 				   &dio->complete_work);
@@ -1210,10 +1228,13 @@  do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 	 * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
 	 * so that we can call ->fsync.
 	 */
-	if (dio->is_async && iov_iter_rw(iter) == WRITE &&
-	    ((iocb->ki_filp->f_flags & O_DSYNC) ||
-	     IS_SYNC(iocb->ki_filp->f_mapping->host))) {
-		retval = dio_set_defer_completion(dio);
+	if (dio->is_async && iov_iter_rw(iter) == WRITE) {
+		retval = 0;
+		if ((iocb->ki_filp->f_flags & O_DSYNC) ||
+		    IS_SYNC(iocb->ki_filp->f_mapping->host))
+			retval = dio_set_defer_completion(dio);
+		else if (!dio->inode->i_sb->s_dio_done_wq)
+			retval = sb_init_dio_done_wq(dio->inode->i_sb);
 		if (retval) {
 			/*
 			 * We grab i_mutex only for reads so we don't have
diff --git a/fs/iomap.c b/fs/iomap.c
index 1732228..3baeed2 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -713,8 +713,15 @@  struct iomap_dio {
 static ssize_t iomap_dio_complete(struct iomap_dio *dio)
 {
 	struct kiocb *iocb = dio->iocb;
+	loff_t offset = iocb->ki_pos;
+	struct inode *inode = file_inode(iocb->ki_filp);
 	ssize_t ret;
 
+	if ((dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages)
+		invalidate_inode_pages2_range(inode->i_mapping,
+				offset >> PAGE_SHIFT,
+				(offset + dio->size - 1) >> PAGE_SHIFT);
+
 	if (dio->end_io) {
 		ret = dio->end_io(iocb,
 				dio->error ? dio->error : dio->size,