diff mbox

fs: Avoid invalidation in interrupt context in dio_complete()

Message ID 1508231484-30966-1-git-send-email-lczerner@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Lukas Czerner Oct. 17, 2017, 9:11 a.m. UTC
Currently we try to defer completion of async DIO to the process context
in case there are any mapped pages associated with the inode so that we
can invalidate the pages when the IO completes. However the check is racy
and the pages can be mapped afterwards. If this happens we might end up
calling invalidate_inode_pages2_range(), which can sleep, from dio_complete()
in interrupt context. This can be reproduced by generic/451.

Fix this by passing the information about whether it is safe to invalidate
the pages to dio_complete(). Thanks to Eryu Guan for reporting this and to
Jan Kara for suggesting a fix.

Fixes: 332391a9935d ("fs: Fix page cache inconsistency when mixing buffered and AIO DIO")
Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Reported-by: Eryu Guan <eguan@redhat.com>
---
 fs/direct-io.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

Comments

Jan Kara Oct. 17, 2017, 9:53 a.m. UTC | #1
On Tue 17-10-17 11:11:24, Lukas Czerner wrote:
> Currently we try to defer completion of async DIO to the process context
> in case there are any mapped pages associated with the inode so that we
> can invalidate the pages when the IO completes. However the check is racy
> and the pages can be mapped afterwards. If this happens we might end up
> calling invalidate_inode_pages2_range() in dio_complete() in interrupt
> context which could sleep. This can be reproduced by generic/451.
> 
> Fix this by passing the information whether we can or can't invalidate
> to the dio_complete(). Thanks Eryu Guan for reporting this and Jan Kara
> for suggesting a fix.
> 
> Fixes: 332391a9935d ("fs: Fix page cache inconsistency when mixing buffered and AIO DIO")
> Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> Reported-by: Eryu Guan <eguan@redhat.com>

Looks good to me. You can add:

Reviewed-by: Jan Kara <jack@suse.cz>

BTW, I guess you should CC someone who can merge the patch...

								Honza

> ---
>  fs/direct-io.c | 19 +++++++++++++------
>  1 file changed, 13 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/direct-io.c b/fs/direct-io.c
> index 96415c6..5632548 100644
> --- a/fs/direct-io.c
> +++ b/fs/direct-io.c
> @@ -45,6 +45,12 @@
>  #define DIO_PAGES	64
>  
>  /*
> + * Flags for dio_complete()
> + */
> +#define DIO_COMPLETE_ASYNC		0x01	/* This is async IO */
> +#define DIO_COMPLETE_INVALIDATE		0x02	/* Can invalidate pages */
> +
> +/*
>   * This code generally works in units of "dio_blocks".  A dio_block is
>   * somewhere between the hard sector size and the filesystem block size.  it
>   * is determined on a per-invocation basis.   When talking to the filesystem
> @@ -225,7 +231,7 @@ static inline struct page *dio_get_page(struct dio *dio,
>   * filesystems can use it to hold additional state between get_block calls and
>   * dio_complete.
>   */
> -static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
> +static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
>  {
>  	loff_t offset = dio->iocb->ki_pos;
>  	ssize_t transferred = 0;
> @@ -266,7 +272,8 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
>  	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
>  	 * this invalidation fails, tough, the write still worked...
>  	 */
> -	if (ret > 0 && dio->op == REQ_OP_WRITE &&
> +	if (flags & DIO_COMPLETE_INVALIDATE &&
> +	    ret > 0 && dio->op == REQ_OP_WRITE &&
>  	    dio->inode->i_mapping->nrpages) {
>  		err = invalidate_inode_pages2_range(dio->inode->i_mapping,
>  					offset >> PAGE_SHIFT,
> @@ -285,7 +292,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
>  	if (!(dio->flags & DIO_SKIP_DIO_COUNT))
>  		inode_dio_end(dio->inode);
>  
> -	if (is_async) {
> +	if (flags & DIO_COMPLETE_ASYNC) {
>  		/*
>  		 * generic_write_sync expects ki_pos to have been updated
>  		 * already, but the submission path only does this for
> @@ -306,7 +313,7 @@ static void dio_aio_complete_work(struct work_struct *work)
>  {
>  	struct dio *dio = container_of(work, struct dio, complete_work);
>  
> -	dio_complete(dio, 0, true);
> +	dio_complete(dio, 0, DIO_COMPLETE_ASYNC | DIO_COMPLETE_INVALIDATE);
>  }
>  
>  static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio);
> @@ -348,7 +355,7 @@ static void dio_bio_end_aio(struct bio *bio)
>  			queue_work(dio->inode->i_sb->s_dio_done_wq,
>  				   &dio->complete_work);
>  		} else {
> -			dio_complete(dio, 0, true);
> +			dio_complete(dio, 0, DIO_COMPLETE_ASYNC);
>  		}
>  	}
>  }
> @@ -1360,7 +1367,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
>  		dio_await_completion(dio);
>  
>  	if (drop_refcount(dio) == 0) {
> -		retval = dio_complete(dio, retval, false);
> +		retval = dio_complete(dio, retval, DIO_COMPLETE_INVALIDATE);
>  	} else
>  		BUG_ON(retval != -EIOCBQUEUED);
>  
> -- 
> 2.7.5
>
Eryu Guan Oct. 17, 2017, 10:41 a.m. UTC | #2
On Tue, Oct 17, 2017 at 11:53:56AM +0200, Jan Kara wrote:
> On Tue 17-10-17 11:11:24, Lukas Czerner wrote:
> > Currently we try to defer completion of async DIO to the process context
> > in case there are any mapped pages associated with the inode so that we
> > can invalidate the pages when the IO completes. However the check is racy
> > and the pages can be mapped afterwards. If this happens we might end up
> > calling invalidate_inode_pages2_range() in dio_complete() in interrupt
> > context which could sleep. This can be reproduced by generic/451.
> > 
> > Fix this by passing the information whether we can or can't invalidate
> > to the dio_complete(). Thanks Eryu Guan for reporting this and Jan Kara
> > for suggesting a fix.
> > 
> > Fixes: 332391a9935d ("fs: Fix page cache inconsistency when mixing buffered and AIO DIO")
> > Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> > Reported-by: Eryu Guan <eguan@redhat.com>

generic/451 on ext4 survived 15 iterations with this patch applied on
top of v4.14-rc5.

Tested-by: Eryu Guan <eguan@redhat.com>

> 
> Looks good to me. You can add:
> 
> Reviewed-by: Jan Kara <jack@suse.cz>
> 
> BTW, I guess you should CC someone who can merge the patch...
> 
> 								Honza

One problem is that this patch conflicts with my patch "fs: invalidate
page cache after end_io() in dio completion", I think it might be easier
for both patches go through the same tree.

My patch has been taken by Darrick via xfs tree, because usually iomap
changes go through xfs tree, I'm not sure if Darrick would like to take
this patch too, as there's nothing xfs or iomap related.. Anyway, add
Darrick to cc list too :)

Thanks,
Eryu

> 
> > ---
> >  fs/direct-io.c | 19 +++++++++++++------
> >  1 file changed, 13 insertions(+), 6 deletions(-)
> > 
> > diff --git a/fs/direct-io.c b/fs/direct-io.c
> > index 96415c6..5632548 100644
> > --- a/fs/direct-io.c
> > +++ b/fs/direct-io.c
> > @@ -45,6 +45,12 @@
> >  #define DIO_PAGES	64
> >  
> >  /*
> > + * Flags for dio_complete()
> > + */
> > +#define DIO_COMPLETE_ASYNC		0x01	/* This is async IO */
> > +#define DIO_COMPLETE_INVALIDATE		0x02	/* Can invalidate pages */
> > +
> > +/*
> >   * This code generally works in units of "dio_blocks".  A dio_block is
> >   * somewhere between the hard sector size and the filesystem block size.  it
> >   * is determined on a per-invocation basis.   When talking to the filesystem
> > @@ -225,7 +231,7 @@ static inline struct page *dio_get_page(struct dio *dio,
> >   * filesystems can use it to hold additional state between get_block calls and
> >   * dio_complete.
> >   */
> > -static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
> > +static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
> >  {
> >  	loff_t offset = dio->iocb->ki_pos;
> >  	ssize_t transferred = 0;
> > @@ -266,7 +272,8 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
> >  	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
> >  	 * this invalidation fails, tough, the write still worked...
> >  	 */
> > -	if (ret > 0 && dio->op == REQ_OP_WRITE &&
> > +	if (flags & DIO_COMPLETE_INVALIDATE &&
> > +	    ret > 0 && dio->op == REQ_OP_WRITE &&
> >  	    dio->inode->i_mapping->nrpages) {
> >  		err = invalidate_inode_pages2_range(dio->inode->i_mapping,
> >  					offset >> PAGE_SHIFT,
> > @@ -285,7 +292,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
> >  	if (!(dio->flags & DIO_SKIP_DIO_COUNT))
> >  		inode_dio_end(dio->inode);
> >  
> > -	if (is_async) {
> > +	if (flags & DIO_COMPLETE_ASYNC) {
> >  		/*
> >  		 * generic_write_sync expects ki_pos to have been updated
> >  		 * already, but the submission path only does this for
> > @@ -306,7 +313,7 @@ static void dio_aio_complete_work(struct work_struct *work)
> >  {
> >  	struct dio *dio = container_of(work, struct dio, complete_work);
> >  
> > -	dio_complete(dio, 0, true);
> > +	dio_complete(dio, 0, DIO_COMPLETE_ASYNC | DIO_COMPLETE_INVALIDATE);
> >  }
> >  
> >  static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio);
> > @@ -348,7 +355,7 @@ static void dio_bio_end_aio(struct bio *bio)
> >  			queue_work(dio->inode->i_sb->s_dio_done_wq,
> >  				   &dio->complete_work);
> >  		} else {
> > -			dio_complete(dio, 0, true);
> > +			dio_complete(dio, 0, DIO_COMPLETE_ASYNC);
> >  		}
> >  	}
> >  }
> > @@ -1360,7 +1367,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
> >  		dio_await_completion(dio);
> >  
> >  	if (drop_refcount(dio) == 0) {
> > -		retval = dio_complete(dio, retval, false);
> > +		retval = dio_complete(dio, retval, DIO_COMPLETE_INVALIDATE);
> >  	} else
> >  		BUG_ON(retval != -EIOCBQUEUED);
> >  
> > -- 
> > 2.7.5
> > 
> -- 
> Jan Kara <jack@suse.com>
> SUSE Labs, CR
Lukas Czerner Oct. 17, 2017, 10:44 a.m. UTC | #3
On Tue, Oct 17, 2017 at 11:53:56AM +0200, Jan Kara wrote:
> On Tue 17-10-17 11:11:24, Lukas Czerner wrote:
> > Currently we try to defer completion of async DIO to the process context
> > in case there are any mapped pages associated with the inode so that we
> > can invalidate the pages when the IO completes. However the check is racy
> > and the pages can be mapped afterwards. If this happens we might end up
> > calling invalidate_inode_pages2_range() in dio_complete() in interrupt
> > context which could sleep. This can be reproduced by generic/451.
> > 
> > Fix this by passing the information whether we can or can't invalidate
> > to the dio_complete(). Thanks Eryu Guan for reporting this and Jan Kara
> > for suggesting a fix.
> > 
> > Fixes: 332391a9935d ("fs: Fix page cache inconsistency when mixing buffered and AIO DIO")
> > Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> > Reported-by: Eryu Guan <eguan@redhat.com>
> 
> Looks good to me. You can add:
> 
> Reviewed-by: Jan Kara <jack@suse.cz>
> 
> BTW, I guess you should CC someone who can merge the patch...

Jens, could you please take this through your tree?

Thanks!
-Lukas

> 
> 								Honza
> 
> > ---
> >  fs/direct-io.c | 19 +++++++++++++------
> >  1 file changed, 13 insertions(+), 6 deletions(-)
> > 
> > diff --git a/fs/direct-io.c b/fs/direct-io.c
> > index 96415c6..5632548 100644
> > --- a/fs/direct-io.c
> > +++ b/fs/direct-io.c
> > @@ -45,6 +45,12 @@
> >  #define DIO_PAGES	64
> >  
> >  /*
> > + * Flags for dio_complete()
> > + */
> > +#define DIO_COMPLETE_ASYNC		0x01	/* This is async IO */
> > +#define DIO_COMPLETE_INVALIDATE		0x02	/* Can invalidate pages */
> > +
> > +/*
> >   * This code generally works in units of "dio_blocks".  A dio_block is
> >   * somewhere between the hard sector size and the filesystem block size.  it
> >   * is determined on a per-invocation basis.   When talking to the filesystem
> > @@ -225,7 +231,7 @@ static inline struct page *dio_get_page(struct dio *dio,
> >   * filesystems can use it to hold additional state between get_block calls and
> >   * dio_complete.
> >   */
> > -static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
> > +static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
> >  {
> >  	loff_t offset = dio->iocb->ki_pos;
> >  	ssize_t transferred = 0;
> > @@ -266,7 +272,8 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
> >  	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
> >  	 * this invalidation fails, tough, the write still worked...
> >  	 */
> > -	if (ret > 0 && dio->op == REQ_OP_WRITE &&
> > +	if (flags & DIO_COMPLETE_INVALIDATE &&
> > +	    ret > 0 && dio->op == REQ_OP_WRITE &&
> >  	    dio->inode->i_mapping->nrpages) {
> >  		err = invalidate_inode_pages2_range(dio->inode->i_mapping,
> >  					offset >> PAGE_SHIFT,
> > @@ -285,7 +292,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
> >  	if (!(dio->flags & DIO_SKIP_DIO_COUNT))
> >  		inode_dio_end(dio->inode);
> >  
> > -	if (is_async) {
> > +	if (flags & DIO_COMPLETE_ASYNC) {
> >  		/*
> >  		 * generic_write_sync expects ki_pos to have been updated
> >  		 * already, but the submission path only does this for
> > @@ -306,7 +313,7 @@ static void dio_aio_complete_work(struct work_struct *work)
> >  {
> >  	struct dio *dio = container_of(work, struct dio, complete_work);
> >  
> > -	dio_complete(dio, 0, true);
> > +	dio_complete(dio, 0, DIO_COMPLETE_ASYNC | DIO_COMPLETE_INVALIDATE);
> >  }
> >  
> >  static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio);
> > @@ -348,7 +355,7 @@ static void dio_bio_end_aio(struct bio *bio)
> >  			queue_work(dio->inode->i_sb->s_dio_done_wq,
> >  				   &dio->complete_work);
> >  		} else {
> > -			dio_complete(dio, 0, true);
> > +			dio_complete(dio, 0, DIO_COMPLETE_ASYNC);
> >  		}
> >  	}
> >  }
> > @@ -1360,7 +1367,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
> >  		dio_await_completion(dio);
> >  
> >  	if (drop_refcount(dio) == 0) {
> > -		retval = dio_complete(dio, retval, false);
> > +		retval = dio_complete(dio, retval, DIO_COMPLETE_INVALIDATE);
> >  	} else
> >  		BUG_ON(retval != -EIOCBQUEUED);
> >  
> > -- 
> > 2.7.5
> > 
> -- 
> Jan Kara <jack@suse.com>
> SUSE Labs, CR
Jens Axboe Oct. 17, 2017, 2:43 p.m. UTC | #4
On 10/17/2017 04:44 AM, Lukas Czerner wrote:
> 
> On Tue, Oct 17, 2017 at 11:53:56AM +0200, Jan Kara wrote:
>> On Tue 17-10-17 11:11:24, Lukas Czerner wrote:
>>> Currently we try to defer completion of async DIO to the process context
>>> in case there are any mapped pages associated with the inode so that we
>>> can invalidate the pages when the IO completes. However the check is racy
>>> and the pages can be mapped afterwards. If this happens we might end up
>>> calling invalidate_inode_pages2_range() in dio_complete() in interrupt
>>> context which could sleep. This can be reproduced by generic/451.
>>>
>>> Fix this by passing the information whether we can or can't invalidate
>>> to the dio_complete(). Thanks Eryu Guan for reporting this and Jan Kara
>>> for suggesting a fix.
>>>
>>> Fixes: 332391a9935d ("fs: Fix page cache inconsistency when mixing buffered and AIO DIO")
>>> Signed-off-by: Lukas Czerner <lczerner@redhat.com>
>>> Reported-by: Eryu Guan <eguan@redhat.com>
>>
>> Looks good to me. You can add:
>>
>> Reviewed-by: Jan Kara <jack@suse.cz>
>>
>> BTW, I guess you should CC someone who can merge the patch...
> 
> Jens, could you please take this through your tree ?

Will do, queued it up for 4.14.
diff mbox

Patch

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 96415c6..5632548 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -45,6 +45,12 @@ 
 #define DIO_PAGES	64
 
 /*
+ * Flags for dio_complete()
+ */
+#define DIO_COMPLETE_ASYNC		0x01	/* This is async IO */
+#define DIO_COMPLETE_INVALIDATE		0x02	/* Can invalidate pages */
+
+/*
  * This code generally works in units of "dio_blocks".  A dio_block is
  * somewhere between the hard sector size and the filesystem block size.  it
  * is determined on a per-invocation basis.   When talking to the filesystem
@@ -225,7 +231,7 @@  static inline struct page *dio_get_page(struct dio *dio,
  * filesystems can use it to hold additional state between get_block calls and
  * dio_complete.
  */
-static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
+static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
 {
 	loff_t offset = dio->iocb->ki_pos;
 	ssize_t transferred = 0;
@@ -266,7 +272,8 @@  static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
 	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
 	 * this invalidation fails, tough, the write still worked...
 	 */
-	if (ret > 0 && dio->op == REQ_OP_WRITE &&
+	if (flags & DIO_COMPLETE_INVALIDATE &&
+	    ret > 0 && dio->op == REQ_OP_WRITE &&
 	    dio->inode->i_mapping->nrpages) {
 		err = invalidate_inode_pages2_range(dio->inode->i_mapping,
 					offset >> PAGE_SHIFT,
@@ -285,7 +292,7 @@  static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
 	if (!(dio->flags & DIO_SKIP_DIO_COUNT))
 		inode_dio_end(dio->inode);
 
-	if (is_async) {
+	if (flags & DIO_COMPLETE_ASYNC) {
 		/*
 		 * generic_write_sync expects ki_pos to have been updated
 		 * already, but the submission path only does this for
@@ -306,7 +313,7 @@  static void dio_aio_complete_work(struct work_struct *work)
 {
 	struct dio *dio = container_of(work, struct dio, complete_work);
 
-	dio_complete(dio, 0, true);
+	dio_complete(dio, 0, DIO_COMPLETE_ASYNC | DIO_COMPLETE_INVALIDATE);
 }
 
 static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio);
@@ -348,7 +355,7 @@  static void dio_bio_end_aio(struct bio *bio)
 			queue_work(dio->inode->i_sb->s_dio_done_wq,
 				   &dio->complete_work);
 		} else {
-			dio_complete(dio, 0, true);
+			dio_complete(dio, 0, DIO_COMPLETE_ASYNC);
 		}
 	}
 }
@@ -1360,7 +1367,7 @@  do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 		dio_await_completion(dio);
 
 	if (drop_refcount(dio) == 0) {
-		retval = dio_complete(dio, retval, false);
+		retval = dio_complete(dio, retval, DIO_COMPLETE_INVALIDATE);
 	} else
 		BUG_ON(retval != -EIOCBQUEUED);