diff mbox

block, writeback: wait for writeback to finish before detaching wb

Message ID 20170309021937.27709-1-tahsin@google.com (mailing list archive)
State New, archived
Headers show

Commit Message

Tahsin Erdogan March 9, 2017, 2:19 a.m. UTC
__blkdev_put() could surprise writeback thread by detaching the
wb object from an inode that hasn't cleared the I_SYNC flag yet.
This causes a NULL pointer dereference as seen below:

  BUG: unable to handle kernel NULL pointer dereference at (null)
  IP: locked_inode_to_wb_and_lock_list+0x38/0x440
  PGD 0
  Oops: 0000 [#1] SMP
  CPU: 0 PID: 34 Comm: kworker/u8:1 Not tainted 4.11.0-rc1+ #202
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
  Workqueue: writeback wb_workfn (flush-8:16)
  task: ffff88013aa780c0 task.stack: ffffc9000012c000
  RIP: 0010:locked_inode_to_wb_and_lock_list+0x38/0x440
  RSP: 0018:ffffc9000012fb70 EFLAGS: 00010202
  RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000018
  RDX: ffff88013aa780c0 RSI: ffff880139a478f8 RDI: ffff88013aa788b8
  RBP: ffffc9000012fba0 R08: 0000000000000001 R09: 0000000000000000
  R10: 00000000969da8e2 R11: 0000000000000000 R12: ffff880139a47858
  R13: ffff880139a478e0 R14: ffff880139a478f8 R15: ffff8801371f4058
  FS:  0000000000000000(0000) GS:ffff88013ae00000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 0000000000000000 CR3: 0000000001012000 CR4: 00000000000006f0
  Call Trace:
   writeback_sb_inodes+0x3e1/0x7a0
   __writeback_inodes_wb+0x87/0xc0
   wb_writeback+0x2e7/0x5c0
   wb_workfn+0x2d1/0x9c0
   process_one_work+0x1d3/0x620
   worker_thread+0x126/0x4a0
   kthread+0x10a/0x140
   ret_from_fork+0x2e/0x40
  RIP: locked_inode_to_wb_and_lock_list+0x38/0x440 RSP: ffffc9000012fb70
  CR2: 0000000000000000
  ---[ end trace e0ea8a2695f4c86c ]---

Make __blkdev_put() wait for the I_SYNC flag to clear before detaching
wb.

Fixes: 43d1c0eb7e11 ("block: detach bdev inode from its wb in __blkdev_put()")
Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 fs/block_dev.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

Comments

Tejun Heo March 9, 2017, 6:26 p.m. UTC | #1
(cc'ing Jan and quoting the whole message)

On Wed, Mar 08, 2017 at 06:19:37PM -0800, Tahsin Erdogan wrote:
> __blkdev_put() could surprise writeback thread by detaching the
> wb object from an inode that hasn't cleared the I_SYNC flag yet.
> This causes a NULL pointer dereference as seen below:
> 
>   BUG: unable to handle kernel NULL pointer dereference at (null)
>   IP: locked_inode_to_wb_and_lock_list+0x38/0x440
>   PGD 0
>   Oops: 0000 [#1] SMP
>   CPU: 0 PID: 34 Comm: kworker/u8:1 Not tainted 4.11.0-rc1+ #202
>   Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
>   Workqueue: writeback wb_workfn (flush-8:16)
>   task: ffff88013aa780c0 task.stack: ffffc9000012c000
>   RIP: 0010:locked_inode_to_wb_and_lock_list+0x38/0x440
>   RSP: 0018:ffffc9000012fb70 EFLAGS: 00010202
>   RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000018
>   RDX: ffff88013aa780c0 RSI: ffff880139a478f8 RDI: ffff88013aa788b8
>   RBP: ffffc9000012fba0 R08: 0000000000000001 R09: 0000000000000000
>   R10: 00000000969da8e2 R11: 0000000000000000 R12: ffff880139a47858
>   R13: ffff880139a478e0 R14: ffff880139a478f8 R15: ffff8801371f4058
>   FS:  0000000000000000(0000) GS:ffff88013ae00000(0000) knlGS:0000000000000000
>   CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>   CR2: 0000000000000000 CR3: 0000000001012000 CR4: 00000000000006f0
>   Call Trace:
>    writeback_sb_inodes+0x3e1/0x7a0
>    __writeback_inodes_wb+0x87/0xc0
>    wb_writeback+0x2e7/0x5c0
>    wb_workfn+0x2d1/0x9c0
>    process_one_work+0x1d3/0x620
>    worker_thread+0x126/0x4a0
>    kthread+0x10a/0x140
>    ret_from_fork+0x2e/0x40
>   RIP: locked_inode_to_wb_and_lock_list+0x38/0x440 RSP: ffffc9000012fb70
>   CR2: 0000000000000000
>   ---[ end trace e0ea8a2695f4c86c ]---
> 
> Make __blkdev_put() wait for the I_SYNC flag to clear before detaching
> wb.
> 
> Fixes: 43d1c0eb7e11 ("block: detach bdev inode from its wb in __blkdev_put()")
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> ---
>  fs/block_dev.c | 11 +++++++----
>  1 file changed, 7 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index 2eca00ec4370..70fb82fcedd0 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -95,7 +95,7 @@ void kill_bdev(struct block_device *bdev)
>  
>  	invalidate_bh_lrus();
>  	truncate_inode_pages(mapping, 0);
> -}	
> +}
>  EXPORT_SYMBOL(kill_bdev);
>  
>  /* Invalidate clean unused buffers and pagecache. */
> @@ -617,13 +617,13 @@ static loff_t block_llseek(struct file *file, loff_t offset, int whence)
>  	inode_unlock(bd_inode);
>  	return retval;
>  }
> -	
> +
>  int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
>  {
>  	struct inode *bd_inode = bdev_file_inode(filp);
>  	struct block_device *bdev = I_BDEV(bd_inode);
>  	int error;
> -	
> +
>  	error = filemap_write_and_wait_range(filp->f_mapping, start, end);
>  	if (error)
>  		return error;
> @@ -1038,7 +1038,7 @@ void bdput(struct block_device *bdev)
>  }
>  
>  EXPORT_SYMBOL(bdput);
> - 
> +

Whitespace contamination.

>  static struct block_device *bd_acquire(struct inode *inode)
>  {
>  	struct block_device *bdev;
> @@ -1880,7 +1880,10 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
>  		 * Detaching bdev inode from its wb in __destroy_inode()
>  		 * is too late: the queue which embeds its bdi (along with
>  		 * root wb) can be gone as soon as we put_disk() below.
> +		 * Before detaching wb, wait for any writeback activity for
> +		 * inode to settle.
>  		 */
> +		inode_wait_for_writeback(bdev->bd_inode);
>  		inode_detach_wb(bdev->bd_inode);
>  	}
>  	if (bdev->bd_contains == bdev) {

Given that there's a write_inode_now(@sync == true) call right above,
I'm not sure how waiting for I_SYNC one more time would make a
difference.  Can you please explain how to trigger the issue?

Thanks.
Jan Kara March 9, 2017, 11:53 p.m. UTC | #2
On Thu 09-03-17 13:26:45, Tejun Heo wrote:
> (cc'ing Jan and quoting the whole message)
> 
> On Wed, Mar 08, 2017 at 06:19:37PM -0800, Tahsin Erdogan wrote:
> > __blkdev_put() could surprise writeback thread by detaching the
> > wb object from an inode that hasn't cleared the I_SYNC flag yet.
> > This causes a NULL pointer dereference as seen below:
> > 
> >   BUG: unable to handle kernel NULL pointer dereference at (null)
> >   IP: locked_inode_to_wb_and_lock_list+0x38/0x440
> >   PGD 0
> >   Oops: 0000 [#1] SMP
> >   CPU: 0 PID: 34 Comm: kworker/u8:1 Not tainted 4.11.0-rc1+ #202
> >   Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
> >   Workqueue: writeback wb_workfn (flush-8:16)
> >   task: ffff88013aa780c0 task.stack: ffffc9000012c000
> >   RIP: 0010:locked_inode_to_wb_and_lock_list+0x38/0x440
> >   RSP: 0018:ffffc9000012fb70 EFLAGS: 00010202
> >   RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000018
> >   RDX: ffff88013aa780c0 RSI: ffff880139a478f8 RDI: ffff88013aa788b8
> >   RBP: ffffc9000012fba0 R08: 0000000000000001 R09: 0000000000000000
> >   R10: 00000000969da8e2 R11: 0000000000000000 R12: ffff880139a47858
> >   R13: ffff880139a478e0 R14: ffff880139a478f8 R15: ffff8801371f4058
> >   FS:  0000000000000000(0000) GS:ffff88013ae00000(0000) knlGS:0000000000000000
> >   CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> >   CR2: 0000000000000000 CR3: 0000000001012000 CR4: 00000000000006f0
> >   Call Trace:
> >    writeback_sb_inodes+0x3e1/0x7a0
> >    __writeback_inodes_wb+0x87/0xc0
> >    wb_writeback+0x2e7/0x5c0
> >    wb_workfn+0x2d1/0x9c0
> >    process_one_work+0x1d3/0x620
> >    worker_thread+0x126/0x4a0
> >    kthread+0x10a/0x140
> >    ret_from_fork+0x2e/0x40
> >   RIP: locked_inode_to_wb_and_lock_list+0x38/0x440 RSP: ffffc9000012fb70
> >   CR2: 0000000000000000
> >   ---[ end trace e0ea8a2695f4c86c ]---
> > 
> > Make __blkdev_put() wait for the I_SYNC flag to clear before detaching
> > wb.
> > 
> > Fixes: 43d1c0eb7e11 ("block: detach bdev inode from its wb in __blkdev_put()")
> > Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> > ---
> >  fs/block_dev.c | 11 +++++++----
> >  1 file changed, 7 insertions(+), 4 deletions(-)
> > 
> > diff --git a/fs/block_dev.c b/fs/block_dev.c
> > index 2eca00ec4370..70fb82fcedd0 100644
> > --- a/fs/block_dev.c
> > +++ b/fs/block_dev.c
> > @@ -95,7 +95,7 @@ void kill_bdev(struct block_device *bdev)
> >  
> >  	invalidate_bh_lrus();
> >  	truncate_inode_pages(mapping, 0);
> > -}	
> > +}
> >  EXPORT_SYMBOL(kill_bdev);
> >  
> >  /* Invalidate clean unused buffers and pagecache. */
> > @@ -617,13 +617,13 @@ static loff_t block_llseek(struct file *file, loff_t offset, int whence)
> >  	inode_unlock(bd_inode);
> >  	return retval;
> >  }
> > -	
> > +
> >  int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
> >  {
> >  	struct inode *bd_inode = bdev_file_inode(filp);
> >  	struct block_device *bdev = I_BDEV(bd_inode);
> >  	int error;
> > -	
> > +
> >  	error = filemap_write_and_wait_range(filp->f_mapping, start, end);
> >  	if (error)
> >  		return error;
> > @@ -1038,7 +1038,7 @@ void bdput(struct block_device *bdev)
> >  }
> >  
> >  EXPORT_SYMBOL(bdput);
> > - 
> > +
> 
> Whitespace contamination.
> 
> >  static struct block_device *bd_acquire(struct inode *inode)
> >  {
> >  	struct block_device *bdev;
> > @@ -1880,7 +1880,10 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
> >  		 * Detaching bdev inode from its wb in __destroy_inode()
> >  		 * is too late: the queue which embeds its bdi (along with
> >  		 * root wb) can be gone as soon as we put_disk() below.
> > +		 * Before detaching wb, wait for any writeback activity for
> > +		 * inode to settle.
> >  		 */
> > +		inode_wait_for_writeback(bdev->bd_inode);
> >  		inode_detach_wb(bdev->bd_inode);
> >  	}
> >  	if (bdev->bd_contains == bdev) {
> 
> Given that there's a write_inode_now(@sync == true) call right above,
> I'm not sure how waiting for I_SYNC one more time would make a
> difference.  Can you please explain how to trigger the issue?

Agreed that waiting for I_SYNC isn't going to cut it. The flusher thread (or
possibly some other process) can be running inode_to_wb() without I_SYNC
set. I have patches that fix this (they remove inode_detach_wb() from
__blkdev_put()) - the ones you were reviewing lately. I'll post v4 next
week so that we can get those merged.

But if Tahsin has a reproducer, it would be nice. I was not able to hit the
race in my testing.

								Honza
diff mbox

Patch

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2eca00ec4370..70fb82fcedd0 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -95,7 +95,7 @@  void kill_bdev(struct block_device *bdev)
 
 	invalidate_bh_lrus();
 	truncate_inode_pages(mapping, 0);
-}	
+}
 EXPORT_SYMBOL(kill_bdev);
 
 /* Invalidate clean unused buffers and pagecache. */
@@ -617,13 +617,13 @@  static loff_t block_llseek(struct file *file, loff_t offset, int whence)
 	inode_unlock(bd_inode);
 	return retval;
 }
-	
+
 int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 {
 	struct inode *bd_inode = bdev_file_inode(filp);
 	struct block_device *bdev = I_BDEV(bd_inode);
 	int error;
-	
+
 	error = filemap_write_and_wait_range(filp->f_mapping, start, end);
 	if (error)
 		return error;
@@ -1038,7 +1038,7 @@  void bdput(struct block_device *bdev)
 }
 
 EXPORT_SYMBOL(bdput);
- 
+
 static struct block_device *bd_acquire(struct inode *inode)
 {
 	struct block_device *bdev;
@@ -1880,7 +1880,10 @@  static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 		 * Detaching bdev inode from its wb in __destroy_inode()
 		 * is too late: the queue which embeds its bdi (along with
 		 * root wb) can be gone as soon as we put_disk() below.
+		 * Before detaching wb, wait for any writeback activity for
+		 * inode to settle.
 		 */
+		inode_wait_for_writeback(bdev->bd_inode);
 		inode_detach_wb(bdev->bd_inode);
 	}
 	if (bdev->bd_contains == bdev) {