diff mbox

Btrfs: fix deadlock between dedup on same file and starting writeback

Message ID 1487697292-10015-1-git-send-email-fdmanana@kernel.org (mailing list archive)
State Accepted
Headers show

Commit Message

Filipe Manana Feb. 21, 2017, 5:14 p.m. UTC
From: Filipe Manana <fdmanana@suse.com>

If we are deduping two ranges of the same file we need to make sure that
we lock all pages in ascending order, that is, lock first the pages from
the range with lower offset and then the pages from the other range, as
otherwise we can deadlock with a concurrent task that is starting delalloc
(writeback). Example trace:

[74073.052218] INFO: task kworker/u32:10:17997 blocked for more than 120 seconds.
[74073.053889]       Tainted: G        W       4.9.0-rc7-btrfs-next-36+ #1
[74073.055071] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[74073.056696] kworker/u32:10  D    0 17997      2 0x00000000
[74073.058606] Workqueue: writeback wb_workfn (flush-btrfs-53176)
[74073.061370]  ffff880031e79858 ffff8802159d2580 ffff880237004580 ffff880031e79240
[74073.064784]  ffff88023f4978c0 ffffc9000817b638 ffffffff814c15e1 0000000000000000
[74073.068386]  ffff88023f4978d8 ffff88023f4978c0 000000000017b620 ffff880031e79240
[74073.071712] Call Trace:
[74073.072884]  [<ffffffff814c15e1>] ? __schedule+0x48f/0x6f4
[74073.075395]  [<ffffffff814c1c8b>] ? bit_wait+0x2f/0x2f
[74073.077511]  [<ffffffff814c18d2>] schedule+0x8c/0xa0
[74073.079440]  [<ffffffff814c4b36>] schedule_timeout+0x43/0xff
[74073.081637]  [<ffffffff8110953e>] ? time_hardirqs_on+0x9/0x14
[74073.083809]  [<ffffffff81095c67>] ? trace_hardirqs_on_caller+0x16/0x197
[74073.086314]  [<ffffffff810bde98>] ? timekeeping_get_ns+0x1e/0x32
[74073.100654]  [<ffffffff810be048>] ? ktime_get+0x41/0x52
[74073.102619]  [<ffffffff814c10f0>] io_schedule_timeout+0xa0/0x102
[74073.104771]  [<ffffffff814c10f0>] ? io_schedule_timeout+0xa0/0x102
[74073.106969]  [<ffffffff814c1ca6>] bit_wait_io+0x1b/0x39
[74073.108954]  [<ffffffff814c1fb8>] __wait_on_bit_lock+0x4f/0x99
[74073.110981]  [<ffffffff8112b692>] __lock_page+0x6b/0x6d
[74073.112833]  [<ffffffff8108ceb4>] ? autoremove_wake_function+0x3a/0x3a
[74073.115010]  [<ffffffffa031178b>] lock_page+0x2f/0x32 [btrfs]
[74073.116999]  [<ffffffffa0311d9f>] lock_delalloc_pages+0xc7/0x1a0 [btrfs]
[74073.119243]  [<ffffffffa0313d15>] find_lock_delalloc_range+0xc3/0x1a4 [btrfs]
[74073.121636]  [<ffffffffa0313e81>] writepage_delalloc.isra.31+0x8b/0x134 [btrfs]
[74073.124229]  [<ffffffffa0315d69>] __extent_writepage+0x1c1/0x2bf [btrfs]
[74073.126372]  [<ffffffffa03160f2>] extent_write_cache_pages.isra.30.constprop.49+0x28b/0x36c [btrfs]
[74073.129371]  [<ffffffffa03165b9>] extent_writepages+0x4b/0x5c [btrfs]
[74073.131440]  [<ffffffffa02fcb59>] ? insert_reserved_file_extent.constprop.42+0x261/0x261 [btrfs]
[74073.134303]  [<ffffffff811b4ce4>] ? writeback_sb_inodes+0xe0/0x4a1
[74073.136298]  [<ffffffffa02fab7f>] btrfs_writepages+0x28/0x2a [btrfs]
[74073.138248]  [<ffffffff81138200>] do_writepages+0x23/0x2c
[74073.139910]  [<ffffffff811b3cab>] __writeback_single_inode+0x105/0x6d2
[74073.142003]  [<ffffffff811b4e96>] writeback_sb_inodes+0x292/0x4a1
[74073.136298]  [<ffffffffa02fab7f>] btrfs_writepages+0x28/0x2a [btrfs]
[74073.138248]  [<ffffffff81138200>] do_writepages+0x23/0x2c
[74073.139910]  [<ffffffff811b3cab>] __writeback_single_inode+0x105/0x6d2
[74073.142003]  [<ffffffff811b4e96>] writeback_sb_inodes+0x292/0x4a1
[74073.143911]  [<ffffffff811b511b>] __writeback_inodes_wb+0x76/0xae
[74073.145787]  [<ffffffff811b53ca>] wb_writeback+0x1cc/0x4d7
[74073.147452]  [<ffffffff811b60cd>] wb_workfn+0x194/0x37d
[74073.149084]  [<ffffffff811b60cd>] ? wb_workfn+0x194/0x37d
[74073.150726]  [<ffffffff8106ce77>] ? process_one_work+0x154/0x4e4
[74073.152694]  [<ffffffff8106cf96>] process_one_work+0x273/0x4e4
[74073.154452]  [<ffffffff8106d6db>] worker_thread+0x1eb/0x2ca
[74073.156138]  [<ffffffff8106d4f0>] ? rescuer_thread+0x2b6/0x2b6
[74073.157837]  [<ffffffff81072a81>] kthread+0xd5/0xdd
[74073.159339]  [<ffffffff810729ac>] ? __kthread_unpark+0x5a/0x5a
[74073.161088]  [<ffffffff814c6257>] ret_from_fork+0x27/0x40
[74073.162680] INFO: lockdep is turned off.
[74073.163855] INFO: task do-dedup:30264 blocked for more than 120 seconds.
[74073.181180]       Tainted: G        W       4.9.0-rc7-btrfs-next-36+ #1
[74073.181180] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[74073.185296] fdm-stress      D    0 30264  29974 0x00000000
[74073.186810]  ffff880089595118 ffff880211b8eac0 ffff880237030380 ffff880089594b00
[74073.188998]  ffff88023f2978c0 ffffc900063abb68 ffffffff814c15e1 0000000000000000
[74073.191070]  ffff88023f2978d8 ffff88023f2978c0 00000000003abb50 ffff880089594b00
[74073.193286] Call Trace:
[74073.193990]  [<ffffffff814c15e1>] ? __schedule+0x48f/0x6f4
[74073.195418]  [<ffffffff814c1c8b>] ? bit_wait+0x2f/0x2f
[74073.196796]  [<ffffffff814c18d2>] schedule+0x8c/0xa0
[74073.198163]  [<ffffffff814c4b36>] schedule_timeout+0x43/0xff
[74073.199621]  [<ffffffff81095df5>] ? trace_hardirqs_on+0xd/0xf
[74073.201100]  [<ffffffff810bde98>] ? timekeeping_get_ns+0x1e/0x32
[74073.202686]  [<ffffffff810be048>] ? ktime_get+0x41/0x52
[74073.204051]  [<ffffffff814c10f0>] io_schedule_timeout+0xa0/0x102
[74073.205585]  [<ffffffff814c10f0>] ? io_schedule_timeout+0xa0/0x102
[74073.207123]  [<ffffffff814c1ca6>] bit_wait_io+0x1b/0x39
[74073.208238]  [<ffffffff814c1fb8>] __wait_on_bit_lock+0x4f/0x99
[74073.208871]  [<ffffffff8112b692>] __lock_page+0x6b/0x6d
[74073.209430]  [<ffffffff8108ceb4>] ? autoremove_wake_function+0x3a/0x3a
[74073.210101]  [<ffffffff8112b800>] lock_page+0x2f/0x32
[74073.210636]  [<ffffffff8112c502>] pagecache_get_page+0x5e/0x153
[74073.211270]  [<ffffffffa03257eb>] gather_extent_pages+0x4e/0x109 [btrfs]
[74073.212166]  [<ffffffffa032a04c>] btrfs_dedupe_file_range+0x1e1/0x4dd [btrfs]
[74073.213257]  [<ffffffff8118d9b5>] vfs_dedupe_file_range+0x1c1/0x221
[74073.214086]  [<ffffffff8119e0c4>] do_vfs_ioctl+0x442/0x600
[74073.214767]  [<ffffffff811a7874>] ? rcu_read_unlock+0x5b/0x5d
[74073.215619]  [<ffffffff811a7953>] ? __fget+0x6b/0x77
[74073.216338]  [<ffffffff8119e2d9>] SyS_ioctl+0x57/0x79
[74073.217149]  [<ffffffff814c5fea>] entry_SYSCALL_64_fastpath+0x18/0xad
[74073.218102]  [<ffffffff81109552>] ? time_hardirqs_off+0x9/0x14
[74073.218968]  [<ffffffff810938ce>] ? trace_hardirqs_off_caller+0x1f/0xaa
[74073.219938] INFO: lockdep is turned off.

What happened was the following:

      CPU 1                                       CPU 2

                                             btrfs_dedupe_file_range()
                                               --> using same inode as source
                                                   and target
                                               --> src range is [768K, 1Mb[
                                               --> dst range is [0, 256K[
                                              btrfs_cmp_data_prepare()
                                               --> calls gather_extent_pages()
                                                   for range [768K, 1Mb[ and
                                                   locks all pages in that range

 do_writepages()
  btrfs_writepages()
   extent_writepages()
    extent_write_cache_pages()
     __extent_writepage()
      writepage_delalloc()
       find_lock_delalloc_range()
         --> finds range [0, 1Mb[
         lock_delalloc_pages()
          --> locks all pages in the
              range [0, 768K[
          --> tries to lock page at
              offset 768K
                --> deadlock

                                               --> calls gather_extent_pages()
                                                   to lock pages in the range
                                                   [0, 256K[
                                                    --> deadlock, task at CPU 1
                                                        already locked that
                                                        range and it's trying
                                                        to lock the range we
                                                        locked previously

So fix this by making sure that during a dedup we always lock first the
pages from the range with lower offset.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/ioctl.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

Comments

Liu Bo Feb. 24, 2017, 8:17 p.m. UTC | #1
On Tue, Feb 21, 2017 at 05:14:52PM +0000, fdmanana@kernel.org wrote:
> From: Filipe Manana <fdmanana@suse.com>
> 
> If we are deduping two ranges of the same file we need to make sure that
> we lock all pages in ascending order, that is, lock first the pages from
> the range with lower offset and then the pages from the other range, as
> otherwise we can deadlock with a concurrent task that is starting delalloc
> (writeback). Example trace:
> 
> [74073.052218] INFO: task kworker/u32:10:17997 blocked for more than 120 seconds.
> [74073.053889]       Tainted: G        W       4.9.0-rc7-btrfs-next-36+ #1
> [74073.055071] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [74073.056696] kworker/u32:10  D    0 17997      2 0x00000000
> [74073.058606] Workqueue: writeback wb_workfn (flush-btrfs-53176)
> [74073.061370]  ffff880031e79858 ffff8802159d2580 ffff880237004580 ffff880031e79240
> [74073.064784]  ffff88023f4978c0 ffffc9000817b638 ffffffff814c15e1 0000000000000000
> [74073.068386]  ffff88023f4978d8 ffff88023f4978c0 000000000017b620 ffff880031e79240
> [74073.071712] Call Trace:
> [74073.072884]  [<ffffffff814c15e1>] ? __schedule+0x48f/0x6f4
> [74073.075395]  [<ffffffff814c1c8b>] ? bit_wait+0x2f/0x2f
> [74073.077511]  [<ffffffff814c18d2>] schedule+0x8c/0xa0
> [74073.079440]  [<ffffffff814c4b36>] schedule_timeout+0x43/0xff
> [74073.081637]  [<ffffffff8110953e>] ? time_hardirqs_on+0x9/0x14
> [74073.083809]  [<ffffffff81095c67>] ? trace_hardirqs_on_caller+0x16/0x197
> [74073.086314]  [<ffffffff810bde98>] ? timekeeping_get_ns+0x1e/0x32
> [74073.100654]  [<ffffffff810be048>] ? ktime_get+0x41/0x52
> [74073.102619]  [<ffffffff814c10f0>] io_schedule_timeout+0xa0/0x102
> [74073.104771]  [<ffffffff814c10f0>] ? io_schedule_timeout+0xa0/0x102
> [74073.106969]  [<ffffffff814c1ca6>] bit_wait_io+0x1b/0x39
> [74073.108954]  [<ffffffff814c1fb8>] __wait_on_bit_lock+0x4f/0x99
> [74073.110981]  [<ffffffff8112b692>] __lock_page+0x6b/0x6d
> [74073.112833]  [<ffffffff8108ceb4>] ? autoremove_wake_function+0x3a/0x3a
> [74073.115010]  [<ffffffffa031178b>] lock_page+0x2f/0x32 [btrfs]
> [74073.116999]  [<ffffffffa0311d9f>] lock_delalloc_pages+0xc7/0x1a0 [btrfs]
> [74073.119243]  [<ffffffffa0313d15>] find_lock_delalloc_range+0xc3/0x1a4 [btrfs]
> [74073.121636]  [<ffffffffa0313e81>] writepage_delalloc.isra.31+0x8b/0x134 [btrfs]
> [74073.124229]  [<ffffffffa0315d69>] __extent_writepage+0x1c1/0x2bf [btrfs]
> [74073.126372]  [<ffffffffa03160f2>] extent_write_cache_pages.isra.30.constprop.49+0x28b/0x36c [btrfs]
> [74073.129371]  [<ffffffffa03165b9>] extent_writepages+0x4b/0x5c [btrfs]
> [74073.131440]  [<ffffffffa02fcb59>] ? insert_reserved_file_extent.constprop.42+0x261/0x261 [btrfs]
> [74073.134303]  [<ffffffff811b4ce4>] ? writeback_sb_inodes+0xe0/0x4a1
> [74073.136298]  [<ffffffffa02fab7f>] btrfs_writepages+0x28/0x2a [btrfs]
> [74073.138248]  [<ffffffff81138200>] do_writepages+0x23/0x2c
> [74073.139910]  [<ffffffff811b3cab>] __writeback_single_inode+0x105/0x6d2
> [74073.142003]  [<ffffffff811b4e96>] writeback_sb_inodes+0x292/0x4a1
> [74073.136298]  [<ffffffffa02fab7f>] btrfs_writepages+0x28/0x2a [btrfs]
> [74073.138248]  [<ffffffff81138200>] do_writepages+0x23/0x2c
> [74073.139910]  [<ffffffff811b3cab>] __writeback_single_inode+0x105/0x6d2
> [74073.142003]  [<ffffffff811b4e96>] writeback_sb_inodes+0x292/0x4a1
> [74073.143911]  [<ffffffff811b511b>] __writeback_inodes_wb+0x76/0xae
> [74073.145787]  [<ffffffff811b53ca>] wb_writeback+0x1cc/0x4d7
> [74073.147452]  [<ffffffff811b60cd>] wb_workfn+0x194/0x37d
> [74073.149084]  [<ffffffff811b60cd>] ? wb_workfn+0x194/0x37d
> [74073.150726]  [<ffffffff8106ce77>] ? process_one_work+0x154/0x4e4
> [74073.152694]  [<ffffffff8106cf96>] process_one_work+0x273/0x4e4
> [74073.154452]  [<ffffffff8106d6db>] worker_thread+0x1eb/0x2ca
> [74073.156138]  [<ffffffff8106d4f0>] ? rescuer_thread+0x2b6/0x2b6
> [74073.157837]  [<ffffffff81072a81>] kthread+0xd5/0xdd
> [74073.159339]  [<ffffffff810729ac>] ? __kthread_unpark+0x5a/0x5a
> [74073.161088]  [<ffffffff814c6257>] ret_from_fork+0x27/0x40
> [74073.162680] INFO: lockdep is turned off.
> [74073.163855] INFO: task do-dedup:30264 blocked for more than 120 seconds.
> [74073.181180]       Tainted: G        W       4.9.0-rc7-btrfs-next-36+ #1
> [74073.181180] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [74073.185296] fdm-stress      D    0 30264  29974 0x00000000
> [74073.186810]  ffff880089595118 ffff880211b8eac0 ffff880237030380 ffff880089594b00
> [74073.188998]  ffff88023f2978c0 ffffc900063abb68 ffffffff814c15e1 0000000000000000
> [74073.191070]  ffff88023f2978d8 ffff88023f2978c0 00000000003abb50 ffff880089594b00
> [74073.193286] Call Trace:
> [74073.193990]  [<ffffffff814c15e1>] ? __schedule+0x48f/0x6f4
> [74073.195418]  [<ffffffff814c1c8b>] ? bit_wait+0x2f/0x2f
> [74073.196796]  [<ffffffff814c18d2>] schedule+0x8c/0xa0
> [74073.198163]  [<ffffffff814c4b36>] schedule_timeout+0x43/0xff
> [74073.199621]  [<ffffffff81095df5>] ? trace_hardirqs_on+0xd/0xf
> [74073.201100]  [<ffffffff810bde98>] ? timekeeping_get_ns+0x1e/0x32
> [74073.202686]  [<ffffffff810be048>] ? ktime_get+0x41/0x52
> [74073.204051]  [<ffffffff814c10f0>] io_schedule_timeout+0xa0/0x102
> [74073.205585]  [<ffffffff814c10f0>] ? io_schedule_timeout+0xa0/0x102
> [74073.207123]  [<ffffffff814c1ca6>] bit_wait_io+0x1b/0x39
> [74073.208238]  [<ffffffff814c1fb8>] __wait_on_bit_lock+0x4f/0x99
> [74073.208871]  [<ffffffff8112b692>] __lock_page+0x6b/0x6d
> [74073.209430]  [<ffffffff8108ceb4>] ? autoremove_wake_function+0x3a/0x3a
> [74073.210101]  [<ffffffff8112b800>] lock_page+0x2f/0x32
> [74073.210636]  [<ffffffff8112c502>] pagecache_get_page+0x5e/0x153
> [74073.211270]  [<ffffffffa03257eb>] gather_extent_pages+0x4e/0x109 [btrfs]
> [74073.212166]  [<ffffffffa032a04c>] btrfs_dedupe_file_range+0x1e1/0x4dd [btrfs]
> [74073.213257]  [<ffffffff8118d9b5>] vfs_dedupe_file_range+0x1c1/0x221
> [74073.214086]  [<ffffffff8119e0c4>] do_vfs_ioctl+0x442/0x600
> [74073.214767]  [<ffffffff811a7874>] ? rcu_read_unlock+0x5b/0x5d
> [74073.215619]  [<ffffffff811a7953>] ? __fget+0x6b/0x77
> [74073.216338]  [<ffffffff8119e2d9>] SyS_ioctl+0x57/0x79
> [74073.217149]  [<ffffffff814c5fea>] entry_SYSCALL_64_fastpath+0x18/0xad
> [74073.218102]  [<ffffffff81109552>] ? time_hardirqs_off+0x9/0x14
> [74073.218968]  [<ffffffff810938ce>] ? trace_hardirqs_off_caller+0x1f/0xaa
> [74073.219938] INFO: lockdep is turned off.
> 
> What happened was the following:
> 
>       CPU 1                                       CPU 2
> 
>                                              btrfs_dedupe_file_range()
>                                                --> using same inode as source
>                                                    and target
>                                                --> src range is [768K, 1Mb[
>                                                --> dst range is [0, 256K[
>                                               btrfs_cmp_data_prepare()
>                                                --> calls gather_extent_pages()
>                                                    for range [768K, 1Mb[ and
>                                                    locks all pages in that range
> 
>  do_writepages()
>   btrfs_writepages()
>    extent_writepages()
>     extent_write_cache_pages()
>      __extent_writepage()
>       writepage_delalloc()
>        find_lock_delalloc_range()
>          --> finds range [0, 1Mb[
>          lock_delalloc_pages()
>           --> locks all pages in the
>               range [0, 768K[
>           --> tries to lock page at
>               offset 768K
>                 --> deadlock
> 
>                                                --> calls gather_extent_pages()
>                                                    to lock pages in the range
>                                                    [0, 256K[
>                                                     --> deadlock, task at CPU 1
>                                                         already locked that
>                                                         range and it's trying
>                                                         to lock the range we
>                                                         locked previously
> 
> So fix this by making sure that during a dedup we always lock first the
> pages from the range with lower offset.

Reviewed-by: Liu Bo <bo.li.liu@oracle.com>

Thanks,

-liubo
> 
> Signed-off-by: Filipe Manana <fdmanana@suse.com>
> ---
>  fs/btrfs/ioctl.c | 14 ++++++++++++--
>  1 file changed, 12 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index 0c77cad..4284279 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -3047,11 +3047,21 @@ static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
>  	cmp->src_pages = src_pgarr;
>  	cmp->dst_pages = dst_pgarr;
>  
> -	ret = gather_extent_pages(src, cmp->src_pages, cmp->num_pages, loff);
> +	/*
> +	 * If deduping ranges in the same inode, locking rules make it mandatory
> +	 * to always lock pages in ascending order to avoid deadlocks with
> +	 * concurrent tasks (such as starting writeback/delalloc).
> +	 */
> +	if (src == dst && dst_loff < loff) {
> +		swap(src_pgarr, dst_pgarr);
> +		swap(loff, dst_loff);
> +	}
> +
> +	ret = gather_extent_pages(src, src_pgarr, cmp->num_pages, loff);
>  	if (ret)
>  		goto out;
>  
> -	ret = gather_extent_pages(dst, cmp->dst_pages, cmp->num_pages, dst_loff);
> +	ret = gather_extent_pages(dst, dst_pgarr, cmp->num_pages, dst_loff);
>  
>  out:
>  	if (ret)
> -- 
> 2.7.0.rc3
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0c77cad..4284279 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3047,11 +3047,21 @@  static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
 	cmp->src_pages = src_pgarr;
 	cmp->dst_pages = dst_pgarr;
 
-	ret = gather_extent_pages(src, cmp->src_pages, cmp->num_pages, loff);
+	/*
+	 * If deduping ranges in the same inode, locking rules make it mandatory
+	 * to always lock pages in ascending order to avoid deadlocks with
+	 * concurrent tasks (such as starting writeback/delalloc).
+	 */
+	if (src == dst && dst_loff < loff) {
+		swap(src_pgarr, dst_pgarr);
+		swap(loff, dst_loff);
+	}
+
+	ret = gather_extent_pages(src, src_pgarr, cmp->num_pages, loff);
 	if (ret)
 		goto out;
 
-	ret = gather_extent_pages(dst, cmp->dst_pages, cmp->num_pages, dst_loff);
+	ret = gather_extent_pages(dst, dst_pgarr, cmp->num_pages, dst_loff);
 
 out:
 	if (ret)