diff mbox

btrfs: Handle btrfs_reloc_clone_csums error correctly to avoid deadlock

Message ID 20170215084924.10587-1-quwenruo@cn.fujitsu.com (mailing list archive)
State Superseded
Headers show

Commit Message

Qu Wenruo Feb. 15, 2017, 8:49 a.m. UTC
If btrfs/125 is run with the nospace_cache or space_cache=v2 mount option,
btrfs will block with the following backtrace:

Call Trace:
 __schedule+0x2d4/0xae0
 schedule+0x3d/0x90
 btrfs_start_ordered_extent+0x160/0x200 [btrfs]
 ? wake_atomic_t_function+0x60/0x60
 btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
 btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
 btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
 process_one_work+0x2af/0x720
 ? process_one_work+0x22b/0x720
 worker_thread+0x4b/0x4f0
 kthread+0x10f/0x150
 ? process_one_work+0x720/0x720
 ? kthread_create_on_node+0x40/0x40
 ret_from_fork+0x2e/0x40

The direct cause is that the error handler in run_delalloc_nocow() doesn't
handle errors from btrfs_reloc_clone_csums() well.

The related part call path will be:
__extent_writepage
|- writepage_delalloc()
|  |- run_delalloc_range()
|     |- run_delalloc_nocow()
|        |- btrfs_add_ordered_extent()
|        |  Now one ordered extent for file range, e.g [0, 1M) is inserted
|        |
|        |- btrfs_reloc_clone_csums()
|        |  Fails with -EIO, as RAID5/6 doesn't repair some csum tree
|        |  blocks
|        |
|        |- extent_clear_unlock_delalloc()
|           Error routine, unlock and clear page DIRTY, end page writeback
|           So the remaining 255 pages will not go through writeback
|
|- __extent_writepage_io()
   |- writepage_end_io_hook()
      |- btrfs_dec_test_ordered_pending()
         Reduce ordered_extent->bytes_left by 4K.
         Still have (1M - 4K) to finish.

Since the remaining 255 pages will not go through IO nor trigger
writepage_end_io_hook(), the ordered extent for [0, 1M) will
never finish, blocking the current transaction forever.

Although the root cause is still in RAID5/6, it won't hurt to fix the
error routine first.

This patch cleans up the ordered extent in the error routine, so at least
we won't cause a deadlock.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
---
 fs/btrfs/extent_io.c    |  1 -
 fs/btrfs/inode.c        | 10 ++++++++--
 fs/btrfs/ordered-data.c | 25 +++++++++++++++++++++++++
 fs/btrfs/ordered-data.h | 10 ++++++++++
 4 files changed, 43 insertions(+), 3 deletions(-)

Comments

Filipe Manana Feb. 15, 2017, 3:59 p.m. UTC | #1
On Wed, Feb 15, 2017 at 8:49 AM, Qu Wenruo <quwenruo@cn.fujitsu.com> wrote:
> If run btrfs/125 with nospace_cache or space_cache=v2 mount option,
> btrfs will block with the following backtrace:
>
> Call Trace:
>  __schedule+0x2d4/0xae0
>  schedule+0x3d/0x90
>  btrfs_start_ordered_extent+0x160/0x200 [btrfs]
>  ? wake_atomic_t_function+0x60/0x60
>  btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
>  btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
>  btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
>  process_one_work+0x2af/0x720
>  ? process_one_work+0x22b/0x720
>  worker_thread+0x4b/0x4f0
>  kthread+0x10f/0x150
>  ? process_one_work+0x720/0x720
>  ? kthread_create_on_node+0x40/0x40
>  ret_from_fork+0x2e/0x40
>
> The direct cause is the error handler in run_delalloc_nocow() doesn't
> handle error from btrfs_reloc_clone_csums() well.
>
> The related part call path will be:
> __extent_writepage
> |- writepage_delalloc()
> |  |- run_delalloc_range()
> |     |- run_delalloc_nocow()
> |        |- btrfs_add_ordered_extent()
> |        |  Now one ordered extent for file range, e.g [0, 1M) is inserted
> |        |
> |        |- btrfs_reloc_clone_csums()
> |        |  Fails with -EIO, as RAID5/6 doesn't repair some csum tree
> |        |  blocks
> |        |
> |        |- extent_clear_unlock_delalloc()
> |           Error routine, unlock and clear page DIRTY, end page writeback
> |           So the remaining 255 pages will not go through writeback
> |
> |- __extent_writepage_io()
>    |- writepage_end_io_hook()
>       |- btrfs_dev_test_ordered_pending()
>          Reduce ordered_extent->bytes_left by 4K.
>          Still have (1M - 4K) to finish.
>
> While the remaining 255 pages will not go through IO nor trigger
> writepage_end_io_hook(), the ordered extent for [0, 1M) will
> never finish, and blocking current transaction forever.
>
> Although the root cause is still in RAID5/6, it won't hurt to fix the
> error routine first.
>
> This patch will cleanup the ordered extent in error routine, so at least
> we won't cause deadlock.
>
> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
> ---
>  fs/btrfs/extent_io.c    |  1 -
>  fs/btrfs/inode.c        | 10 ++++++++--
>  fs/btrfs/ordered-data.c | 25 +++++++++++++++++++++++++
>  fs/btrfs/ordered-data.h | 10 ++++++++++
>  4 files changed, 43 insertions(+), 3 deletions(-)
>
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 4ac383a3a649..a14d1b0840c5 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -3258,7 +3258,6 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
>                                                delalloc_end,
>                                                &page_started,
>                                                nr_written);
> -               /* File system has been set read-only */
>                 if (ret) {
>                         SetPageError(page);
>                         /* fill_delalloc should be return < 0 for error
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 1e861a063721..3c3ade58afd7 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -1052,8 +1052,11 @@ static noinline int cow_file_range(struct inode *inode,
>                     BTRFS_DATA_RELOC_TREE_OBJECTID) {
>                         ret = btrfs_reloc_clone_csums(inode, start,
>                                                       cur_alloc_size);
> -                       if (ret)
> +                       if (ret) {
> +                               btrfs_clean_ordered_extent(inode, start,
> +                                                          ram_size);
>                                 goto out_drop_extent_cache;
> +                       }
>                 }
>
>                 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
> @@ -1538,7 +1541,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
>         if (!ret)
>                 ret = err;
>
> -       if (ret && cur_offset < end)
> +       if (ret && cur_offset < end) {
>                 extent_clear_unlock_delalloc(inode, cur_offset, end, end,
>                                              locked_page, EXTENT_LOCKED |
>                                              EXTENT_DELALLOC | EXTENT_DEFRAG |
> @@ -1546,6 +1549,9 @@ static noinline int run_delalloc_nocow(struct inode *inode,
>                                              PAGE_CLEAR_DIRTY |
>                                              PAGE_SET_WRITEBACK |
>                                              PAGE_END_WRITEBACK);
> +               btrfs_clean_ordered_extent(inode, cur_offset,
> +                                          end - cur_offset + 1);

So this is partially correct only.
First here you can have 0, 1 or more ordered extents that were created
in the while loop. So your new function must find and process all
ordered extents within the delalloc range and not just the last one
that was created.

Also, for any created ordered extent, you want to set the bit
BTRFS_ORDERED_IOERR on it before removing it, since there might be
concurrent tasks waiting for it that do error handling (like an fsync,
since when running delalloc the inode isn't locked).

Look at the direct IO write path error handling and
btrfs_endio_direct_write_update_ordered() for a very similar scenario.
And the same problem happens in the cow case (inode.c:cow_file_range()).

Thanks.

> +       }
>         btrfs_free_path(path);
>         return ret;
>  }
> diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
> index 041c3326d109..dba1cf3464a7 100644
> --- a/fs/btrfs/ordered-data.c
> +++ b/fs/btrfs/ordered-data.c
> @@ -650,6 +650,31 @@ void btrfs_remove_ordered_extent(struct inode *inode,
>         wake_up(&entry->wait);
>  }
>
> +void btrfs_clean_ordered_extent(struct inode *inode, u64 file_offset,
> +                               u64 ram_len)
> +{
> +       struct btrfs_ordered_extent *entry;
> +       struct btrfs_root *root = BTRFS_I(inode)->root;
> +
> +       entry = btrfs_lookup_ordered_range(inode, file_offset, ram_len);
> +       if (!entry || entry->file_offset != file_offset ||
> +           entry->len != ram_len)
> +               goto not_found;
> +
> +       /* Same as btrfs_finish_ordered_io() */
> +       btrfs_remove_ordered_extent(inode, entry);
> +       btrfs_put_ordered_extent(entry);
> +       btrfs_put_ordered_extent(entry);
> +       return;
> +
> +not_found:
> +       WARN_ON(1);
> +       btrfs_err(root->fs_info,
> +       "failed to find and clean ordered extent: root %llu ino %llu file_offset %llu len %llu",
> +                 root->objectid, btrfs_ino(inode), file_offset, ram_len);
> +       return;
> +}
> +
>  static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
>  {
>         struct btrfs_ordered_extent *ordered;
> diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
> index 5f2b0ca28705..7a989778aa89 100644
> --- a/fs/btrfs/ordered-data.h
> +++ b/fs/btrfs/ordered-data.h
> @@ -163,6 +163,16 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
>  void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
>  void btrfs_remove_ordered_extent(struct inode *inode,
>                                 struct btrfs_ordered_extent *entry);
> +
> +/*
> + * Function to cleanup an allocated ordered extent in error routine.
> + *
> + * As error handler in run_delalloc_range() will clear all related pages
> + * and skip their IO, we have no method to finish inserted ordered extent.
> + * So we must use this function to clean it up.
> + */
> +void btrfs_clean_ordered_extent(struct inode *inode, u64 file_offset,
> +                               u64 ram_len);
>  int btrfs_dec_test_ordered_pending(struct inode *inode,
>                                    struct btrfs_ordered_extent **cached,
>                                    u64 file_offset, u64 io_size, int uptodate);
> --
> 2.11.1
>
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Qu Wenruo Feb. 16, 2017, 12:39 a.m. UTC | #2
At 02/15/2017 11:59 PM, Filipe Manana wrote:
> On Wed, Feb 15, 2017 at 8:49 AM, Qu Wenruo <quwenruo@cn.fujitsu.com> wrote:
>> If run btrfs/125 with nospace_cache or space_cache=v2 mount option,
>> btrfs will block with the following backtrace:
>>
>> Call Trace:
>>  __schedule+0x2d4/0xae0
>>  schedule+0x3d/0x90
>>  btrfs_start_ordered_extent+0x160/0x200 [btrfs]
>>  ? wake_atomic_t_function+0x60/0x60
>>  btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
>>  btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
>>  btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
>>  process_one_work+0x2af/0x720
>>  ? process_one_work+0x22b/0x720
>>  worker_thread+0x4b/0x4f0
>>  kthread+0x10f/0x150
>>  ? process_one_work+0x720/0x720
>>  ? kthread_create_on_node+0x40/0x40
>>  ret_from_fork+0x2e/0x40
>>
>> The direct cause is the error handler in run_delalloc_nocow() doesn't
>> handle error from btrfs_reloc_clone_csums() well.
>>
>> The related part call path will be:
>> __extent_writepage
>> |- writepage_delalloc()
>> |  |- run_delalloc_range()
>> |     |- run_delalloc_nocow()
>> |        |- btrfs_add_ordered_extent()
>> |        |  Now one ordered extent for file range, e.g [0, 1M) is inserted
>> |        |
>> |        |- btrfs_reloc_clone_csums()
>> |        |  Fails with -EIO, as RAID5/6 doesn't repair some csum tree
>> |        |  blocks
>> |        |
>> |        |- extent_clear_unlock_delalloc()
>> |           Error routine, unlock and clear page DIRTY, end page writeback
>> |           So the remaining 255 pages will not go through writeback
>> |
>> |- __extent_writepage_io()
>>    |- writepage_end_io_hook()
>>       |- btrfs_dev_test_ordered_pending()
>>          Reduce ordered_extent->bytes_left by 4K.
>>          Still have (1M - 4K) to finish.
>>
>> While the remaining 255 pages will not go through IO nor trigger
>> writepage_end_io_hook(), the ordered extent for [0, 1M) will
>> never finish, and blocking current transaction forever.
>>
>> Although the root cause is still in RAID5/6, it won't hurt to fix the
>> error routine first.
>>
>> This patch will cleanup the ordered extent in error routine, so at least
>> we won't cause deadlock.
>>
>> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
>> ---
>>  fs/btrfs/extent_io.c    |  1 -
>>  fs/btrfs/inode.c        | 10 ++++++++--
>>  fs/btrfs/ordered-data.c | 25 +++++++++++++++++++++++++
>>  fs/btrfs/ordered-data.h | 10 ++++++++++
>>  4 files changed, 43 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
>> index 4ac383a3a649..a14d1b0840c5 100644
>> --- a/fs/btrfs/extent_io.c
>> +++ b/fs/btrfs/extent_io.c
>> @@ -3258,7 +3258,6 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
>>                                                delalloc_end,
>>                                                &page_started,
>>                                                nr_written);
>> -               /* File system has been set read-only */
>>                 if (ret) {
>>                         SetPageError(page);
>>                         /* fill_delalloc should be return < 0 for error
>> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
>> index 1e861a063721..3c3ade58afd7 100644
>> --- a/fs/btrfs/inode.c
>> +++ b/fs/btrfs/inode.c
>> @@ -1052,8 +1052,11 @@ static noinline int cow_file_range(struct inode *inode,
>>                     BTRFS_DATA_RELOC_TREE_OBJECTID) {
>>                         ret = btrfs_reloc_clone_csums(inode, start,
>>                                                       cur_alloc_size);
>> -                       if (ret)
>> +                       if (ret) {
>> +                               btrfs_clean_ordered_extent(inode, start,
>> +                                                          ram_size);
>>                                 goto out_drop_extent_cache;
>> +                       }
>>                 }
>>
>>                 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
>> @@ -1538,7 +1541,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
>>         if (!ret)
>>                 ret = err;
>>
>> -       if (ret && cur_offset < end)
>> +       if (ret && cur_offset < end) {
>>                 extent_clear_unlock_delalloc(inode, cur_offset, end, end,
>>                                              locked_page, EXTENT_LOCKED |
>>                                              EXTENT_DELALLOC | EXTENT_DEFRAG |
>> @@ -1546,6 +1549,9 @@ static noinline int run_delalloc_nocow(struct inode *inode,
>>                                              PAGE_CLEAR_DIRTY |
>>                                              PAGE_SET_WRITEBACK |
>>                                              PAGE_END_WRITEBACK);
>> +               btrfs_clean_ordered_extent(inode, cur_offset,
>> +                                          end - cur_offset + 1);
>
> So this is partially correct only.
> First here you can have 0, 1 or more ordered extents that were created
> in the while loop. So your new function must find and process all
> ordered extents within the delalloc range and not just the last one
> that was created.

OK, I found that only btrfs_reloc_clone_csums() needs the ordered extent 
cleanup, and there is no other case that can cause an error after the 
ordered extent has been added.

So what about moving the btrfs_clean_ordered_extent() call to just after 
btrfs_reloc_clone_csums() instead of putting it under error tag?

In that case, since we haven't unlock pages/extent, there will no race 
nor new ordered extent, and we are ensured to have only one ordered 
extent need cleanup.

>
> Also, for any created ordered extent, you want to set the bit
> BTRFS_ORDERED_IOERR on it before removing it, since there might be
> concurrent tasks waiting for it that do error handling (like an fsync,
> since when running delalloc the inode isn't locked).

Fsync is not affected IIRC, as btrfs_reloc_clone_csum() is only called 
for the DATA_RELOC tree inode, on which we never call fsync.

>
> Look at the direct IO write path error handling and
> btrfs_endio_direct_write_update_ordered() for a very similar scenario.
> And the same problem happens in the cow case (inode.c:cow_file_range()).

BTW, I have already done it in cow_file_range(), check the beginning 
lines of inode.c of this patch.

Thanks,
Qu
>
> Thanks.
>
>> +       }
>>         btrfs_free_path(path);
>>         return ret;
>>  }
>> diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
>> index 041c3326d109..dba1cf3464a7 100644
>> --- a/fs/btrfs/ordered-data.c
>> +++ b/fs/btrfs/ordered-data.c
>> @@ -650,6 +650,31 @@ void btrfs_remove_ordered_extent(struct inode *inode,
>>         wake_up(&entry->wait);
>>  }
>>
>> +void btrfs_clean_ordered_extent(struct inode *inode, u64 file_offset,
>> +                               u64 ram_len)
>> +{
>> +       struct btrfs_ordered_extent *entry;
>> +       struct btrfs_root *root = BTRFS_I(inode)->root;
>> +
>> +       entry = btrfs_lookup_ordered_range(inode, file_offset, ram_len);
>> +       if (!entry || entry->file_offset != file_offset ||
>> +           entry->len != ram_len)
>> +               goto not_found;
>> +
>> +       /* Same as btrfs_finish_ordered_io() */
>> +       btrfs_remove_ordered_extent(inode, entry);
>> +       btrfs_put_ordered_extent(entry);
>> +       btrfs_put_ordered_extent(entry);
>> +       return;
>> +
>> +not_found:
>> +       WARN_ON(1);
>> +       btrfs_err(root->fs_info,
>> +       "failed to find and clean ordered extent: root %llu ino %llu file_offset %llu len %llu",
>> +                 root->objectid, btrfs_ino(inode), file_offset, ram_len);
>> +       return;
>> +}
>> +
>>  static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
>>  {
>>         struct btrfs_ordered_extent *ordered;
>> diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
>> index 5f2b0ca28705..7a989778aa89 100644
>> --- a/fs/btrfs/ordered-data.h
>> +++ b/fs/btrfs/ordered-data.h
>> @@ -163,6 +163,16 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
>>  void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
>>  void btrfs_remove_ordered_extent(struct inode *inode,
>>                                 struct btrfs_ordered_extent *entry);
>> +
>> +/*
>> + * Function to cleanup an allocated ordered extent in error routine.
>> + *
>> + * As error handler in run_delalloc_range() will clear all related pages
>> + * and skip their IO, we have no method to finish inserted ordered extent.
>> + * So we must use this function to clean it up.
>> + */
>> +void btrfs_clean_ordered_extent(struct inode *inode, u64 file_offset,
>> +                               u64 ram_len);
>>  int btrfs_dec_test_ordered_pending(struct inode *inode,
>>                                    struct btrfs_ordered_extent **cached,
>>                                    u64 file_offset, u64 io_size, int uptodate);
>> --
>> 2.11.1
>>
>>
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
>
>


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Filipe Manana Feb. 16, 2017, 10:03 a.m. UTC | #3
On Thu, Feb 16, 2017 at 12:39 AM, Qu Wenruo <quwenruo@cn.fujitsu.com> wrote:
>
>
> At 02/15/2017 11:59 PM, Filipe Manana wrote:
>>
>> On Wed, Feb 15, 2017 at 8:49 AM, Qu Wenruo <quwenruo@cn.fujitsu.com>
>> wrote:
>>>
>>> If run btrfs/125 with nospace_cache or space_cache=v2 mount option,
>>> btrfs will block with the following backtrace:
>>>
>>> Call Trace:
>>>  __schedule+0x2d4/0xae0
>>>  schedule+0x3d/0x90
>>>  btrfs_start_ordered_extent+0x160/0x200 [btrfs]
>>>  ? wake_atomic_t_function+0x60/0x60
>>>  btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
>>>  btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
>>>  btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
>>>  process_one_work+0x2af/0x720
>>>  ? process_one_work+0x22b/0x720
>>>  worker_thread+0x4b/0x4f0
>>>  kthread+0x10f/0x150
>>>  ? process_one_work+0x720/0x720
>>>  ? kthread_create_on_node+0x40/0x40
>>>  ret_from_fork+0x2e/0x40
>>>
>>> The direct cause is the error handler in run_delalloc_nocow() doesn't
>>> handle error from btrfs_reloc_clone_csums() well.
>>>
>>> The related part call path will be:
>>> __extent_writepage
>>> |- writepage_delalloc()
>>> |  |- run_delalloc_range()
>>> |     |- run_delalloc_nocow()
>>> |        |- btrfs_add_ordered_extent()
>>> |        |  Now one ordered extent for file range, e.g [0, 1M) is
>>> inserted
>>> |        |
>>> |        |- btrfs_reloc_clone_csums()
>>> |        |  Fails with -EIO, as RAID5/6 doesn't repair some csum tree
>>> |        |  blocks
>>> |        |
>>> |        |- extent_clear_unlock_delalloc()
>>> |           Error routine, unlock and clear page DIRTY, end page
>>> writeback
>>> |           So the remaining 255 pages will not go through writeback
>>> |
>>> |- __extent_writepage_io()
>>>    |- writepage_end_io_hook()
>>>       |- btrfs_dev_test_ordered_pending()
>>>          Reduce ordered_extent->bytes_left by 4K.
>>>          Still have (1M - 4K) to finish.
>>>
>>> While the remaining 255 pages will not go through IO nor trigger
>>> writepage_end_io_hook(), the ordered extent for [0, 1M) will
>>> never finish, and blocking current transaction forever.
>>>
>>> Although the root cause is still in RAID5/6, it won't hurt to fix the
>>> error routine first.
>>>
>>> This patch will cleanup the ordered extent in error routine, so at least
>>> we won't cause deadlock.
>>>
>>> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
>>> ---
>>>  fs/btrfs/extent_io.c    |  1 -
>>>  fs/btrfs/inode.c        | 10 ++++++++--
>>>  fs/btrfs/ordered-data.c | 25 +++++++++++++++++++++++++
>>>  fs/btrfs/ordered-data.h | 10 ++++++++++
>>>  4 files changed, 43 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
>>> index 4ac383a3a649..a14d1b0840c5 100644
>>> --- a/fs/btrfs/extent_io.c
>>> +++ b/fs/btrfs/extent_io.c
>>> @@ -3258,7 +3258,6 @@ static noinline_for_stack int
>>> writepage_delalloc(struct inode *inode,
>>>                                                delalloc_end,
>>>                                                &page_started,
>>>                                                nr_written);
>>> -               /* File system has been set read-only */
>>>                 if (ret) {
>>>                         SetPageError(page);
>>>                         /* fill_delalloc should be return < 0 for error
>>> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
>>> index 1e861a063721..3c3ade58afd7 100644
>>> --- a/fs/btrfs/inode.c
>>> +++ b/fs/btrfs/inode.c
>>> @@ -1052,8 +1052,11 @@ static noinline int cow_file_range(struct inode
>>> *inode,
>>>                     BTRFS_DATA_RELOC_TREE_OBJECTID) {
>>>                         ret = btrfs_reloc_clone_csums(inode, start,
>>>                                                       cur_alloc_size);
>>> -                       if (ret)
>>> +                       if (ret) {
>>> +                               btrfs_clean_ordered_extent(inode, start,
>>> +                                                          ram_size);
>>>                                 goto out_drop_extent_cache;
>>> +                       }
>>>                 }
>>>
>>>                 btrfs_dec_block_group_reservations(fs_info,
>>> ins.objectid);
>>> @@ -1538,7 +1541,7 @@ static noinline int run_delalloc_nocow(struct inode
>>> *inode,
>>>         if (!ret)
>>>                 ret = err;
>>>
>>> -       if (ret && cur_offset < end)
>>> +       if (ret && cur_offset < end) {
>>>                 extent_clear_unlock_delalloc(inode, cur_offset, end, end,
>>>                                              locked_page, EXTENT_LOCKED |
>>>                                              EXTENT_DELALLOC |
>>> EXTENT_DEFRAG |
>>> @@ -1546,6 +1549,9 @@ static noinline int run_delalloc_nocow(struct inode
>>> *inode,
>>>                                              PAGE_CLEAR_DIRTY |
>>>                                              PAGE_SET_WRITEBACK |
>>>                                              PAGE_END_WRITEBACK);
>>> +               btrfs_clean_ordered_extent(inode, cur_offset,
>>> +                                          end - cur_offset + 1);
>>
>>
>> So this is partially correct only.
>> First here you can have 0, 1 or more ordered extents that were created
>> in the while loop. So your new function must find and process all
>> ordered extents within the delalloc range and not just the last one
>> that was created.
>
>
> OK, I found that only btrfs_reloc_clone_csums() will need to cleanup ordered
> extent, and there is no other case which can cause error after adding
> ordered extent.
>
> So what about moving the btrfs_clean_ordered_extent() call to just after
> btrfs_reloc_clone_csums() instead of putting it under error tag?

Unfortunately it's not as simple as that, because when it fails, past
iterations of the while loop have created other ordered extents.
Same goes for cow_file_range().

So we really need something like what is done for the error path of
dio writes, which gets rid of all ordered extents and sets the IO
error bit on them as well.

>
> In that case, since we haven't unlock pages/extent, there will no race nor
> new ordered extent, and we are ensured to have only one ordered extent need
> cleanup.
>
>>
>> Also, for any created ordered extent, you want to set the bit
>> BTRFS_ORDERED_IOERR on it before removing it, since there might be
>> concurrent tasks waiting for it that do error handling (like an fsync,
>> since when running delalloc the inode isn't locked).
>
>
> Fsync is not affected IIRC, as btrfs_reloc_clone_csum() is only called for
> DATA_RELOC tree inode, which we don't call fsync on it.
>
>>
>> Look at the direct IO write path error handling and
>> btrfs_endio_direct_write_update_ordered() for a very similar scenario.
>> And the same problem happens in the cow case (inode.c:cow_file_range()).
>
>
> BTW, I have already done it in cow_file_range(), check the beginning lines
> of inode.c of this patch.

Indeed. I missed it.

Thanks.

>
> Thanks,
> Qu
>
>>
>> Thanks.
>>
>>> +       }
>>>         btrfs_free_path(path);
>>>         return ret;
>>>  }
>>> diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
>>> index 041c3326d109..dba1cf3464a7 100644
>>> --- a/fs/btrfs/ordered-data.c
>>> +++ b/fs/btrfs/ordered-data.c
>>> @@ -650,6 +650,31 @@ void btrfs_remove_ordered_extent(struct inode
>>> *inode,
>>>         wake_up(&entry->wait);
>>>  }
>>>
>>> +void btrfs_clean_ordered_extent(struct inode *inode, u64 file_offset,
>>> +                               u64 ram_len)
>>> +{
>>> +       struct btrfs_ordered_extent *entry;
>>> +       struct btrfs_root *root = BTRFS_I(inode)->root;
>>> +
>>> +       entry = btrfs_lookup_ordered_range(inode, file_offset, ram_len);
>>> +       if (!entry || entry->file_offset != file_offset ||
>>> +           entry->len != ram_len)
>>> +               goto not_found;
>>> +
>>> +       /* Same as btrfs_finish_ordered_io() */
>>> +       btrfs_remove_ordered_extent(inode, entry);
>>> +       btrfs_put_ordered_extent(entry);
>>> +       btrfs_put_ordered_extent(entry);
>>> +       return;
>>> +
>>> +not_found:
>>> +       WARN_ON(1);
>>> +       btrfs_err(root->fs_info,
>>> +       "failed to find and clean ordered extent: root %llu ino %llu
>>> file_offset %llu len %llu",
>>> +                 root->objectid, btrfs_ino(inode), file_offset,
>>> ram_len);
>>> +       return;
>>> +}
>>> +
>>>  static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
>>>  {
>>>         struct btrfs_ordered_extent *ordered;
>>> diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
>>> index 5f2b0ca28705..7a989778aa89 100644
>>> --- a/fs/btrfs/ordered-data.h
>>> +++ b/fs/btrfs/ordered-data.h
>>> @@ -163,6 +163,16 @@ btrfs_ordered_inode_tree_init(struct
>>> btrfs_ordered_inode_tree *t)
>>>  void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
>>>  void btrfs_remove_ordered_extent(struct inode *inode,
>>>                                 struct btrfs_ordered_extent *entry);
>>> +
>>> +/*
>>> + * Function to cleanup an allocated ordered extent in error routine.
>>> + *
>>> + * As error handler in run_delalloc_range() will clear all related pages
>>> + * and skip their IO, we have no method to finish inserted ordered
>>> extent.
>>> + * So we must use this function to clean it up.
>>> + */
>>> +void btrfs_clean_ordered_extent(struct inode *inode, u64 file_offset,
>>> +                               u64 ram_len);
>>>  int btrfs_dec_test_ordered_pending(struct inode *inode,
>>>                                    struct btrfs_ordered_extent **cached,
>>>                                    u64 file_offset, u64 io_size, int
>>> uptodate);
>>> --
>>> 2.11.1
>>>
>>>
>>>
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
>>
>>
>>
>
>
Qu Wenruo Feb. 17, 2017, 12:37 a.m. UTC | #4
At 02/16/2017 06:03 PM, Filipe Manana wrote:
> On Thu, Feb 16, 2017 at 12:39 AM, Qu Wenruo <quwenruo@cn.fujitsu.com> wrote:
>>
>>
>> At 02/15/2017 11:59 PM, Filipe Manana wrote:
>>>
>>> On Wed, Feb 15, 2017 at 8:49 AM, Qu Wenruo <quwenruo@cn.fujitsu.com>
>>> wrote:
>>>>
>>>> If run btrfs/125 with nospace_cache or space_cache=v2 mount option,
>>>> btrfs will block with the following backtrace:
>>>>
>>>> Call Trace:
>>>>  __schedule+0x2d4/0xae0
>>>>  schedule+0x3d/0x90
>>>>  btrfs_start_ordered_extent+0x160/0x200 [btrfs]
>>>>  ? wake_atomic_t_function+0x60/0x60
>>>>  btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
>>>>  btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
>>>>  btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
>>>>  process_one_work+0x2af/0x720
>>>>  ? process_one_work+0x22b/0x720
>>>>  worker_thread+0x4b/0x4f0
>>>>  kthread+0x10f/0x150
>>>>  ? process_one_work+0x720/0x720
>>>>  ? kthread_create_on_node+0x40/0x40
>>>>  ret_from_fork+0x2e/0x40
>>>>
>>>> The direct cause is the error handler in run_delalloc_nocow() doesn't
>>>> handle error from btrfs_reloc_clone_csums() well.
>>>>
>>>> The related part call path will be:
>>>> __extent_writepage
>>>> |- writepage_delalloc()
>>>> |  |- run_delalloc_range()
>>>> |     |- run_delalloc_nocow()
>>>> |        |- btrfs_add_ordered_extent()
>>>> |        |  Now one ordered extent for file range, e.g [0, 1M) is
>>>> inserted
>>>> |        |
>>>> |        |- btrfs_reloc_clone_csums()
>>>> |        |  Fails with -EIO, as RAID5/6 doesn't repair some csum tree
>>>> |        |  blocks
>>>> |        |
>>>> |        |- extent_clear_unlock_delalloc()
>>>> |           Error routine, unlock and clear page DIRTY, end page
>>>> writeback
>>>> |           So the remaining 255 pages will not go through writeback
>>>> |
>>>> |- __extent_writepage_io()
>>>>    |- writepage_end_io_hook()
>>>>       |- btrfs_dev_test_ordered_pending()
>>>>          Reduce ordered_extent->bytes_left by 4K.
>>>>          Still have (1M - 4K) to finish.
>>>>
>>>> While the remaining 255 pages will not go through IO nor trigger
>>>> writepage_end_io_hook(), the ordered extent for [0, 1M) will
>>>> never finish, and blocking current transaction forever.
>>>>
>>>> Although the root cause is still in RAID5/6, it won't hurt to fix the
>>>> error routine first.
>>>>
>>>> This patch will cleanup the ordered extent in error routine, so at least
>>>> we won't cause deadlock.
>>>>
>>>> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
>>>> ---
>>>>  fs/btrfs/extent_io.c    |  1 -
>>>>  fs/btrfs/inode.c        | 10 ++++++++--
>>>>  fs/btrfs/ordered-data.c | 25 +++++++++++++++++++++++++
>>>>  fs/btrfs/ordered-data.h | 10 ++++++++++
>>>>  4 files changed, 43 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
>>>> index 4ac383a3a649..a14d1b0840c5 100644
>>>> --- a/fs/btrfs/extent_io.c
>>>> +++ b/fs/btrfs/extent_io.c
>>>> @@ -3258,7 +3258,6 @@ static noinline_for_stack int
>>>> writepage_delalloc(struct inode *inode,
>>>>                                                delalloc_end,
>>>>                                                &page_started,
>>>>                                                nr_written);
>>>> -               /* File system has been set read-only */
>>>>                 if (ret) {
>>>>                         SetPageError(page);
>>>>                         /* fill_delalloc should be return < 0 for error
>>>> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
>>>> index 1e861a063721..3c3ade58afd7 100644
>>>> --- a/fs/btrfs/inode.c
>>>> +++ b/fs/btrfs/inode.c
>>>> @@ -1052,8 +1052,11 @@ static noinline int cow_file_range(struct inode
>>>> *inode,
>>>>                     BTRFS_DATA_RELOC_TREE_OBJECTID) {
>>>>                         ret = btrfs_reloc_clone_csums(inode, start,
>>>>                                                       cur_alloc_size);
>>>> -                       if (ret)
>>>> +                       if (ret) {
>>>> +                               btrfs_clean_ordered_extent(inode, start,
>>>> +                                                          ram_size);
>>>>                                 goto out_drop_extent_cache;
>>>> +                       }
>>>>                 }
>>>>
>>>>                 btrfs_dec_block_group_reservations(fs_info,
>>>> ins.objectid);
>>>> @@ -1538,7 +1541,7 @@ static noinline int run_delalloc_nocow(struct inode
>>>> *inode,
>>>>         if (!ret)
>>>>                 ret = err;
>>>>
>>>> -       if (ret && cur_offset < end)
>>>> +       if (ret && cur_offset < end) {
>>>>                 extent_clear_unlock_delalloc(inode, cur_offset, end, end,
>>>>                                              locked_page, EXTENT_LOCKED |
>>>>                                              EXTENT_DELALLOC |
>>>> EXTENT_DEFRAG |
>>>> @@ -1546,6 +1549,9 @@ static noinline int run_delalloc_nocow(struct inode
>>>> *inode,
>>>>                                              PAGE_CLEAR_DIRTY |
>>>>                                              PAGE_SET_WRITEBACK |
>>>>                                              PAGE_END_WRITEBACK);
>>>> +               btrfs_clean_ordered_extent(inode, cur_offset,
>>>> +                                          end - cur_offset + 1);
>>>
>>>
>>> So this is partially correct only.
>>> First here you can have 0, 1 or more ordered extents that were created
>>> in the while loop. So your new function must find and process all
>>> ordered extents within the delalloc range and not just the last one
>>> that was created.
>>
>>
>> OK, I found that only btrfs_reloc_clone_csums() will need to cleanup ordered
>> extent, and there is no other case which can cause error after adding
>> ordered extent.
>>
>> So what about moving the btrfs_clean_ordered_extent() call to just after
>> btrfs_reloc_clone_csums() instead of putting it under error tag?
>
> Unfortunately it's not as simple as that, because when it fails, past
> iterations of the while loop have created other ordered extents.
> Same goes for cow_file_range().
>
> So we really need something like what is done for the error path of
> dio writes, which gets rid of all ordered extents and sets the IO
> error bit on them as well.

The IO error bit makes sense.

While I'm still not sure about the necessity of cleaning up previously created 
ordered extents, since they could finish via their endio functions.

Or is this a designed behavior for things like fsync?
Success for all or failure if any fails?

Thanks,
Qu
>
>>
>> In that case, since we haven't unlock pages/extent, there will no race nor
>> new ordered extent, and we are ensured to have only one ordered extent need
>> cleanup.
>>
>>>
>>> Also, for any created ordered extent, you want to set the bit
>>> BTRFS_ORDERED_IOERR on it before removing it, since there might be
>>> concurrent tasks waiting for it that do error handling (like an fsync,
>>> since when running delalloc the inode isn't locked).
>>
>>
>> Fsync is not affected IIRC, as btrfs_reloc_clone_csum() is only called for
>> DATA_RELOC tree inode, which we don't call fsync on it.
>>
>>>
>>> Look at the direct IO write path error handling and
>>> btrfs_endio_direct_write_update_ordered() for a very similar scenario.
>>> And the same problem happens in the cow case (inode.c:cow_file_range()).
>>
>>
>> BTW, I have already done it in cow_file_range(), check the beginning lines
>> of inode.c of this patch.
>
> Indeed. I missed it.
>
> Thanks.
>
>>
>> Thanks,
>> Qu
>>
>>>
>>> Thanks.
>>>
>>>> +       }
>>>>         btrfs_free_path(path);
>>>>         return ret;
>>>>  }
>>>> diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
>>>> index 041c3326d109..dba1cf3464a7 100644
>>>> --- a/fs/btrfs/ordered-data.c
>>>> +++ b/fs/btrfs/ordered-data.c
>>>> @@ -650,6 +650,31 @@ void btrfs_remove_ordered_extent(struct inode
>>>> *inode,
>>>>         wake_up(&entry->wait);
>>>>  }
>>>>
>>>> +void btrfs_clean_ordered_extent(struct inode *inode, u64 file_offset,
>>>> +                               u64 ram_len)
>>>> +{
>>>> +       struct btrfs_ordered_extent *entry;
>>>> +       struct btrfs_root *root = BTRFS_I(inode)->root;
>>>> +
>>>> +       entry = btrfs_lookup_ordered_range(inode, file_offset, ram_len);
>>>> +       if (!entry || entry->file_offset != file_offset ||
>>>> +           entry->len != ram_len)
>>>> +               goto not_found;
>>>> +
>>>> +       /* Same as btrfs_finish_ordered_io() */
>>>> +       btrfs_remove_ordered_extent(inode, entry);
>>>> +       btrfs_put_ordered_extent(entry);
>>>> +       btrfs_put_ordered_extent(entry);
>>>> +       return;
>>>> +
>>>> +not_found:
>>>> +       WARN_ON(1);
>>>> +       btrfs_err(root->fs_info,
>>>> +       "failed to find and clean ordered extent: root %llu ino %llu
>>>> file_offset %llu len %llu",
>>>> +                 root->objectid, btrfs_ino(inode), file_offset,
>>>> ram_len);
>>>> +       return;
>>>> +}
>>>> +
>>>>  static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
>>>>  {
>>>>         struct btrfs_ordered_extent *ordered;
>>>> diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
>>>> index 5f2b0ca28705..7a989778aa89 100644
>>>> --- a/fs/btrfs/ordered-data.h
>>>> +++ b/fs/btrfs/ordered-data.h
>>>> @@ -163,6 +163,16 @@ btrfs_ordered_inode_tree_init(struct
>>>> btrfs_ordered_inode_tree *t)
>>>>  void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
>>>>  void btrfs_remove_ordered_extent(struct inode *inode,
>>>>                                 struct btrfs_ordered_extent *entry);
>>>> +
>>>> +/*
>>>> + * Function to cleanup an allocated ordered extent in error routine.
>>>> + *
>>>> + * As error handler in run_delalloc_range() will clear all related pages
>>>> + * and skip their IO, we have no method to finish inserted ordered
>>>> extent.
>>>> + * So we must use this function to clean it up.
>>>> + */
>>>> +void btrfs_clean_ordered_extent(struct inode *inode, u64 file_offset,
>>>> +                               u64 ram_len);
>>>>  int btrfs_dec_test_ordered_pending(struct inode *inode,
>>>>                                    struct btrfs_ordered_extent **cached,
>>>>                                    u64 file_offset, u64 io_size, int
>>>> uptodate);
>>>> --
>>>> 2.11.1
>>>>
>>>>
>>>>
>>>> --
>>>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>>>> the body of a message to majordomo@vger.kernel.org
>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>
>>>
>>>
>>>
>>
>>
>
>
>


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Filipe Manana Feb. 17, 2017, 3:25 p.m. UTC | #5
On Fri, Feb 17, 2017 at 12:37 AM, Qu Wenruo <quwenruo@cn.fujitsu.com> wrote:
>
>
> At 02/16/2017 06:03 PM, Filipe Manana wrote:
>>
>> On Thu, Feb 16, 2017 at 12:39 AM, Qu Wenruo <quwenruo@cn.fujitsu.com>
>> wrote:
>>>
>>>
>>>
>>> At 02/15/2017 11:59 PM, Filipe Manana wrote:
>>>>
>>>>
>>>> On Wed, Feb 15, 2017 at 8:49 AM, Qu Wenruo <quwenruo@cn.fujitsu.com>
>>>> wrote:
>>>>>
>>>>>
>>>>> If run btrfs/125 with nospace_cache or space_cache=v2 mount option,
>>>>> btrfs will block with the following backtrace:
>>>>>
>>>>> Call Trace:
>>>>>  __schedule+0x2d4/0xae0
>>>>>  schedule+0x3d/0x90
>>>>>  btrfs_start_ordered_extent+0x160/0x200 [btrfs]
>>>>>  ? wake_atomic_t_function+0x60/0x60
>>>>>  btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
>>>>>  btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
>>>>>  btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
>>>>>  process_one_work+0x2af/0x720
>>>>>  ? process_one_work+0x22b/0x720
>>>>>  worker_thread+0x4b/0x4f0
>>>>>  kthread+0x10f/0x150
>>>>>  ? process_one_work+0x720/0x720
>>>>>  ? kthread_create_on_node+0x40/0x40
>>>>>  ret_from_fork+0x2e/0x40
>>>>>
>>>>> The direct cause is the error handler in run_delalloc_nocow() doesn't
>>>>> handle error from btrfs_reloc_clone_csums() well.
>>>>>
>>>>> The related part call path will be:
>>>>> __extent_writepage
>>>>> |- writepage_delalloc()
>>>>> |  |- run_delalloc_range()
>>>>> |     |- run_delalloc_nocow()
>>>>> |        |- btrfs_add_ordered_extent()
>>>>> |        |  Now one ordered extent for file range, e.g [0, 1M) is
>>>>> inserted
>>>>> |        |
>>>>> |        |- btrfs_reloc_clone_csums()
>>>>> |        |  Fails with -EIO, as RAID5/6 doesn't repair some csum tree
>>>>> |        |  blocks
>>>>> |        |
>>>>> |        |- extent_clear_unlock_delalloc()
>>>>> |           Error routine, unlock and clear page DIRTY, end page
>>>>> writeback
>>>>> |           So the remaining 255 pages will not go through writeback
>>>>> |
>>>>> |- __extent_writepage_io()
>>>>>    |- writepage_end_io_hook()
>>>>>       |- btrfs_dev_test_ordered_pending()
>>>>>          Reduce ordered_extent->bytes_left by 4K.
>>>>>          Still have (1M - 4K) to finish.
>>>>>
>>>>> While the remaining 255 pages will not go through IO nor trigger
>>>>> writepage_end_io_hook(), the ordered extent for [0, 1M) will
>>>>> never finish, and blocking current transaction forever.
>>>>>
>>>>> Although the root cause is still in RAID5/6, it won't hurt to fix the
>>>>> error routine first.
>>>>>
>>>>> This patch will cleanup the ordered extent in error routine, so at
>>>>> least
>>>>> we won't cause deadlock.
>>>>>
>>>>> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
>>>>> ---
>>>>>  fs/btrfs/extent_io.c    |  1 -
>>>>>  fs/btrfs/inode.c        | 10 ++++++++--
>>>>>  fs/btrfs/ordered-data.c | 25 +++++++++++++++++++++++++
>>>>>  fs/btrfs/ordered-data.h | 10 ++++++++++
>>>>>  4 files changed, 43 insertions(+), 3 deletions(-)
>>>>>
>>>>> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
>>>>> index 4ac383a3a649..a14d1b0840c5 100644
>>>>> --- a/fs/btrfs/extent_io.c
>>>>> +++ b/fs/btrfs/extent_io.c
>>>>> @@ -3258,7 +3258,6 @@ static noinline_for_stack int
>>>>> writepage_delalloc(struct inode *inode,
>>>>>                                                delalloc_end,
>>>>>                                                &page_started,
>>>>>                                                nr_written);
>>>>> -               /* File system has been set read-only */
>>>>>                 if (ret) {
>>>>>                         SetPageError(page);
>>>>>                         /* fill_delalloc should be return < 0 for error
>>>>> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
>>>>> index 1e861a063721..3c3ade58afd7 100644
>>>>> --- a/fs/btrfs/inode.c
>>>>> +++ b/fs/btrfs/inode.c
>>>>> @@ -1052,8 +1052,11 @@ static noinline int cow_file_range(struct inode
>>>>> *inode,
>>>>>                     BTRFS_DATA_RELOC_TREE_OBJECTID) {
>>>>>                         ret = btrfs_reloc_clone_csums(inode, start,
>>>>>                                                       cur_alloc_size);
>>>>> -                       if (ret)
>>>>> +                       if (ret) {
>>>>> +                               btrfs_clean_ordered_extent(inode,
>>>>> start,
>>>>> +                                                          ram_size);
>>>>>                                 goto out_drop_extent_cache;
>>>>> +                       }
>>>>>                 }
>>>>>
>>>>>                 btrfs_dec_block_group_reservations(fs_info,
>>>>> ins.objectid);
>>>>> @@ -1538,7 +1541,7 @@ static noinline int run_delalloc_nocow(struct
>>>>> inode
>>>>> *inode,
>>>>>         if (!ret)
>>>>>                 ret = err;
>>>>>
>>>>> -       if (ret && cur_offset < end)
>>>>> +       if (ret && cur_offset < end) {
>>>>>                 extent_clear_unlock_delalloc(inode, cur_offset, end,
>>>>> end,
>>>>>                                              locked_page, EXTENT_LOCKED
>>>>> |
>>>>>                                              EXTENT_DELALLOC |
>>>>> EXTENT_DEFRAG |
>>>>> @@ -1546,6 +1549,9 @@ static noinline int run_delalloc_nocow(struct
>>>>> inode
>>>>> *inode,
>>>>>                                              PAGE_CLEAR_DIRTY |
>>>>>                                              PAGE_SET_WRITEBACK |
>>>>>                                              PAGE_END_WRITEBACK);
>>>>> +               btrfs_clean_ordered_extent(inode, cur_offset,
>>>>> +                                          end - cur_offset + 1);
>>>>
>>>>
>>>>
>>>> So this is partially correct only.
>>>> First here you can have 0, 1 or more ordered extents that were created
>>>> in the while loop. So your new function must find and process all
>>>> ordered extents within the delalloc range and not just the last one
>>>> that was created.
>>>
>>>
>>>
>>> OK, I found that only btrfs_reloc_clone_csums() will need to cleanup
>>> ordered
>>> extent, and there is no other case which can cause error after adding
>>> ordered extent.
>>>
>>> So what about moving the btrfs_clean_ordered_extent() call to just after
>>> btrfs_reloc_clone_csums() instead of putting it under error tag?
>>
>>
>> Unfortunately it's not as simple as that, because when it fails, past
>> iterations of the while loop have created other ordered extents.
>> Same goes for cow_file_range().
>>
>> So we really need something like what is done for the error path of
>> dio writes, which gets rid of all ordered extents and sets the IO
>> error bit on them as well.
>
>
> The IO error bit makes sense.
>
> While I'm still not sure the necessity to cleanup previously created ordered
> extent, since they could finish using endio functions.

In case of an error the endio handler is called only for the first
page, and not for the whole range for which one or more ordered extents
were created. So the ordered extents will be around forever.

>
> Or is this a designed behavior for things like fsync?
> Success for all or failure if any fails?

This is unrelated to fsync. It's about leaving ordered extents which
can make any task get them and hang on them forever.

thanks

>
> Thanks,
> Qu
>
>>
>>>
>>> In that case, since we haven't unlock pages/extent, there will no race
>>> nor
>>> new ordered extent, and we are ensured to have only one ordered extent
>>> need
>>> cleanup.
>>>
>>>>
>>>> Also, for any created ordered extent, you want to set the bit
>>>> BTRFS_ORDERED_IOERR on it before removing it, since there might be
>>>> concurrent tasks waiting for it that do error handling (like an fsync,
>>>> since when running delalloc the inode isn't locked).
>>>
>>>
>>>
>>> Fsync is not affected IIRC, as btrfs_reloc_clone_csum() is only called
>>> for
>>> DATA_RELOC tree inode, which we don't call fsync on it.
>>>
>>>>
>>>> Look at the direct IO write path error handling and
>>>> btrfs_endio_direct_write_update_ordered() for a very similar scenario.
>>>> And the same problem happens in the cow case (inode.c:cow_file_range()).
>>>
>>>
>>>
>>> BTW, I have already done it in cow_file_range(), check the beginning
>>> lines
>>> of inode.c of this patch.
>>
>>
>> Indeed. I missed it.
>>
>> Thanks.
>>
>>>
>>> Thanks,
>>> Qu
>>>
>>>>
>>>> Thanks.
>>>>
>>>>> +       }
>>>>>         btrfs_free_path(path);
>>>>>         return ret;
>>>>>  }
>>>>> diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
>>>>> index 041c3326d109..dba1cf3464a7 100644
>>>>> --- a/fs/btrfs/ordered-data.c
>>>>> +++ b/fs/btrfs/ordered-data.c
>>>>> @@ -650,6 +650,31 @@ void btrfs_remove_ordered_extent(struct inode
>>>>> *inode,
>>>>>         wake_up(&entry->wait);
>>>>>  }
>>>>>
>>>>> +void btrfs_clean_ordered_extent(struct inode *inode, u64 file_offset,
>>>>> +                               u64 ram_len)
>>>>> +{
>>>>> +       struct btrfs_ordered_extent *entry;
>>>>> +       struct btrfs_root *root = BTRFS_I(inode)->root;
>>>>> +
>>>>> +       entry = btrfs_lookup_ordered_range(inode, file_offset,
>>>>> ram_len);
>>>>> +       if (!entry || entry->file_offset != file_offset ||
>>>>> +           entry->len != ram_len)
>>>>> +               goto not_found;
>>>>> +
>>>>> +       /* Same as btrfs_finish_ordered_io() */
>>>>> +       btrfs_remove_ordered_extent(inode, entry);
>>>>> +       btrfs_put_ordered_extent(entry);
>>>>> +       btrfs_put_ordered_extent(entry);
>>>>> +       return;
>>>>> +
>>>>> +not_found:
>>>>> +       WARN_ON(1);
>>>>> +       btrfs_err(root->fs_info,
>>>>> +       "failed to find and clean ordered extent: root %llu ino %llu
>>>>> file_offset %llu len %llu",
>>>>> +                 root->objectid, btrfs_ino(inode), file_offset,
>>>>> ram_len);
>>>>> +       return;
>>>>> +}
>>>>> +
>>>>>  static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
>>>>>  {
>>>>>         struct btrfs_ordered_extent *ordered;
>>>>> diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
>>>>> index 5f2b0ca28705..7a989778aa89 100644
>>>>> --- a/fs/btrfs/ordered-data.h
>>>>> +++ b/fs/btrfs/ordered-data.h
>>>>> @@ -163,6 +163,16 @@ btrfs_ordered_inode_tree_init(struct
>>>>> btrfs_ordered_inode_tree *t)
>>>>>  void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
>>>>>  void btrfs_remove_ordered_extent(struct inode *inode,
>>>>>                                 struct btrfs_ordered_extent *entry);
>>>>> +
>>>>> +/*
>>>>> + * Function to cleanup an allocated ordered extent in error routine.
>>>>> + *
>>>>> + * As error handler in run_delalloc_range() will clear all related
>>>>> pages
>>>>> + * and skip their IO, we have no method to finish inserted ordered
>>>>> extent.
>>>>> + * So we must use this function to clean it up.
>>>>> + */
>>>>> +void btrfs_clean_ordered_extent(struct inode *inode, u64 file_offset,
>>>>> +                               u64 ram_len);
>>>>>  int btrfs_dec_test_ordered_pending(struct inode *inode,
>>>>>                                    struct btrfs_ordered_extent
>>>>> **cached,
>>>>>                                    u64 file_offset, u64 io_size, int
>>>>> uptodate);
>>>>> --
>>>>> 2.11.1
>>>>>
>>>>>
>>>>>
>>>>> --
>>>>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
>>>>> in
>>>>> the body of a message to majordomo@vger.kernel.org
>>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>>
>>>>
>>>>
>>>>
>>>>
>>>
>>>
>>
>>
>>
>
>
Qu Wenruo Feb. 20, 2017, 12:31 a.m. UTC | #6
At 02/17/2017 11:25 PM, Filipe Manana wrote:
> On Fri, Feb 17, 2017 at 12:37 AM, Qu Wenruo <quwenruo@cn.fujitsu.com> wrote:
>>
>>
>> At 02/16/2017 06:03 PM, Filipe Manana wrote:
>>>
>>> On Thu, Feb 16, 2017 at 12:39 AM, Qu Wenruo <quwenruo@cn.fujitsu.com>
>>> wrote:
>>>>
>>>>
>>>>
>>>> At 02/15/2017 11:59 PM, Filipe Manana wrote:
>>>>>
>>>>>
>>>>> On Wed, Feb 15, 2017 at 8:49 AM, Qu Wenruo <quwenruo@cn.fujitsu.com>
>>>>> wrote:
>>>>>>
>>>>>>
>>>>>> If run btrfs/125 with nospace_cache or space_cache=v2 mount option,
>>>>>> btrfs will block with the following backtrace:
>>>>>>
>>>>>> Call Trace:
>>>>>>  __schedule+0x2d4/0xae0
>>>>>>  schedule+0x3d/0x90
>>>>>>  btrfs_start_ordered_extent+0x160/0x200 [btrfs]
>>>>>>  ? wake_atomic_t_function+0x60/0x60
>>>>>>  btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
>>>>>>  btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
>>>>>>  btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
>>>>>>  process_one_work+0x2af/0x720
>>>>>>  ? process_one_work+0x22b/0x720
>>>>>>  worker_thread+0x4b/0x4f0
>>>>>>  kthread+0x10f/0x150
>>>>>>  ? process_one_work+0x720/0x720
>>>>>>  ? kthread_create_on_node+0x40/0x40
>>>>>>  ret_from_fork+0x2e/0x40
>>>>>>
>>>>>> The direct cause is the error handler in run_delalloc_nocow() doesn't
>>>>>> handle error from btrfs_reloc_clone_csums() well.
>>>>>>
>>>>>> The related part call path will be:
>>>>>> __extent_writepage
>>>>>> |- writepage_delalloc()
>>>>>> |  |- run_delalloc_range()
>>>>>> |     |- run_delalloc_nocow()
>>>>>> |        |- btrfs_add_ordered_extent()
>>>>>> |        |  Now one ordered extent for file range, e.g [0, 1M) is
>>>>>> inserted
>>>>>> |        |
>>>>>> |        |- btrfs_reloc_clone_csums()
>>>>>> |        |  Fails with -EIO, as RAID5/6 doesn't repair some csum tree
>>>>>> |        |  blocks
>>>>>> |        |
>>>>>> |        |- extent_clear_unlock_delalloc()
>>>>>> |           Error routine, unlock and clear page DIRTY, end page
>>>>>> writeback
>>>>>> |           So the remaining 255 pages will not go through writeback
>>>>>> |
>>>>>> |- __extent_writepage_io()
>>>>>>    |- writepage_end_io_hook()
>>>>>>       |- btrfs_dev_test_ordered_pending()
>>>>>>          Reduce ordered_extent->bytes_left by 4K.
>>>>>>          Still have (1M - 4K) to finish.
>>>>>>
>>>>>> While the remaining 255 pages will not go through IO nor trigger
>>>>>> writepage_end_io_hook(), the ordered extent for [0, 1M) will
>>>>>> never finish, and blocking current transaction forever.
>>>>>>
>>>>>> Although the root cause is still in RAID5/6, it won't hurt to fix the
>>>>>> error routine first.
>>>>>>
>>>>>> This patch will cleanup the ordered extent in error routine, so at
>>>>>> least
>>>>>> we won't cause deadlock.
>>>>>>
>>>>>> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
>>>>>> ---
>>>>>>  fs/btrfs/extent_io.c    |  1 -
>>>>>>  fs/btrfs/inode.c        | 10 ++++++++--
>>>>>>  fs/btrfs/ordered-data.c | 25 +++++++++++++++++++++++++
>>>>>>  fs/btrfs/ordered-data.h | 10 ++++++++++
>>>>>>  4 files changed, 43 insertions(+), 3 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
>>>>>> index 4ac383a3a649..a14d1b0840c5 100644
>>>>>> --- a/fs/btrfs/extent_io.c
>>>>>> +++ b/fs/btrfs/extent_io.c
>>>>>> @@ -3258,7 +3258,6 @@ static noinline_for_stack int
>>>>>> writepage_delalloc(struct inode *inode,
>>>>>>                                                delalloc_end,
>>>>>>                                                &page_started,
>>>>>>                                                nr_written);
>>>>>> -               /* File system has been set read-only */
>>>>>>                 if (ret) {
>>>>>>                         SetPageError(page);
>>>>>>                         /* fill_delalloc should be return < 0 for error
>>>>>> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
>>>>>> index 1e861a063721..3c3ade58afd7 100644
>>>>>> --- a/fs/btrfs/inode.c
>>>>>> +++ b/fs/btrfs/inode.c
>>>>>> @@ -1052,8 +1052,11 @@ static noinline int cow_file_range(struct inode
>>>>>> *inode,
>>>>>>                     BTRFS_DATA_RELOC_TREE_OBJECTID) {
>>>>>>                         ret = btrfs_reloc_clone_csums(inode, start,
>>>>>>                                                       cur_alloc_size);
>>>>>> -                       if (ret)
>>>>>> +                       if (ret) {
>>>>>> +                               btrfs_clean_ordered_extent(inode,
>>>>>> start,
>>>>>> +                                                          ram_size);
>>>>>>                                 goto out_drop_extent_cache;
>>>>>> +                       }
>>>>>>                 }
>>>>>>
>>>>>>                 btrfs_dec_block_group_reservations(fs_info,
>>>>>> ins.objectid);
>>>>>> @@ -1538,7 +1541,7 @@ static noinline int run_delalloc_nocow(struct
>>>>>> inode
>>>>>> *inode,
>>>>>>         if (!ret)
>>>>>>                 ret = err;
>>>>>>
>>>>>> -       if (ret && cur_offset < end)
>>>>>> +       if (ret && cur_offset < end) {
>>>>>>                 extent_clear_unlock_delalloc(inode, cur_offset, end,
>>>>>> end,
>>>>>>                                              locked_page, EXTENT_LOCKED
>>>>>> |
>>>>>>                                              EXTENT_DELALLOC |
>>>>>> EXTENT_DEFRAG |
>>>>>> @@ -1546,6 +1549,9 @@ static noinline int run_delalloc_nocow(struct
>>>>>> inode
>>>>>> *inode,
>>>>>>                                              PAGE_CLEAR_DIRTY |
>>>>>>                                              PAGE_SET_WRITEBACK |
>>>>>>                                              PAGE_END_WRITEBACK);
>>>>>> +               btrfs_clean_ordered_extent(inode, cur_offset,
>>>>>> +                                          end - cur_offset + 1);
>>>>>
>>>>>
>>>>>
>>>>> So this is partially correct only.
>>>>> First here you can have 0, 1 or more ordered extents that were created
>>>>> in the while loop. So your new function must find and process all
>>>>> ordered extents within the delalloc range and not just the last one
>>>>> that was created.
>>>>
>>>>
>>>>
>>>> OK, I found that only btrfs_reloc_clone_csums() will need to cleanup
>>>> ordered
>>>> extent, and there is no other case which can cause error after adding
>>>> ordered extent.
>>>>
>>>> So what about moving the btrfs_clean_ordered_extent() call to just after
>>>> btrfs_reloc_clone_csums() instead of putting it under error tag?
>>>
>>>
>>> Unfortunately it's not as simple as that, because when it fails, past
>>> iterations of the while loop have created other ordered extents.
>>> Same goes for cow_file_range().
>>>
>>> So we really need something like what is done for the error path of
>>> dio writes, which gets rid of all ordered extents and sets the IO
>>> error bit on them as well.
>>
>>
>> The IO error bit makes sense.
>>
>> While I'm still not sure the necessity to cleanup previously created ordered
>> extent, since they could finish using endio functions.
>
> In case of an error the endio handler is called only for the first
> page, and not for whole range for which one or more ordered extents
> were created. So the ordered extents will be around forever.

That's the same thing I'm trying to fix.

While I'm asking about the reason to clean up previously created ordered 
extents, which didn't encounter any error.


>
>>
>> Or is this a designed behavior for things like fsync?
>> Success for all or failure if any fails?
>
> This is unrelated to fsync. It's about leaving ordered extents which
> can make any task get them and hang on them forever.

However, in the case of btrfs_reloc_clone_csum(), if it fails, there is 
only one ordered extent that really needs to be cleaned up.

Previously created ordered extents can finish without problem, so we only 
need to clean up the last created one; there is no need to clean up all the 
ordered extents that were created.

For a more specific example of this case:

We are relocating one data block group, whose size is 1G, containing 8 
128M file extents.

For run_delalloc_nocow(), it's called on root DATA_RELOC inode 257, 
range is [0,1G),

And in the first loop iteration, it handles the first [0, 128M) range without problem.
The 1st ordered extent is submitted and can finish even before submitting 
the 2nd ordered extent.

Then when we are submitting the 2nd ordered extent, ranged [128M, 
256M), btrfs_reloc_clone_csum() fails. We must clean up at least the 2nd 
ordered extent or it will hang forever.

If I understand you right, you mean we must clean up all ordered extents 
in the range [0, 1G).

My point is, since the 1st ordered extent can be submitted and finished 
before the 2nd ordered extent's submission without problem, I don't see the 
point of cleaning up the 1st ordered extent and the non-existent 3rd~8th ordered extents.

Or I missed or misunderstand something else?

Thanks,
Qu



>
> thanks
>
>>
>> Thanks,
>> Qu
>>
>>>
>>>>
>>>> In that case, since we haven't unlock pages/extent, there will no race
>>>> nor
>>>> new ordered extent, and we are ensured to have only one ordered extent
>>>> need
>>>> cleanup.
>>>>
>>>>>
>>>>> Also, for any created ordered extent, you want to set the bit
>>>>> BTRFS_ORDERED_IOERR on it before removing it, since there might be
>>>>> concurrent tasks waiting for it that do error handling (like an fsync,
>>>>> since when running delalloc the inode isn't locked).
>>>>
>>>>
>>>>
>>>> Fsync is not affected IIRC, as btrfs_reloc_clone_csum() is only called
>>>> for
>>>> DATA_RELOC tree inode, which we don't call fsync on it.
>>>>
>>>>>
>>>>> Look at the direct IO write path error handling and
>>>>> btrfs_endio_direct_write_update_ordered() for a very similar scenario.
>>>>> And the same problem happens in the cow case (inode.c:cow_file_range()).
>>>>
>>>>
>>>>
>>>> BTW, I have already done it in cow_file_range(), check the beginning
>>>> lines
>>>> of inode.c of this patch.
>>>
>>>
>>> Indeed. I missed it.
>>>
>>> Thanks.
>>>
>>>>
>>>> Thanks,
>>>> Qu
>>>>
>>>>>
>>>>> Thanks.
>>>>>
>>>>>> +       }
>>>>>>         btrfs_free_path(path);
>>>>>>         return ret;
>>>>>>  }
>>>>>> diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
>>>>>> index 041c3326d109..dba1cf3464a7 100644
>>>>>> --- a/fs/btrfs/ordered-data.c
>>>>>> +++ b/fs/btrfs/ordered-data.c
>>>>>> @@ -650,6 +650,31 @@ void btrfs_remove_ordered_extent(struct inode
>>>>>> *inode,
>>>>>>         wake_up(&entry->wait);
>>>>>>  }
>>>>>>
>>>>>> +void btrfs_clean_ordered_extent(struct inode *inode, u64 file_offset,
>>>>>> +                               u64 ram_len)
>>>>>> +{
>>>>>> +       struct btrfs_ordered_extent *entry;
>>>>>> +       struct btrfs_root *root = BTRFS_I(inode)->root;
>>>>>> +
>>>>>> +       entry = btrfs_lookup_ordered_range(inode, file_offset,
>>>>>> ram_len);
>>>>>> +       if (!entry || entry->file_offset != file_offset ||
>>>>>> +           entry->len != ram_len)
>>>>>> +               goto not_found;
>>>>>> +
>>>>>> +       /* Same as btrfs_finish_ordered_io() */
>>>>>> +       btrfs_remove_ordered_extent(inode, entry);
>>>>>> +       btrfs_put_ordered_extent(entry);
>>>>>> +       btrfs_put_ordered_extent(entry);
>>>>>> +       return;
>>>>>> +
>>>>>> +not_found:
>>>>>> +       WARN_ON(1);
>>>>>> +       btrfs_err(root->fs_info,
>>>>>> +       "failed to find and clean ordered extent: root %llu ino %llu
>>>>>> file_offset %llu len %llu",
>>>>>> +                 root->objectid, btrfs_ino(inode), file_offset,
>>>>>> ram_len);
>>>>>> +       return;
>>>>>> +}
>>>>>> +
>>>>>>  static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
>>>>>>  {
>>>>>>         struct btrfs_ordered_extent *ordered;
>>>>>> diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
>>>>>> index 5f2b0ca28705..7a989778aa89 100644
>>>>>> --- a/fs/btrfs/ordered-data.h
>>>>>> +++ b/fs/btrfs/ordered-data.h
>>>>>> @@ -163,6 +163,16 @@ btrfs_ordered_inode_tree_init(struct
>>>>>> btrfs_ordered_inode_tree *t)
>>>>>>  void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
>>>>>>  void btrfs_remove_ordered_extent(struct inode *inode,
>>>>>>                                 struct btrfs_ordered_extent *entry);
>>>>>> +
>>>>>> +/*
>>>>>> + * Function to cleanup an allocated ordered extent in error routine.
>>>>>> + *
>>>>>> + * As error handler in run_delalloc_range() will clear all related
>>>>>> pages
>>>>>> + * and skip their IO, we have no method to finish inserted ordered
>>>>>> extent.
>>>>>> + * So we must use this function to clean it up.
>>>>>> + */
>>>>>> +void btrfs_clean_ordered_extent(struct inode *inode, u64 file_offset,
>>>>>> +                               u64 ram_len);
>>>>>>  int btrfs_dec_test_ordered_pending(struct inode *inode,
>>>>>>                                    struct btrfs_ordered_extent
>>>>>> **cached,
>>>>>>                                    u64 file_offset, u64 io_size, int
>>>>>> uptodate);
>>>>>> --
>>>>>> 2.11.1
>>>>>>
>>>>>>
>>>>>>
>>>>>> --
>>>>>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
>>>>>> in
>>>>>> the body of a message to majordomo@vger.kernel.org
>>>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>>>
>>>>>
>>>>>
>>>>>
>>>>>
>>>>
>>>>
>>>
>>>
>>>
>>
>>
>
>
>


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Filipe Manana Feb. 20, 2017, 1:43 p.m. UTC | #7
On Mon, Feb 20, 2017 at 12:31 AM, Qu Wenruo <quwenruo@cn.fujitsu.com> wrote:
>
>
> At 02/17/2017 11:25 PM, Filipe Manana wrote:
>>
>> On Fri, Feb 17, 2017 at 12:37 AM, Qu Wenruo <quwenruo@cn.fujitsu.com>
>> wrote:
>>>
>>>
>>>
>>> At 02/16/2017 06:03 PM, Filipe Manana wrote:
>>>>
>>>>
>>>> On Thu, Feb 16, 2017 at 12:39 AM, Qu Wenruo <quwenruo@cn.fujitsu.com>
>>>> wrote:
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> At 02/15/2017 11:59 PM, Filipe Manana wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>> On Wed, Feb 15, 2017 at 8:49 AM, Qu Wenruo <quwenruo@cn.fujitsu.com>
>>>>>> wrote:
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> If run btrfs/125 with nospace_cache or space_cache=v2 mount option,
>>>>>>> btrfs will block with the following backtrace:
>>>>>>>
>>>>>>> Call Trace:
>>>>>>>  __schedule+0x2d4/0xae0
>>>>>>>  schedule+0x3d/0x90
>>>>>>>  btrfs_start_ordered_extent+0x160/0x200 [btrfs]
>>>>>>>  ? wake_atomic_t_function+0x60/0x60
>>>>>>>  btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
>>>>>>>  btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
>>>>>>>  btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
>>>>>>>  process_one_work+0x2af/0x720
>>>>>>>  ? process_one_work+0x22b/0x720
>>>>>>>  worker_thread+0x4b/0x4f0
>>>>>>>  kthread+0x10f/0x150
>>>>>>>  ? process_one_work+0x720/0x720
>>>>>>>  ? kthread_create_on_node+0x40/0x40
>>>>>>>  ret_from_fork+0x2e/0x40
>>>>>>>
>>>>>>> The direct cause is the error handler in run_delalloc_nocow() doesn't
>>>>>>> handle error from btrfs_reloc_clone_csums() well.
>>>>>>>
>>>>>>> The related part call path will be:
>>>>>>> __extent_writepage
>>>>>>> |- writepage_delalloc()
>>>>>>> |  |- run_delalloc_range()
>>>>>>> |     |- run_delalloc_nocow()
>>>>>>> |        |- btrfs_add_ordered_extent()
>>>>>>> |        |  Now one ordered extent for file range, e.g [0, 1M) is
>>>>>>> inserted
>>>>>>> |        |
>>>>>>> |        |- btrfs_reloc_clone_csums()
>>>>>>> |        |  Fails with -EIO, as RAID5/6 doesn't repair some csum tree
>>>>>>> |        |  blocks
>>>>>>> |        |
>>>>>>> |        |- extent_clear_unlock_delalloc()
>>>>>>> |           Error routine, unlock and clear page DIRTY, end page
>>>>>>> writeback
>>>>>>> |           So the remaining 255 pages will not go through writeback
>>>>>>> |
>>>>>>> |- __extent_writepage_io()
>>>>>>>    |- writepage_end_io_hook()
>>>>>>>       |- btrfs_dec_test_ordered_pending()
>>>>>>>          Reduce ordered_extent->bytes_left by 4K.
>>>>>>>          Still have (1M - 4K) to finish.
>>>>>>>
>>>>>>> While the remaining 255 pages will not go through IO nor trigger
>>>>>>> writepage_end_io_hook(), the ordered extent for [0, 1M) will
>>>>>>> never finish, and blocking current transaction forever.
>>>>>>>
>>>>>>> Although the root cause is still in RAID5/6, it won't hurt to fix the
>>>>>>> error routine first.
>>>>>>>
>>>>>>> This patch will cleanup the ordered extent in error routine, so at
>>>>>>> least
>>>>>>> we won't cause deadlock.
>>>>>>>
>>>>>>> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
>>>>>>> ---
>>>>>>>  fs/btrfs/extent_io.c    |  1 -
>>>>>>>  fs/btrfs/inode.c        | 10 ++++++++--
>>>>>>>  fs/btrfs/ordered-data.c | 25 +++++++++++++++++++++++++
>>>>>>>  fs/btrfs/ordered-data.h | 10 ++++++++++
>>>>>>>  4 files changed, 43 insertions(+), 3 deletions(-)
>>>>>>>
>>>>>>> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
>>>>>>> index 4ac383a3a649..a14d1b0840c5 100644
>>>>>>> --- a/fs/btrfs/extent_io.c
>>>>>>> +++ b/fs/btrfs/extent_io.c
>>>>>>> @@ -3258,7 +3258,6 @@ static noinline_for_stack int
>>>>>>> writepage_delalloc(struct inode *inode,
>>>>>>>                                                delalloc_end,
>>>>>>>                                                &page_started,
>>>>>>>                                                nr_written);
>>>>>>> -               /* File system has been set read-only */
>>>>>>>                 if (ret) {
>>>>>>>                         SetPageError(page);
>>>>>>>                         /* fill_delalloc should be return < 0 for
>>>>>>> error
>>>>>>> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
>>>>>>> index 1e861a063721..3c3ade58afd7 100644
>>>>>>> --- a/fs/btrfs/inode.c
>>>>>>> +++ b/fs/btrfs/inode.c
>>>>>>> @@ -1052,8 +1052,11 @@ static noinline int cow_file_range(struct
>>>>>>> inode
>>>>>>> *inode,
>>>>>>>                     BTRFS_DATA_RELOC_TREE_OBJECTID) {
>>>>>>>                         ret = btrfs_reloc_clone_csums(inode, start,
>>>>>>>
>>>>>>> cur_alloc_size);
>>>>>>> -                       if (ret)
>>>>>>> +                       if (ret) {
>>>>>>> +                               btrfs_clean_ordered_extent(inode,
>>>>>>> start,
>>>>>>> +                                                          ram_size);
>>>>>>>                                 goto out_drop_extent_cache;
>>>>>>> +                       }
>>>>>>>                 }
>>>>>>>
>>>>>>>                 btrfs_dec_block_group_reservations(fs_info,
>>>>>>> ins.objectid);
>>>>>>> @@ -1538,7 +1541,7 @@ static noinline int run_delalloc_nocow(struct
>>>>>>> inode
>>>>>>> *inode,
>>>>>>>         if (!ret)
>>>>>>>                 ret = err;
>>>>>>>
>>>>>>> -       if (ret && cur_offset < end)
>>>>>>> +       if (ret && cur_offset < end) {
>>>>>>>                 extent_clear_unlock_delalloc(inode, cur_offset, end,
>>>>>>> end,
>>>>>>>                                              locked_page,
>>>>>>> EXTENT_LOCKED
>>>>>>> |
>>>>>>>                                              EXTENT_DELALLOC |
>>>>>>> EXTENT_DEFRAG |
>>>>>>> @@ -1546,6 +1549,9 @@ static noinline int run_delalloc_nocow(struct
>>>>>>> inode
>>>>>>> *inode,
>>>>>>>                                              PAGE_CLEAR_DIRTY |
>>>>>>>                                              PAGE_SET_WRITEBACK |
>>>>>>>                                              PAGE_END_WRITEBACK);
>>>>>>> +               btrfs_clean_ordered_extent(inode, cur_offset,
>>>>>>> +                                          end - cur_offset + 1);
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> So this is partially correct only.
>>>>>> First here you can have 0, 1 or more ordered extents that were created
>>>>>> in the while loop. So your new function must find and process all
>>>>>> ordered extents within the delalloc range and not just the last one
>>>>>> that was created.
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> OK, I found that only btrfs_reloc_clone_csums() will need to cleanup
>>>>> ordered
>>>>> extent, and there is no other case which can cause error after adding
>>>>> ordered extent.
>>>>>
>>>>> So what about moving the btrfs_clean_ordered_extent() call to just
>>>>> after
>>>>> btrfs_reloc_clone_csums() instead of putting it under error tag?
>>>>
>>>>
>>>>
>>>> Unfortunately it's not as simple as that, because when it fails, past
>>>> iterations of the while loop have created other ordered extents.
>>>> Same goes for cow_file_range().
>>>>
>>>> So we really need something like what is done for the error path of
>>>> dio writes, which gets rid of all ordered extents and sets the IO
>>>> error bit on them as well.
>>>
>>>
>>>
>>> The IO error bit makes sense.
>>>
>>> While I'm still not sure the necessity to cleanup previously created
>>> ordered
>>> extent, since they could finish using endio functions.
>>
>>
>> In case of an error the endio handler is called only for the first
>> page, and not for whole range for which one or more ordered extents
>> were created. So the ordered extents will be around forever.
>
>
> That's the same thing I'm trying to fix.
>
> While I'm asking the reason the cleanup previously created ordered extent,
> which didn't encounter any error.

Any error in the loop requires cleaning up previously created ordered
extents, because if that function returns an error no bio is submitted
and therefore the ordered extents won't be removed, except for the
case where only one ordered extent is created and the delalloc range
is a single page.

>
>
>>
>>>
>>> Or is this a designed behavior for things like fsync?
>>> Success for all or failure if any fails?
>>
>>
>> This is unrelated to fsync. It's about leaving ordered extents which
>> can make any task get them and hang on them forever.
>
>
> However in the case of btrfs_reloc_clone_csums(), if it fails, there is only
> one ordered extent really needs to clean.

The problem is more generic and you can forget about
btrfs_reloc_clone_csums(). It's about the delalloc callback returning
an error and one or more ordered extents were created by past
iterations of the loop before the error happened.

>
> Previously created ordered extent can finish without problem, so we only
> need to clean the last created one, and no need to cleanup all ordered
> extent that it created.
>
> For a more specific example of this case:
>
> We are relocating one data block group, whose size is 1G, containing 8 128M
> file extents.
>
> For run_delalloc_nocow(), it's called on root DATA_RELOC inode 257, range is
> [0,1G),
>
> And for first loop, it handles the first [0, 128M) without problem.
> 1st ordered extent is submitted and can finish even before submitting 2nd
> ordered extent.

The ordered extent is submitted but not a bio for it. It's when the
bio completes (writeback finishes) that the endio callback is invoked,
which wakes up every task waiting on the ordered extent, sets IO
errors bit if needed, removes the ordered extent, etc.

>
> Then when we are submitting 2nd ordered extent, ranged from [128M, 256M),
> btrfs_reloc_clone_csum() fails. we must cleanup at least 2nd ordered extent
> or it will hang forever.
>
> If I understand your right, you mean we must cleanup all ordered extent in
> the range [0, 1G).

Yes.

>
> My point is, since 1st ordered extent can be submitted and finished before
> 2nd ordered extent submission without problem, I didn't see the point to
> cleanup 1st ordered extent and non-exist 3rd~8th ordered extent.

As said before in this reply, if the delalloc callback
> (cow_file_range, run_delalloc_nocow) returns an error, no bios are
submitted for any ordered extents it might have created before an
error happened. Check __extent_writepage() - it calls
writepage_delalloc() which calls our delalloc callback, and if that
returns an error, it won't call __extent_writepage_io(), which is what
submits bios.

>
> Or I missed or misunderstand something else?

You missed a lot of things I suppose, namely that bios aren't
> submitted if the delalloc function returns an error.

Honestly I don't know how to explain things better to you, and was
> hoping this was easier to understand using the direct IO write error
path as an example.

thanks

>
> Thanks,
> Qu
>
>
>
>
>>
>> thanks
>>
>>>
>>> Thanks,
>>> Qu
>>>
>>>>
>>>>>
>>>>> In that case, since we haven't unlock pages/extent, there will no race
>>>>> nor
>>>>> new ordered extent, and we are ensured to have only one ordered extent
>>>>> need
>>>>> cleanup.
>>>>>
>>>>>>
>>>>>> Also, for any created ordered extent, you want to set the bit
>>>>>> BTRFS_ORDERED_IOERR on it before removing it, since there might be
>>>>>> concurrent tasks waiting for it that do error handling (like an fsync,
>>>>>> since when running delalloc the inode isn't locked).
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> Fsync is not affected IIRC, as btrfs_reloc_clone_csums() is only called
>>>>> for
>>>>> DATA_RELOC tree inode, which we don't call fsync on it.
>>>>>
>>>>>>
>>>>>> Look at the direct IO write path error handling and
>>>>>> btrfs_endio_direct_write_update_ordered() for a very similar scenario.
>>>>>> And the same problem happens in the cow case
>>>>>> (inode.c:cow_file_range()).
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> BTW, I have already done it in cow_file_range(), check the beginning
>>>>> lines
>>>>> of inode.c of this patch.
>>>>
>>>>
>>>>
>>>> Indeed. I missed it.
>>>>
>>>> Thanks.
>>>>
>>>>>
>>>>> Thanks,
>>>>> Qu
>>>>>
>>>>>>
>>>>>> Thanks.
>>>>>>
>>>>>>> +       }
>>>>>>>         btrfs_free_path(path);
>>>>>>>         return ret;
>>>>>>>  }
>>>>>>> diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
>>>>>>> index 041c3326d109..dba1cf3464a7 100644
>>>>>>> --- a/fs/btrfs/ordered-data.c
>>>>>>> +++ b/fs/btrfs/ordered-data.c
>>>>>>> @@ -650,6 +650,31 @@ void btrfs_remove_ordered_extent(struct inode
>>>>>>> *inode,
>>>>>>>         wake_up(&entry->wait);
>>>>>>>  }
>>>>>>>
>>>>>>> +void btrfs_clean_ordered_extent(struct inode *inode, u64
>>>>>>> file_offset,
>>>>>>> +                               u64 ram_len)
>>>>>>> +{
>>>>>>> +       struct btrfs_ordered_extent *entry;
>>>>>>> +       struct btrfs_root *root = BTRFS_I(inode)->root;
>>>>>>> +
>>>>>>> +       entry = btrfs_lookup_ordered_range(inode, file_offset,
>>>>>>> ram_len);
>>>>>>> +       if (!entry || entry->file_offset != file_offset ||
>>>>>>> +           entry->len != ram_len)
>>>>>>> +               goto not_found;
>>>>>>> +
>>>>>>> +       /* Same as btrfs_finish_ordered_io() */
>>>>>>> +       btrfs_remove_ordered_extent(inode, entry);
>>>>>>> +       btrfs_put_ordered_extent(entry);
>>>>>>> +       btrfs_put_ordered_extent(entry);
>>>>>>> +       return;
>>>>>>> +
>>>>>>> +not_found:
>>>>>>> +       WARN_ON(1);
>>>>>>> +       btrfs_err(root->fs_info,
>>>>>>> +       "failed to find and clean ordered extent: root %llu ino %llu
>>>>>>> file_offset %llu len %llu",
>>>>>>> +                 root->objectid, btrfs_ino(inode), file_offset,
>>>>>>> ram_len);
>>>>>>> +       return;
>>>>>>> +}
>>>>>>> +
>>>>>>>  static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
>>>>>>>  {
>>>>>>>         struct btrfs_ordered_extent *ordered;
>>>>>>> diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
>>>>>>> index 5f2b0ca28705..7a989778aa89 100644
>>>>>>> --- a/fs/btrfs/ordered-data.h
>>>>>>> +++ b/fs/btrfs/ordered-data.h
>>>>>>> @@ -163,6 +163,16 @@ btrfs_ordered_inode_tree_init(struct
>>>>>>> btrfs_ordered_inode_tree *t)
>>>>>>>  void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
>>>>>>>  void btrfs_remove_ordered_extent(struct inode *inode,
>>>>>>>                                 struct btrfs_ordered_extent *entry);
>>>>>>> +
>>>>>>> +/*
>>>>>>> + * Function to cleanup an allocated ordered extent in error routine.
>>>>>>> + *
>>>>>>> + * As error handler in run_delalloc_range() will clear all related
>>>>>>> pages
>>>>>>> + * and skip their IO, we have no method to finish inserted ordered
>>>>>>> extent.
>>>>>>> + * So we must use this function to clean it up.
>>>>>>> + */
>>>>>>> +void btrfs_clean_ordered_extent(struct inode *inode, u64
>>>>>>> file_offset,
>>>>>>> +                               u64 ram_len);
>>>>>>>  int btrfs_dec_test_ordered_pending(struct inode *inode,
>>>>>>>                                    struct btrfs_ordered_extent
>>>>>>> **cached,
>>>>>>>                                    u64 file_offset, u64 io_size, int
>>>>>>> uptodate);
>>>>>>> --
>>>>>>> 2.11.1
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> --
>>>>>>> To unsubscribe from this list: send the line "unsubscribe
>>>>>>> linux-btrfs"
>>>>>>> in
>>>>>>> the body of a message to majordomo@vger.kernel.org
>>>>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>
>>>>>
>>>>
>>>>
>>>>
>>>
>>>
>>
>>
>>
>
>
Qu Wenruo Feb. 21, 2017, 12:25 a.m. UTC | #8
At 02/20/2017 09:43 PM, Filipe Manana wrote:
> On Mon, Feb 20, 2017 at 12:31 AM, Qu Wenruo <quwenruo@cn.fujitsu.com> wrote:
>>
>>
>> At 02/17/2017 11:25 PM, Filipe Manana wrote:
>>>
>>> On Fri, Feb 17, 2017 at 12:37 AM, Qu Wenruo <quwenruo@cn.fujitsu.com>
>>> wrote:
>>>
>>>
>>> In case of an error the endio handler is called only for the first
>>> page, and not for whole range for which one or more ordered extents
>>> were created. So the ordered extents will be around forever.
>>
>>
>> That's the same thing I'm trying to fix.
>>
>> While I'm asking the reason the cleanup previously created ordered extent,
>> which didn't encounter any error.
>
> Any error in the loop requires cleaning up previously created ordered
> extents, because if that function returns an error no bio is submitted
> and therefore the ordered extents won't be removed, except for the
> case where only one ordered extent is created and the delalloc range
> is a single page.
>
>>
>>
>>>
>>>>
>>>> Or is this a designed behavior for things like fsync?
>>>> Success for all or failure if any fails?
>>>
>>>
>>> This is unrelated to fsync. It's about leaving ordered extents which
>>> can make any task get them and hang on them forever.
>>
>>
>> However in the case of btrfs_reloc_clone_csums(), if it fails, there is only
>> one ordered extent really needs to clean.
>
> The problem is more generic and you can forget about
> btrfs_reloc_clone_csums(). It's about the delalloc callback returning
> an error and one or more ordered extents were created by past
> iterations of the loop before the error happened.

While the only possible problem we can hit after submitting an ordered 
extent is btrfs_reloc_clone_csums().

>
>>
>> Previously created ordered extent can finish without problem, so we only
>> need to clean the last created one, and no need to cleanup all ordered
>> extent that it created.
>>
>> For a more specific example of this case:
>>
>> We are relocating one data block group, whose size is 1G, containing 8 128M
>> file extents.
>>
>> For run_delalloc_nocow(), it's called on root DATA_RELOC inode 257, range is
>> [0,1G),
>>
>> And for first loop, it handles the first [0, 128M) without problem.
>> 1st ordered extent is submitted and can finish even before submitting 2nd
>> ordered extent.
>
> The ordered extent is submitted but not a bio for it. It's when the
> bio completes (writeback finishes) that the endio callback is invoked,
> which wakes up every task waiting on the ordered extent, sets IO
> errors bit if needed, removes the ordered extent, etc.
>
>>
>> Then when we are submitting 2nd ordered extent, ranged from [128M, 256M),
>> btrfs_reloc_clone_csum() fails. we must cleanup at least 2nd ordered extent
>> or it will hang forever.
>>
>> If I understand your right, you mean we must cleanup all ordered extent in
>> the range [0, 1G).
>
> Yes.
>
>>
>> My point is, since 1st ordered extent can be submitted and finished before
>> 2nd ordered extent submission without problem, I didn't see the point to
>> cleanup 1st ordered extent and non-exist 3rd~8th ordered extent.
>
> As said before in this reply, if the delalloc callback
> (cow_file_range, run_delalloc_nocow) returns an error, no bios are
> submitted for any ordered extents it might have created before an
> error happened. Check __extent_writepage() - it calls
> writepage_delalloc() which calls our delalloc callback, and if that
> returns an error, it won't call __extent_writepage_io(), which is what
> submits bios.
>
>>
>> Or I missed or misunderstand something else?
>
> You missed a lot of things I suppose, namely that bios aren't
> submitted if the delalloc function returns an error.

That's the point. I misunderstand the bio submission timing, so in that 
case you're right about cleanup the whole range.

I'll update the patch to address it.

Thanks,
Qu

>
> Honestly I don't know how to explain things better to you, and was
> hoping this was easier to understand using the direct IO write error
> path as an example.
>
> thanks
>
>>
>> Thanks,
>> Qu
>>
>>
>>
>>
>>>
>>> thanks
>>>
>>>>
>>>> Thanks,
>>>> Qu
>>>>
>>>>>
>>>>>>
>>>>>> In that case, since we haven't unlock pages/extent, there will no race
>>>>>> nor
>>>>>> new ordered extent, and we are ensured to have only one ordered extent
>>>>>> need
>>>>>> cleanup.
>>>>>>
>>>>>>>
>>>>>>> Also, for any created ordered extent, you want to set the bit
>>>>>>> BTRFS_ORDERED_IOERR on it before removing it, since there might be
>>>>>>> concurrent tasks waiting for it that do error handling (like an fsync,
>>>>>>> since when running delalloc the inode isn't locked).
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> Fsync is not affected IIRC, as btrfs_reloc_clone_csums() is only called
>>>>>> for
>>>>>> DATA_RELOC tree inode, which we don't call fsync on it.
>>>>>>
>>>>>>>
>>>>>>> Look at the direct IO write path error handling and
>>>>>>> btrfs_endio_direct_write_update_ordered() for a very similar scenario.
>>>>>>> And the same problem happens in the cow case
>>>>>>> (inode.c:cow_file_range()).
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> BTW, I have already done it in cow_file_range(), check the beginning
>>>>>> lines
>>>>>> of inode.c of this patch.
>>>>>
>>>>>
>>>>>
>>>>> Indeed. I missed it.
>>>>>
>>>>> Thanks.
>>>>>
>>>>>>
>>>>>> Thanks,
>>>>>> Qu
>>>>>>
>>>>>>>
>>>>>>> Thanks.
>>>>>>>
>>>>>>>> +       }
>>>>>>>>         btrfs_free_path(path);
>>>>>>>>         return ret;
>>>>>>>>  }
>>>>>>>> diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
>>>>>>>> index 041c3326d109..dba1cf3464a7 100644
>>>>>>>> --- a/fs/btrfs/ordered-data.c
>>>>>>>> +++ b/fs/btrfs/ordered-data.c
>>>>>>>> @@ -650,6 +650,31 @@ void btrfs_remove_ordered_extent(struct inode
>>>>>>>> *inode,
>>>>>>>>         wake_up(&entry->wait);
>>>>>>>>  }
>>>>>>>>
>>>>>>>> +void btrfs_clean_ordered_extent(struct inode *inode, u64
>>>>>>>> file_offset,
>>>>>>>> +                               u64 ram_len)
>>>>>>>> +{
>>>>>>>> +       struct btrfs_ordered_extent *entry;
>>>>>>>> +       struct btrfs_root *root = BTRFS_I(inode)->root;
>>>>>>>> +
>>>>>>>> +       entry = btrfs_lookup_ordered_range(inode, file_offset,
>>>>>>>> ram_len);
>>>>>>>> +       if (!entry || entry->file_offset != file_offset ||
>>>>>>>> +           entry->len != ram_len)
>>>>>>>> +               goto not_found;
>>>>>>>> +
>>>>>>>> +       /* Same as btrfs_finish_ordered_io() */
>>>>>>>> +       btrfs_remove_ordered_extent(inode, entry);
>>>>>>>> +       btrfs_put_ordered_extent(entry);
>>>>>>>> +       btrfs_put_ordered_extent(entry);
>>>>>>>> +       return;
>>>>>>>> +
>>>>>>>> +not_found:
>>>>>>>> +       WARN_ON(1);
>>>>>>>> +       btrfs_err(root->fs_info,
>>>>>>>> +       "failed to find and clean ordered extent: root %llu ino %llu
>>>>>>>> file_offset %llu len %llu",
>>>>>>>> +                 root->objectid, btrfs_ino(inode), file_offset,
>>>>>>>> ram_len);
>>>>>>>> +       return;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>>  static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
>>>>>>>>  {
>>>>>>>>         struct btrfs_ordered_extent *ordered;
>>>>>>>> diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
>>>>>>>> index 5f2b0ca28705..7a989778aa89 100644
>>>>>>>> --- a/fs/btrfs/ordered-data.h
>>>>>>>> +++ b/fs/btrfs/ordered-data.h
>>>>>>>> @@ -163,6 +163,16 @@ btrfs_ordered_inode_tree_init(struct
>>>>>>>> btrfs_ordered_inode_tree *t)
>>>>>>>>  void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
>>>>>>>>  void btrfs_remove_ordered_extent(struct inode *inode,
>>>>>>>>                                 struct btrfs_ordered_extent *entry);
>>>>>>>> +
>>>>>>>> +/*
>>>>>>>> + * Function to cleanup an allocated ordered extent in error routine.
>>>>>>>> + *
>>>>>>>> + * As error handler in run_delalloc_range() will clear all related
>>>>>>>> pages
>>>>>>>> + * and skip their IO, we have no method to finish inserted ordered
>>>>>>>> extent.
>>>>>>>> + * So we must use this function to clean it up.
>>>>>>>> + */
>>>>>>>> +void btrfs_clean_ordered_extent(struct inode *inode, u64
>>>>>>>> file_offset,
>>>>>>>> +                               u64 ram_len);
>>>>>>>>  int btrfs_dec_test_ordered_pending(struct inode *inode,
>>>>>>>>                                    struct btrfs_ordered_extent
>>>>>>>> **cached,
>>>>>>>>                                    u64 file_offset, u64 io_size, int
>>>>>>>> uptodate);
>>>>>>>> --
>>>>>>>> 2.11.1
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> --
>>>>>>>> To unsubscribe from this list: send the line "unsubscribe
>>>>>>>> linux-btrfs"
>>>>>>>> in
>>>>>>>> the body of a message to majordomo@vger.kernel.org
>>>>>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>
>>>>>>
>>>>>
>>>>>
>>>>>
>>>>
>>>>
>>>
>>>
>>>
>>
>>
>
>
>


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4ac383a3a649..a14d1b0840c5 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3258,7 +3258,6 @@  static noinline_for_stack int writepage_delalloc(struct inode *inode,
 					       delalloc_end,
 					       &page_started,
 					       nr_written);
-		/* File system has been set read-only */
 		if (ret) {
 			SetPageError(page);
 			/* fill_delalloc should be return < 0 for error
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1e861a063721..3c3ade58afd7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1052,8 +1052,11 @@  static noinline int cow_file_range(struct inode *inode,
 		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
 			ret = btrfs_reloc_clone_csums(inode, start,
 						      cur_alloc_size);
-			if (ret)
+			if (ret) {
+				btrfs_clean_ordered_extent(inode, start,
+							   ram_size);
 				goto out_drop_extent_cache;
+			}
 		}
 
 		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
@@ -1538,7 +1541,7 @@  static noinline int run_delalloc_nocow(struct inode *inode,
 	if (!ret)
 		ret = err;
 
-	if (ret && cur_offset < end)
+	if (ret && cur_offset < end) {
 		extent_clear_unlock_delalloc(inode, cur_offset, end, end,
 					     locked_page, EXTENT_LOCKED |
 					     EXTENT_DELALLOC | EXTENT_DEFRAG |
@@ -1546,6 +1549,9 @@  static noinline int run_delalloc_nocow(struct inode *inode,
 					     PAGE_CLEAR_DIRTY |
 					     PAGE_SET_WRITEBACK |
 					     PAGE_END_WRITEBACK);
+		btrfs_clean_ordered_extent(inode, cur_offset,
+					   end - cur_offset + 1);
+	}
 	btrfs_free_path(path);
 	return ret;
 }
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 041c3326d109..dba1cf3464a7 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -650,6 +650,31 @@  void btrfs_remove_ordered_extent(struct inode *inode,
 	wake_up(&entry->wait);
 }
 
+void btrfs_clean_ordered_extent(struct inode *inode, u64 file_offset,
+				u64 ram_len)
+{
+	struct btrfs_ordered_extent *entry;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+
+	entry = btrfs_lookup_ordered_range(inode, file_offset, ram_len);
+	if (!entry || entry->file_offset != file_offset ||
+	    entry->len != ram_len)
+		goto not_found;
+
+	/* Same as btrfs_finish_ordered_io() */
+	btrfs_remove_ordered_extent(inode, entry);
+	btrfs_put_ordered_extent(entry);
+	btrfs_put_ordered_extent(entry);
+	return;
+
+not_found:
+	WARN_ON(1);
+	btrfs_err(root->fs_info,
+	"failed to find and clean ordered extent: root %llu ino %llu file_offset %llu len %llu",
+		  root->objectid, btrfs_ino(inode), file_offset, ram_len);
+	return;
+}
+
 static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
 {
 	struct btrfs_ordered_extent *ordered;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 5f2b0ca28705..7a989778aa89 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -163,6 +163,16 @@  btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
 void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
 void btrfs_remove_ordered_extent(struct inode *inode,
 				struct btrfs_ordered_extent *entry);
+
+/*
+ * Function to cleanup an allocated ordered extent in error routine.
+ *
+ * As error handler in run_delalloc_range() will clear all related pages
+ * and skip their IO, we have no method to finish inserted ordered extent.
+ * So we must use this function to clean it up.
+ */
+void btrfs_clean_ordered_extent(struct inode *inode, u64 file_offset,
+				u64 ram_len);
 int btrfs_dec_test_ordered_pending(struct inode *inode,
 				   struct btrfs_ordered_extent **cached,
 				   u64 file_offset, u64 io_size, int uptodate);