bdi: Fix another oops in wb_workfn()

On 2018/06/13 0:57, Jan Kara wrote:
> On Mon 11-06-18 10:20:53, Tejun Heo wrote:
>> Hello,
>>
>> On Mon, Jun 11, 2018 at 06:29:20PM +0200, Jan Kara wrote:
>>>> Would something like the following work or am I missing the point
>>>> entirely?
>>>
>>> I was pondering the same solution for a while but I think it won't work.
>>> The problem is that e.g. wb_memcg_offline() could have already removed
>>> wb from the radix tree but it is still pending in bdi->wb_list
>>> (wb_shutdown() has not run yet) and so we'd drop reference we didn't get.
>>
>> Yeah, right, so the root cause is that we're walking the wb_list while
>> holding lock and expecting the object to stay there even after lock is
>> released.  Hmm... we can use a mutex to synchronize the two
>> destruction paths.  It's not like they're hot paths anyway.
> 
> Hmm, do you mean like having a per-bdi or even a global mutex that would
> protect whole wb_shutdown()? Yes, that should work and we could get rid of
> WB_shutting_down bit as well with that. Just it seems a bit strange to
> introduce a mutex only to synchronize these two shutdown paths - usually
> locks protect data structures and in this case we have cgwb_lock for
> that so it looks like a duplication from a first look.
> 

Can't we utilize an RCU grace period (as shown below)?

If wb_shutdown(wb) called from cgwb_release_workfn() runs before wb_shutdown(wb) called from cgwb_bdi_unregister():

  cgwb_bdi_unregister(bdi)                     cgwb_release_workfn(work)

                                                 wb = container_of(work, struct bdi_writeback, release_work);
    spin_lock_irq(&cgwb_lock);
    wb = list_first_entry(&bdi->wb_list, struct bdi_writeback, bdi_node); /* Same wb here */
    rcu_read_lock(); /* Prevent kfree_rcu() from invoking kfree() */
    spin_unlock_irq(&cgwb_lock);
                                                 wb_shutdown(wb);
                                                   spin_lock_bh(&wb->work_lock);
                                                   !test_and_clear_bit(WB_registered, &wb->state) is "false".
                                                   set_bit(WB_shutting_down, &wb->state);
                                                   spin_unlock_bh(&wb->work_lock);
                                                   mod_delayed_work(bdi_wq, &wb->dwork, 0);
                                                   flush_delayed_work(&wb->dwork);
                                                   cgwb_remove_from_bdi_list(wb);
                                                     spin_lock_irq(&cgwb_lock);
                                                     list_del_rcu(&wb->bdi_node);
                                                     spin_unlock_irq(&cgwb_lock);
                                                 wb_exit(wb);
                                                 kfree_rcu(wb, rcu); /* Won't call kfree() because of rcu_read_lock() */
    wb_shutdown(wb);
      spin_lock_bh(&wb->work_lock); /* Safe to access because kfree() cannot be called */
      !test_and_clear_bit(WB_registered, &wb->state) is "true".
      spin_unlock_bh(&wb->work_lock);
      rcu_read_unlock();
                                                   kfree(wb);
      schedule_timeout_uninterruptible(HZ / 10); /* Wait briefly in case list_del_rcu() has not been called yet */
    spin_lock_irq(&cgwb_lock);
    wb = list_first_entry(&bdi->wb_list, struct bdi_writeback, bdi_node); /* Different wb if list_del_rcu() was already called, same wb otherwise */
    rcu_read_lock(); /* Prevent kfree_rcu() from invoking kfree() if still same wb here */
    spin_unlock_irq(&cgwb_lock);

If wb_shutdown(wb) called from cgwb_bdi_unregister() runs before wb_shutdown(wb) called from cgwb_release_workfn():

  cgwb_bdi_unregister(bdi)                     cgwb_release_workfn(work)

                                                 wb = container_of(work, struct bdi_writeback, release_work);
    spin_lock_irq(&cgwb_lock);
    wb = list_first_entry(&bdi->wb_list, struct bdi_writeback, bdi_node); /* Same wb here */
    rcu_read_lock();
    spin_unlock_irq(&cgwb_lock);
    wb_shutdown(wb);
      spin_lock_bh(&wb->work_lock);
      !test_and_clear_bit(WB_registered, &wb->state) is "false".
      set_bit(WB_shutting_down, &wb->state);
      spin_unlock_bh(&wb->work_lock);
      rcu_read_unlock();
      mod_delayed_work(bdi_wq, &wb->dwork, 0);
      flush_delayed_work(&wb->dwork);
      cgwb_remove_from_bdi_list(wb);
        spin_lock_irq(&cgwb_lock);
        list_del_rcu(&wb->bdi_node);
        spin_unlock_irq(&cgwb_lock);
    spin_lock_irq(&cgwb_lock);
    wb = list_first_entry(&bdi->wb_list, struct bdi_writeback, bdi_node); /* Different wb here */
    rcu_read_lock();
    spin_unlock_irq(&cgwb_lock);
                                                 wb_shutdown(wb);
                                                   spin_lock_bh(&wb->work_lock); /* Safe to access because kfree() cannot be called */
                                                   !test_and_clear_bit(WB_registered, &wb->state) is "true".
                                                   spin_unlock_bh(&wb->work_lock);
                                                 wb_exit(wb);
                                                 kfree_rcu(wb, rcu);

 mm/backing-dev.c | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

--

bdi: Fix another oops in wb_workfn()

Commit Message

Comments

Patch