diff mbox series

[v2] fs/fs-writeback: wait isw_nr_in_flight to be zero when umount

Message ID 20190416120902.18616-1-jiufei.xue@linux.alibaba.com (mailing list archive)
State New, archived
Headers show
Series [v2] fs/fs-writeback: wait isw_nr_in_flight to be zero when umount | expand

Commit Message

Jiufei Xue April 16, 2019, 12:09 p.m. UTC
synchronize_rcu() does not wait for call_rcu() callbacks, so an inode wb
switch may not yet have been queued to the workqueue when
synchronize_rcu() returns. Thus previously scheduled switches may not be
finished even after flushing the workqueue, which causes the NULL pointer
dereference shown below.

VFS: Busy inodes after unmount of vdd. Self-destruct in 5 seconds.
Have a nice day...
BUG: unable to handle kernel NULL pointer dereference at
0000000000000278
[<ffffffff8126a303>] evict+0xb3/0x180
[<ffffffff8126a760>] iput+0x1b0/0x230
[<ffffffff8127c690>] inode_switch_wbs_work_fn+0x3c0/0x6a0
[<ffffffff810a5b2e>] worker_thread+0x4e/0x490
[<ffffffff810a5ae0>] ? process_one_work+0x410/0x410
[<ffffffff810ac056>] kthread+0xe6/0x100
[<ffffffff8173c199>] ret_from_fork+0x39/0x50

Here I don't use rcu_barrier() because it would wait for all pending
RCU callbacks, which is not appropriate.

Changes since v1: use a per-sb s_isw_nr_in_flight counter to ensure that
s_isw_nr_in_flight will eventually reach zero.

Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com>
Cc: stable@kernel.org
---
 fs/fs-writeback.c         | 22 +++++++++++++++-------
 fs/super.c                |  3 ++-
 include/linux/fs.h        |  2 ++
 include/linux/writeback.h |  4 ++--
 4 files changed, 21 insertions(+), 10 deletions(-)

Comments

Tejun Heo April 16, 2019, 3:04 p.m. UTC | #1
Hello, Jiufei.

On Tue, Apr 16, 2019 at 08:09:02PM +0800, Jiufei Xue wrote:
> synchronize_rcu() didn't wait for call_rcu() callbacks, so inode wb
> switch may not go to the workqueue after synchronize_rcu(). Thus
> previous scheduled switches was not finished even flushing the
> workqueue, which will cause a NULL pointer dereferenced followed below.

Isn't all that's needed replacing the synchronize_rcu() call with a
rcu_barrier() call?

Thanks.
Jiufei Xue April 17, 2019, 1:04 a.m. UTC | #2
Hi Tejun,

On 2019/4/16 下午11:04, Tejun Heo wrote:
> Hello, Jiufei.
> 
> On Tue, Apr 16, 2019 at 08:09:02PM +0800, Jiufei Xue wrote:
>> synchronize_rcu() didn't wait for call_rcu() callbacks, so inode wb
>> switch may not go to the workqueue after synchronize_rcu(). Thus
>> previous scheduled switches was not finished even flushing the
>> workqueue, which will cause a NULL pointer dereferenced followed below.
> 
> Isn't all that's needed replacing the synchronize_rcu() call with a
> rcu_barrier() call?
>

Yes, it can be fixed if we replace synchronize_rcu() with rcu_barrier().
However, I'm worried that rcu_barrier() is too heavyweight, and we have
encountered hung tasks where rcu_barrier() was waiting for callbacks that
other drivers had queued but not handled correctly.

Thanks,
Jiufei

> Thanks.
>
Tejun Heo April 17, 2019, 7:33 p.m. UTC | #3
Hello,

On Wed, Apr 17, 2019 at 09:04:48AM +0800, Jiufei Xue wrote:
> Yes, it can be fixed if we replace synchronize_rcu() with rcu_barrier().
> However, I'm worried that rcu_barrier() is too heavyweight and we have
> encountered some hung tasks that rcu_barrier() waiting for callbacks that
> other drivers queued but not handled correctly.

rcu_barrier() waits for the pending callbacks to finish and none of the
callbacks can block, so I don't think it'd be much worse than
synchronize_rcu().  Also, it'd probably make sense to inc
isw_nr_in_flight after call_rcu() in inode_switch_wbs().  Given that
all inodes must be gone by umount, the actual race window isn't there
but that ordering still makes a lot more sense.

Thanks.
diff mbox series

Patch

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 36855c1f8daf..370ac3a872f8 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -237,7 +237,6 @@  static void wb_wait_for_completion(struct backing_dev_info *bdi,
 #define WB_FRN_HIST_MAX_SLOTS	(WB_FRN_HIST_THR_SLOTS / 2 + 1)
 					/* one round can affect upto 5 slots */
 
-static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
 static struct workqueue_struct *isw_wq;
 
 void __inode_attach_wb(struct inode *inode, struct page *page)
@@ -346,6 +345,7 @@  static void inode_switch_wbs_work_fn(struct work_struct *work)
 	struct inode_switch_wbs_context *isw =
 		container_of(work, struct inode_switch_wbs_context, work);
 	struct inode *inode = isw->inode;
+	struct super_block *sb = inode->i_sb;
 	struct backing_dev_info *bdi = inode_to_bdi(inode);
 	struct address_space *mapping = inode->i_mapping;
 	struct bdi_writeback *old_wb = inode->i_wb;
@@ -456,7 +456,7 @@  static void inode_switch_wbs_work_fn(struct work_struct *work)
 	iput(inode);
 	kfree(isw);
 
-	atomic_dec(&isw_nr_in_flight);
+	atomic_dec(&sb->s_isw_nr_in_flight);
 }
 
 static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
@@ -479,6 +479,7 @@  static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
  */
 static void inode_switch_wbs(struct inode *inode, int new_wb_id)
 {
+	struct super_block *sb = inode->i_sb;
 	struct backing_dev_info *bdi = inode_to_bdi(inode);
 	struct cgroup_subsys_state *memcg_css;
 	struct inode_switch_wbs_context *isw;
@@ -523,7 +524,7 @@  static void inode_switch_wbs(struct inode *inode, int new_wb_id)
 
 	isw->inode = inode;
 
-	atomic_inc(&isw_nr_in_flight);
+	atomic_inc(&sb->s_isw_nr_in_flight);
 
 	/*
 	 * In addition to synchronizing among switchers, I_WB_SWITCH tells
@@ -898,12 +899,19 @@  static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
  * rare occurrences and synchronize_rcu() can take a while, perform
  * flushing iff wb switches are in flight.
  */
-void cgroup_writeback_umount(void)
+void cgroup_writeback_umount(struct super_block *sb)
 {
-	if (atomic_read(&isw_nr_in_flight)) {
-		synchronize_rcu();
+	if (!atomic_read(&sb->s_isw_nr_in_flight))
+		return;
+
+	synchronize_rcu();
+
+	/*
+	 * Now no more switches can be queued for this filesystem, just
+	 * wait for the in-flight switches to finish.
+	 */
+	while (atomic_read(&sb->s_isw_nr_in_flight))
 		flush_workqueue(isw_wq);
-	}
 }
 
 static int __init cgroup_writeback_init(void)
diff --git a/fs/super.c b/fs/super.c
index 583a0124bc39..3d5ebf60b4ee 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -248,6 +248,7 @@  static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	spin_lock_init(&s->s_inode_list_lock);
 	INIT_LIST_HEAD(&s->s_inodes_wb);
 	spin_lock_init(&s->s_inode_wblist_lock);
+	atomic_set(&s->s_isw_nr_in_flight, 0);
 
 	s->s_count = 1;
 	atomic_set(&s->s_active, 1);
@@ -445,7 +446,7 @@  void generic_shutdown_super(struct super_block *sb)
 		sb->s_flags &= ~SB_ACTIVE;
 
 		fsnotify_sb_delete(sb);
-		cgroup_writeback_umount();
+		cgroup_writeback_umount(sb);
 
 		evict_inodes(sb);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index dd28e7679089..4e437e2723b9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1519,6 +1519,8 @@  struct super_block {
 
 	spinlock_t		s_inode_wblist_lock;
 	struct list_head	s_inodes_wb;	/* writeback inodes */
+
+	atomic_t                s_isw_nr_in_flight;
 } __randomize_layout;
 
 /* Helper functions so that in most cases filesystems will
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 738a0c24874f..982299c92402 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -190,7 +190,7 @@  void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
 void wbc_detach_inode(struct writeback_control *wbc);
 void wbc_account_io(struct writeback_control *wbc, struct page *page,
 		    size_t bytes);
-void cgroup_writeback_umount(void);
+void cgroup_writeback_umount(struct super_block *sb);
 
 /**
  * inode_attach_wb - associate an inode with its wb
@@ -296,7 +296,7 @@  static inline void wbc_account_io(struct writeback_control *wbc,
 {
 }
 
-static inline void cgroup_writeback_umount(void)
+static inline void cgroup_writeback_umount(struct super_block *sb)
 {
 }