diff mbox series

[v5,1/2] writeback, cgroup: keep list of inodes attached to bdi_writeback

Message ID 20210526222557.3118114-2-guro@fb.com (mailing list archive)
State New, archived
Headers show
Series cgroup, blkcg: prevent dirty inodes to pin dying memory cgroups | expand

Commit Message

Roman Gushchin May 26, 2021, 10:25 p.m. UTC
Currently there is no way to iterate over inodes attached to a
specific cgwb structure. It limits the ability to efficiently
reclaim the writeback structure itself and associated memory and
block cgroup structures without scanning all inodes belonging to a sb,
which can be prohibitively expensive.

While dirty/in-active-writeback an inode belongs to one of the
bdi_writeback's io lists: b_dirty, b_io, b_more_io and b_dirty_time.
Once cleaned up, it's removed from all io lists. So the
inode->i_io_list can be reused to maintain the list of inodes,
attached to a bdi_writeback structure.

This patch introduces a new wb->b_attached list, which contains all
inodes which were dirty at least once and are attached to the given
cgwb. Inodes attached to the root bdi_writeback structures are never
placed on such list. The following patch will use this list to try to
release cgwbs structures more efficiently.

Suggested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Roman Gushchin <guro@fb.com>
---
 fs/fs-writeback.c                | 66 ++++++++++++++++++++++++--------
 include/linux/backing-dev-defs.h |  1 +
 include/linux/backing-dev.h      |  7 ++++
 include/linux/writeback.h        |  1 +
 mm/backing-dev.c                 |  2 +
 5 files changed, 60 insertions(+), 17 deletions(-)

Comments

Jan Kara May 27, 2021, 10:35 a.m. UTC | #1
On Wed 26-05-21 15:25:56, Roman Gushchin wrote:
> Currently there is no way to iterate over inodes attached to a
> specific cgwb structure. It limits the ability to efficiently
> reclaim the writeback structure itself and associated memory and
> block cgroup structures without scanning all inodes belonging to a sb,
> which can be prohibitively expensive.
> 
> While dirty/in-active-writeback an inode belongs to one of the
> bdi_writeback's io lists: b_dirty, b_io, b_more_io and b_dirty_time.
> Once cleaned up, it's removed from all io lists. So the
> inode->i_io_list can be reused to maintain the list of inodes,
> attached to a bdi_writeback structure.
> 
> This patch introduces a new wb->b_attached list, which contains all
> inodes which were dirty at least once and are attached to the given
> cgwb. Inodes attached to the root bdi_writeback structures are never
> placed on such list. The following patch will use this list to try to
> release cgwbs structures more efficiently.
> 
> Suggested-by: Jan Kara <jack@suse.cz>
> Signed-off-by: Roman Gushchin <guro@fb.com>

Looks good. Just some minor nits below:

> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index e91980f49388..631ef6366293 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -135,18 +135,23 @@ static bool inode_io_list_move_locked(struct inode *inode,
>   * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list
>   * @inode: inode to be removed
>   * @wb: bdi_writeback @inode is being removed from
> + * @final: inode is going to be freed and can't reappear on any IO list
>   *
>   * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
>   * clear %WB_has_dirty_io if all are empty afterwards.
>   */
>  static void inode_io_list_del_locked(struct inode *inode,
> -				     struct bdi_writeback *wb)
> +				     struct bdi_writeback *wb,
> +				     bool final)
>  {
>  	assert_spin_locked(&wb->list_lock);
>  	assert_spin_locked(&inode->i_lock);
>  
>  	inode->i_state &= ~I_SYNC_QUEUED;
> -	list_del_init(&inode->i_io_list);
> +	if (final)
> +		list_del_init(&inode->i_io_list);
> +	else
> +		inode_cgwb_move_to_attached(inode, wb);
>  	wb_io_lists_depopulated(wb);
>  }

With these changes the naming is actually somewhat confusing and the bool
argument makes it even worse. Looking into the code I'd just fold
inode_io_list_del_locked() into inode_io_list_del() and make it really
delete inode from all IO lists. There are currently three other
inode_io_list_del_locked() users:

requeue_inode(), writeback_single_inode() - these should just call
inode_cgwb_move_to_attached() unconditionally
(inode_cgwb_move_to_attached() just needs to clear I_SYNC_QUEUED and call
wb_io_lists_depopulated() in addition to what it currently does).

inode_switch_wbs_work_fn() - I don't think it needs
inode_io_list_del_locked() at all. See below...

> @@ -278,6 +283,25 @@ void __inode_attach_wb(struct inode *inode, struct page *page)
>  }
>  EXPORT_SYMBOL_GPL(__inode_attach_wb);
>  
> +/**
> + * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list
> + * @inode: inode of interest with i_lock held
> + * @wb: target bdi_writeback
> + *
> + * Remove the inode from wb's io lists and if necessarily put onto b_attached
> + * list.  Only inodes attached to cgwb's are kept on this list.
> + */
> +void inode_cgwb_move_to_attached(struct inode *inode, struct bdi_writeback *wb)
> +{
> +	assert_spin_locked(&wb->list_lock);
> +	assert_spin_locked(&inode->i_lock);
> +
> +	if (wb != &wb->bdi->wb)
> +		list_move(&inode->i_io_list, &wb->b_attached);
> +	else
> +		list_del_init(&inode->i_io_list);
> +}

I think this can be static and you can drop the declarations from header
files below. At least I wasn't able to find where this would be used
outside of fs/writeback.c.

>  /**
>   * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
>   * @inode: inode of interest with i_lock held
> @@ -419,21 +443,29 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
>  	wb_get(new_wb);
>  
>  	/*
> -	 * Transfer to @new_wb's IO list if necessary.  The specific list
> -	 * @inode was on is ignored and the inode is put on ->b_dirty which
> -	 * is always correct including from ->b_dirty_time.  The transfer
> -	 * preserves @inode->dirtied_when ordering.
> +	 * Transfer to @new_wb's IO list if necessary.  If the @inode is dirty,
> +	 * the specific list @inode was on is ignored and the @inode is put on
> +	 * ->b_dirty which is always correct including from ->b_dirty_time.
> +	 * The transfer preserves @inode->dirtied_when ordering.  If the @inode
> +	 * was clean, it means it was on the b_attached list, so move it onto
> +	 * the b_attached list of @new_wb.
>  	 */
>  	if (!list_empty(&inode->i_io_list)) {
> -		struct inode *pos;
> -
> -		inode_io_list_del_locked(inode, old_wb);
> +		inode_io_list_del_locked(inode, old_wb, true);

Do we need inode_io_list_del_locked() here at all? Below we are careful
enough to always use list_move() which does the deletion for us anyway.

>  		inode->i_wb = new_wb;
> -		list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
> -			if (time_after_eq(inode->dirtied_when,
> -					  pos->dirtied_when))
> -				break;
> -		inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev);
> +
> +		if (inode->i_state & I_DIRTY_ALL) {
> +			struct inode *pos;
> +
> +			list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
> +				if (time_after_eq(inode->dirtied_when,
> +						  pos->dirtied_when))
> +					break;
> +			inode_io_list_move_locked(inode, new_wb,
> +						  pos->i_io_list.prev);
> +		} else {
> +			inode_cgwb_move_to_attached(inode, new_wb);
> +		}

								Honza
Roman Gushchin May 27, 2021, 4:32 p.m. UTC | #2
On Thu, May 27, 2021 at 12:35:17PM +0200, Jan Kara wrote:
> On Wed 26-05-21 15:25:56, Roman Gushchin wrote:
> > Currently there is no way to iterate over inodes attached to a
> > specific cgwb structure. It limits the ability to efficiently
> > reclaim the writeback structure itself and associated memory and
> > block cgroup structures without scanning all inodes belonging to a sb,
> > which can be prohibitively expensive.
> > 
> > While dirty/in-active-writeback an inode belongs to one of the
> > bdi_writeback's io lists: b_dirty, b_io, b_more_io and b_dirty_time.
> > Once cleaned up, it's removed from all io lists. So the
> > inode->i_io_list can be reused to maintain the list of inodes,
> > attached to a bdi_writeback structure.
> > 
> > This patch introduces a new wb->b_attached list, which contains all
> > inodes which were dirty at least once and are attached to the given
> > cgwb. Inodes attached to the root bdi_writeback structures are never
> > placed on such list. The following patch will use this list to try to
> > release cgwbs structures more efficiently.
> > 
> > Suggested-by: Jan Kara <jack@suse.cz>
> > Signed-off-by: Roman Gushchin <guro@fb.com>
> 
> Looks good. Just some minor nits below:
> 
> > diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> > index e91980f49388..631ef6366293 100644
> > --- a/fs/fs-writeback.c
> > +++ b/fs/fs-writeback.c
> > @@ -135,18 +135,23 @@ static bool inode_io_list_move_locked(struct inode *inode,
> >   * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list
> >   * @inode: inode to be removed
> >   * @wb: bdi_writeback @inode is being removed from
> > + * @final: inode is going to be freed and can't reappear on any IO list
> >   *
> >   * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
> >   * clear %WB_has_dirty_io if all are empty afterwards.
> >   */
> >  static void inode_io_list_del_locked(struct inode *inode,
> > -				     struct bdi_writeback *wb)
> > +				     struct bdi_writeback *wb,
> > +				     bool final)
> >  {
> >  	assert_spin_locked(&wb->list_lock);
> >  	assert_spin_locked(&inode->i_lock);
> >  
> >  	inode->i_state &= ~I_SYNC_QUEUED;
> > -	list_del_init(&inode->i_io_list);
> > +	if (final)
> > +		list_del_init(&inode->i_io_list);
> > +	else
> > +		inode_cgwb_move_to_attached(inode, wb);
> >  	wb_io_lists_depopulated(wb);
> >  }
> 
> With these changes the naming is actually somewhat confusing and the bool
> argument makes it even worse. Looking into the code I'd just fold
> inode_io_list_del_locked() into inode_io_list_del() and make it really
> delete inode from all IO lists. There are currently three other
> inode_io_list_del_locked() users:

Yeah, good idea. Will do in the next version. Thanks!
diff mbox series

Patch

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e91980f49388..631ef6366293 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -135,18 +135,23 @@  static bool inode_io_list_move_locked(struct inode *inode,
  * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list
  * @inode: inode to be removed
  * @wb: bdi_writeback @inode is being removed from
+ * @final: inode is going to be freed and can't reappear on any IO list
  *
  * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
  * clear %WB_has_dirty_io if all are empty afterwards.
  */
 static void inode_io_list_del_locked(struct inode *inode,
-				     struct bdi_writeback *wb)
+				     struct bdi_writeback *wb,
+				     bool final)
 {
 	assert_spin_locked(&wb->list_lock);
 	assert_spin_locked(&inode->i_lock);
 
 	inode->i_state &= ~I_SYNC_QUEUED;
-	list_del_init(&inode->i_io_list);
+	if (final)
+		list_del_init(&inode->i_io_list);
+	else
+		inode_cgwb_move_to_attached(inode, wb);
 	wb_io_lists_depopulated(wb);
 }
 
@@ -278,6 +283,25 @@  void __inode_attach_wb(struct inode *inode, struct page *page)
 }
 EXPORT_SYMBOL_GPL(__inode_attach_wb);
 
+/**
+ * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list
+ * @inode: inode of interest with i_lock held
+ * @wb: target bdi_writeback
+ *
+ * Remove the inode from wb's io lists and if necessarily put onto b_attached
+ * list.  Only inodes attached to cgwb's are kept on this list.
+ */
+void inode_cgwb_move_to_attached(struct inode *inode, struct bdi_writeback *wb)
+{
+	assert_spin_locked(&wb->list_lock);
+	assert_spin_locked(&inode->i_lock);
+
+	if (wb != &wb->bdi->wb)
+		list_move(&inode->i_io_list, &wb->b_attached);
+	else
+		list_del_init(&inode->i_io_list);
+}
+
 /**
  * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
  * @inode: inode of interest with i_lock held
@@ -419,21 +443,29 @@  static void inode_switch_wbs_work_fn(struct work_struct *work)
 	wb_get(new_wb);
 
 	/*
-	 * Transfer to @new_wb's IO list if necessary.  The specific list
-	 * @inode was on is ignored and the inode is put on ->b_dirty which
-	 * is always correct including from ->b_dirty_time.  The transfer
-	 * preserves @inode->dirtied_when ordering.
+	 * Transfer to @new_wb's IO list if necessary.  If the @inode is dirty,
+	 * the specific list @inode was on is ignored and the @inode is put on
+	 * ->b_dirty which is always correct including from ->b_dirty_time.
+	 * The transfer preserves @inode->dirtied_when ordering.  If the @inode
+	 * was clean, it means it was on the b_attached list, so move it onto
+	 * the b_attached list of @new_wb.
 	 */
 	if (!list_empty(&inode->i_io_list)) {
-		struct inode *pos;
-
-		inode_io_list_del_locked(inode, old_wb);
+		inode_io_list_del_locked(inode, old_wb, true);
 		inode->i_wb = new_wb;
-		list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
-			if (time_after_eq(inode->dirtied_when,
-					  pos->dirtied_when))
-				break;
-		inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev);
+
+		if (inode->i_state & I_DIRTY_ALL) {
+			struct inode *pos;
+
+			list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
+				if (time_after_eq(inode->dirtied_when,
+						  pos->dirtied_when))
+					break;
+			inode_io_list_move_locked(inode, new_wb,
+						  pos->i_io_list.prev);
+		} else {
+			inode_cgwb_move_to_attached(inode, new_wb);
+		}
 	} else {
 		inode->i_wb = new_wb;
 	}
@@ -1124,7 +1156,7 @@  void inode_io_list_del(struct inode *inode)
 
 	wb = inode_to_wb_and_lock_list(inode);
 	spin_lock(&inode->i_lock);
-	inode_io_list_del_locked(inode, wb);
+	inode_io_list_del_locked(inode, wb, true);
 	spin_unlock(&inode->i_lock);
 	spin_unlock(&wb->list_lock);
 }
@@ -1437,7 +1469,7 @@  static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
 		inode->i_state &= ~I_SYNC_QUEUED;
 	} else {
 		/* The inode is clean. Remove from writeback lists. */
-		inode_io_list_del_locked(inode, wb);
+		inode_io_list_del_locked(inode, wb, false);
 	}
 }
 
@@ -1589,7 +1621,7 @@  static int writeback_single_inode(struct inode *inode,
 	 * responsible for the writeback lists.
 	 */
 	if (!(inode->i_state & I_DIRTY_ALL))
-		inode_io_list_del_locked(inode, wb);
+		inode_io_list_del_locked(inode, wb, false);
 	spin_unlock(&wb->list_lock);
 	inode_sync_complete(inode);
 out:
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index fff9367a6348..e5dc238ebe4f 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -154,6 +154,7 @@  struct bdi_writeback {
 	struct cgroup_subsys_state *blkcg_css; /* and blkcg */
 	struct list_head memcg_node;	/* anchored at memcg->cgwb_list */
 	struct list_head blkcg_node;	/* anchored at blkcg->cgwb_list */
+	struct list_head b_attached;	/* attached inodes, protected by list_lock */
 
 	union {
 		struct work_struct release_work;
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 44df4fcef65c..4256e66802e6 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -177,6 +177,7 @@  struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
 void wb_memcg_offline(struct mem_cgroup *memcg);
 void wb_blkcg_offline(struct blkcg *blkcg);
 int inode_congested(struct inode *inode, int cong_bits);
+void inode_cgwb_move_to_attached(struct inode *inode, struct bdi_writeback *wb);
 
 /**
  * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode
@@ -345,6 +346,12 @@  static inline bool inode_cgwb_enabled(struct inode *inode)
 	return false;
 }
 
+static inline void inode_cgwb_move_to_attached(struct inode *inode,
+					       struct bdi_writeback *wb)
+{
+	list_del_init(&inode->i_io_list);
+}
+
 static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
 {
 	return &bdi->wb;
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 8e5c5bb16e2d..572a13c40c90 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -212,6 +212,7 @@  static inline void wait_on_inode(struct inode *inode)
 #include <linux/bio.h>
 
 void __inode_attach_wb(struct inode *inode, struct page *page);
+void inode_cgwb_move_to_attached(struct inode *inode, struct bdi_writeback *wb);
 void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
 				 struct inode *inode)
 	__releases(&inode->i_lock);
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 576220acd686..54c5dc4b8c24 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -396,6 +396,7 @@  static void cgwb_release_workfn(struct work_struct *work)
 	fprop_local_destroy_percpu(&wb->memcg_completions);
 	percpu_ref_exit(&wb->refcnt);
 	wb_exit(wb);
+	WARN_ON_ONCE(!list_empty(&wb->b_attached));
 	kfree_rcu(wb, rcu);
 }
 
@@ -472,6 +473,7 @@  static int cgwb_create(struct backing_dev_info *bdi,
 
 	wb->memcg_css = memcg_css;
 	wb->blkcg_css = blkcg_css;
+	INIT_LIST_HEAD(&wb->b_attached);
 	INIT_WORK(&wb->release_work, cgwb_release_workfn);
 	set_bit(WB_registered, &wb->state);