
[v4,07/11] mm: memcontrol: make all the callers of {folio,page}_memcg() safe

Message ID 20220524060551.80037-8-songmuchun@bytedance.com (mailing list archive)
State New
Series Use obj_cgroup APIs to charge the LRU pages

Commit Message

Muchun Song May 24, 2022, 6:05 a.m. UTC
When we use the objcg APIs to charge LRU pages, a page no longer holds
a reference to the memcg associated with it. So a caller of
{folio,page}_memcg() must hold an rcu read lock, or obtain a reference
to the memcg associated with the page, to protect the memcg from being
released. Introduce get_mem_cgroup_from_{page,folio}() to obtain such a
reference to the memory cgroup associated with a page.

In this patch, make all the callers hold an rcu read lock or obtain a
reference to the memcg, so that the memcg cannot be released while the
LRU pages are being reparented.

We do not need to adjust the callers of {folio,page}_memcg() during the
whole process of mem_cgroup_move_task(), because cgroup migration and
memory cgroup offlining are serialized by @cgroup_mutex. In this routine,
the LRU pages cannot be reparented to their parent memory cgroup, so the
memcg returned by {folio,page}_memcg() is stable and cannot be released.

This is a preparation for reparenting the LRU pages.
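
To illustrate, the two safe patterns look roughly like this (do_something()
is only a placeholder for whatever uses the memcg):

	/* Pattern 1: pin the memcg if it has to outlive the critical section. */
	memcg = get_mem_cgroup_from_page(page);	/* takes a css reference */
	if (memcg)
		do_something(memcg);
	mem_cgroup_put(memcg);			/* drop the reference */

	/* Pattern 2: an rcu read lock is enough for short, non-sleeping uses. */
	rcu_read_lock();
	memcg = page_memcg(page);
	if (memcg)
		do_something(memcg);
	rcu_read_unlock();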

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 fs/buffer.c                      |  4 +--
 fs/fs-writeback.c                | 23 ++++++++--------
 include/linux/memcontrol.h       | 51 ++++++++++++++++++++++++++++++++---
 include/trace/events/writeback.h |  5 ++++
 mm/memcontrol.c                  | 58 ++++++++++++++++++++++++++++++----------
 mm/migrate.c                     |  4 +++
 mm/page_io.c                     |  5 ++--
 7 files changed, 117 insertions(+), 33 deletions(-)

Comments

Roman Gushchin May 25, 2022, 3:03 a.m. UTC | #1
On Tue, May 24, 2022 at 02:05:47PM +0800, Muchun Song wrote:
> When we use objcg APIs to charge the LRU pages, the page will not hold
> a reference to the memcg associated with the page. So the caller of the
> {folio,page}_memcg() should hold an rcu read lock or obtain a reference
> to the memcg associated with the page to protect memcg from being
> released. So introduce get_mem_cgroup_from_{page,folio}() to obtain a
> reference to the memory cgroup associated with the page.
> 
> In this patch, make all the callers hold an rcu read lock or obtain a
> reference to the memcg to protect memcg from being released when the LRU
> pages reparented.
> 
> We do not need to adjust the callers of {folio,page}_memcg() during
> the whole process of mem_cgroup_move_task(). Because the cgroup migration
> and memory cgroup offlining are serialized by @cgroup_mutex. In this
> routine, the LRU pages cannot be reparented to its parent memory cgroup.
> So {folio,page}_memcg() is stable and cannot be released.
> 
> This is a preparation for reparenting the LRU pages.
> 
> Signed-off-by: Muchun Song <songmuchun@bytedance.com>

Overall the patch looks good to me (some nits below).

> ---
>  fs/buffer.c                      |  4 +--
>  fs/fs-writeback.c                | 23 ++++++++--------
>  include/linux/memcontrol.h       | 51 ++++++++++++++++++++++++++++++++---
>  include/trace/events/writeback.h |  5 ++++
>  mm/memcontrol.c                  | 58 ++++++++++++++++++++++++++++++----------
>  mm/migrate.c                     |  4 +++
>  mm/page_io.c                     |  5 ++--
>  7 files changed, 117 insertions(+), 33 deletions(-)
> 
> diff --git a/fs/buffer.c b/fs/buffer.c
> index 2b5561ae5d0b..80975a457670 100644
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -819,8 +819,7 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
>  	if (retry)
>  		gfp |= __GFP_NOFAIL;
>  
> -	/* The page lock pins the memcg */
> -	memcg = page_memcg(page);
> +	memcg = get_mem_cgroup_from_page(page);

Looking at these changes I wonder if we need to remove unsafe getters or
at least add a BOLD comment on how/when it's safe to use them.
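
E.g. something along these lines above the getter (wording is only a sketch):

	/*
	 * NOTE: once LRU pages are charged via objcg, the memcg returned by
	 * page_memcg()/folio_memcg() is only stable inside an RCU read-side
	 * critical section (or with another pinning guarantee such as a css
	 * reference or folio_memcg_lock()); use get_mem_cgroup_from_{page,folio}()
	 * if the memcg has to outlive that section.
	 */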

>  	old_memcg = set_active_memcg(memcg);
>  
>  	head = NULL;
> @@ -840,6 +839,7 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
>  		set_bh_page(bh, page, offset);
>  	}
>  out:
> +	mem_cgroup_put(memcg);
>  	set_active_memcg(old_memcg);
>  	return head;
>  /*
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index 1fae0196292a..56612ace8778 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -243,15 +243,13 @@ void __inode_attach_wb(struct inode *inode, struct page *page)
>  	if (inode_cgwb_enabled(inode)) {
>  		struct cgroup_subsys_state *memcg_css;
>  
> -		if (page) {
> -			memcg_css = mem_cgroup_css_from_page(page);
> -			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
> -		} else {
> -			/* must pin memcg_css, see wb_get_create() */
> +		/* must pin memcg_css, see wb_get_create() */
> +		if (page)
> +			memcg_css = get_mem_cgroup_css_from_page(page);
> +		else
>  			memcg_css = task_get_css(current, memory_cgrp_id);
> -			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
> -			css_put(memcg_css);
> -		}
> +		wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
> +		css_put(memcg_css);
>  	}
>  
>  	if (!wb)
> @@ -868,16 +866,16 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
>  	if (!wbc->wb || wbc->no_cgroup_owner)
>  		return;
>  
> -	css = mem_cgroup_css_from_page(page);
> +	css = get_mem_cgroup_css_from_page(page);
>  	/* dead cgroups shouldn't contribute to inode ownership arbitration */
>  	if (!(css->flags & CSS_ONLINE))
> -		return;
> +		goto out;
>  
>  	id = css->id;
>  
>  	if (id == wbc->wb_id) {
>  		wbc->wb_bytes += bytes;
> -		return;
> +		goto out;
>  	}
>  
>  	if (id == wbc->wb_lcand_id)
> @@ -890,6 +888,9 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
>  		wbc->wb_tcand_bytes += bytes;
>  	else
>  		wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
> +
> +out:
> +	css_put(css);
>  }
>  EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);
>  
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 8c2f1ba2f471..3a0e2592434e 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -373,7 +373,7 @@ static inline bool folio_memcg_kmem(struct folio *folio);
>   * a valid memcg, but can be atomically swapped to the parent memcg.
>   *
>   * The caller must ensure that the returned memcg won't be released:
> - * e.g. acquire the rcu_read_lock or css_set_lock.
> + * e.g. acquire the rcu_read_lock or objcg_lock or cgroup_mutex.
>   */
>  static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
>  {
> @@ -454,7 +454,37 @@ static inline struct mem_cgroup *page_memcg(struct page *page)
>  	return folio_memcg(page_folio(page));
>  }
>  
> -/**
> +/*
> + * get_mem_cgroup_from_folio - Obtain a reference on the memory cgroup
> + *			       associated with a folio.
> + * @folio: Pointer to the folio.
> + *
> + * Returns a pointer to the memory cgroup (and obtain a reference on it)
> + * associated with the folio, or NULL. This function assumes that the
> + * folio is known to have a proper memory cgroup pointer. It's not safe
> + * to call this function against some type of pages, e.g. slab pages or
> + * ex-slab pages.
> + */
> +static inline struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
> +{
> +	struct mem_cgroup *memcg;
> +
> +	rcu_read_lock();
> +retry:
> +	memcg = folio_memcg(folio);
> +	if (unlikely(memcg && !css_tryget(&memcg->css)))
> +		goto retry;
> +	rcu_read_unlock();
> +
> +	return memcg;
> +}
> +
> +static inline struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
> +{
> +	return get_mem_cgroup_from_folio(page_folio(page));
> +}
> +
> +/*
>   * folio_memcg_rcu - Locklessly get the memory cgroup associated with a folio.
>   * @folio: Pointer to the folio.
>   *
> @@ -873,7 +903,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
>  	return match;
>  }
>  
> -struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
> +struct cgroup_subsys_state *get_mem_cgroup_css_from_page(struct page *page);
>  ino_t page_cgroup_ino(struct page *page);
>  
>  static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
> @@ -1047,10 +1077,13 @@ static inline void count_memcg_events(struct mem_cgroup *memcg,
>  static inline void count_memcg_page_event(struct page *page,
>  					  enum vm_event_item idx)
>  {
> -	struct mem_cgroup *memcg = page_memcg(page);
> +	struct mem_cgroup *memcg;
>  
> +	rcu_read_lock();
> +	memcg = page_memcg(page);
>  	if (memcg)
>  		count_memcg_events(memcg, idx, 1);
> +	rcu_read_unlock();
>  }
>  
>  static inline void count_memcg_event_mm(struct mm_struct *mm,
> @@ -1129,6 +1162,16 @@ static inline struct mem_cgroup *page_memcg(struct page *page)
>  	return NULL;
>  }
>  
> +static inline struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
> +{
> +	return NULL;
> +}
> +
> +static inline struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
> +{
> +	return NULL;
> +}
> +
>  static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
>  {
>  	WARN_ON_ONCE(!rcu_read_lock_held());
> diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
> index 86b2a82da546..cdb822339f13 100644
> --- a/include/trace/events/writeback.h
> +++ b/include/trace/events/writeback.h
> @@ -258,6 +258,11 @@ TRACE_EVENT(track_foreign_dirty,
>  		__entry->ino		= inode ? inode->i_ino : 0;
>  		__entry->memcg_id	= wb->memcg_css->id;
>  		__entry->cgroup_ino	= __trace_wb_assign_cgroup(wb);
> +		/*
> +		 * TP_fast_assign() is under preemption disabled which can
> +		 * serve as an RCU read-side critical section so that the
> +		 * memcg returned by folio_memcg() cannot be freed.
> +		 */
>  		__entry->page_cgroup_ino = cgroup_ino(folio_memcg(folio)->css.cgroup);
>  	),
>  
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index b38a77f6696f..dcaf6cf5dc74 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -371,7 +371,7 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key);
>  #endif
>  
>  /**
> - * mem_cgroup_css_from_page - css of the memcg associated with a page
> + * get_mem_cgroup_css_from_page - get css of the memcg associated with a page
>   * @page: page of interest
>   *
>   * If memcg is bound to the default hierarchy, css of the memcg associated
> @@ -381,13 +381,15 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key);
>   * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
>   * is returned.
>   */
> -struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
> +struct cgroup_subsys_state *get_mem_cgroup_css_from_page(struct page *page)
>  {
>  	struct mem_cgroup *memcg;
>  
> -	memcg = page_memcg(page);
> +	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
> +		return &root_mem_cgroup->css;
>  
> -	if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
> +	memcg = get_mem_cgroup_from_page(page);
> +	if (!memcg)
>  		memcg = root_mem_cgroup;
>  
>  	return &memcg->css;
> @@ -770,13 +772,13 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
>  void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
>  			     int val)
>  {
> -	struct page *head = compound_head(page); /* rmap on tail pages */
> +	struct folio *folio = page_folio(page); /* rmap on tail pages */
>  	struct mem_cgroup *memcg;
>  	pg_data_t *pgdat = page_pgdat(page);
>  	struct lruvec *lruvec;
>  
>  	rcu_read_lock();
> -	memcg = page_memcg(head);
> +	memcg = folio_memcg(folio);
>  	/* Untracked pages have no memcg, no lruvec. Update only the node */
>  	if (!memcg) {
>  		rcu_read_unlock();
> @@ -2058,7 +2060,9 @@ void folio_memcg_lock(struct folio *folio)
>  	 * The RCU lock is held throughout the transaction.  The fast
>  	 * path can get away without acquiring the memcg->move_lock
>  	 * because page moving starts with an RCU grace period.
> -         */
> +	 *
> +	 * The RCU lock also protects the memcg from being freed.
> +	 */
>  	rcu_read_lock();
>  
>  	if (mem_cgroup_disabled())
> @@ -3296,7 +3300,7 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
>  void split_page_memcg(struct page *head, unsigned int nr)
>  {
>  	struct folio *folio = page_folio(head);
> -	struct mem_cgroup *memcg = folio_memcg(folio);
> +	struct mem_cgroup *memcg = get_mem_cgroup_from_folio(folio);
>  	int i;
>  
>  	if (mem_cgroup_disabled() || !memcg)
> @@ -3309,6 +3313,8 @@ void split_page_memcg(struct page *head, unsigned int nr)
>  		obj_cgroup_get_many(__folio_objcg(folio), nr - 1);
>  	else
>  		css_get_many(&memcg->css, nr - 1);
> +
> +	css_put(&memcg->css);
>  }
>  
>  #ifdef CONFIG_MEMCG_SWAP
> @@ -4511,7 +4517,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
>  void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
>  					     struct bdi_writeback *wb)
>  {
> -	struct mem_cgroup *memcg = folio_memcg(folio);
> +	struct mem_cgroup *memcg = get_mem_cgroup_from_folio(folio);
>  	struct memcg_cgwb_frn *frn;
>  	u64 now = get_jiffies_64();
>  	u64 oldest_at = now;
> @@ -4558,6 +4564,7 @@ void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
>  		frn->memcg_id = wb->memcg_css->id;
>  		frn->at = now;
>  	}
> +	css_put(&memcg->css);
>  }
>  
>  /* issue foreign writeback flushes for recorded foreign dirtying events */
> @@ -6092,6 +6099,14 @@ static void mem_cgroup_move_charge(void)
>  	atomic_dec(&mc.from->moving_account);
>  }
>  
> +/*
> + * The cgroup migration and memory cgroup offlining are serialized by
> + * @cgroup_mutex. If we reach here, it means that the LRU pages cannot
> + * be reparented to its parent memory cgroup. So during the whole process
> + * of mem_cgroup_move_task(), page_memcg(page) is stable. So we do not
> + * need to worry about the memcg (returned from page_memcg()) being
> + * released even if we do not hold an rcu read lock.
> + */
>  static void mem_cgroup_move_task(void)
>  {
>  	if (mc.to) {
> @@ -6895,7 +6910,7 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
>  	if (folio_memcg(new))
>  		return;
>  
> -	memcg = folio_memcg(old);
> +	memcg = get_mem_cgroup_from_folio(old);
>  	VM_WARN_ON_ONCE_FOLIO(!memcg, old);
>  	if (!memcg)
>  		return;
> @@ -6914,6 +6929,8 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
>  	mem_cgroup_charge_statistics(memcg, nr_pages);
>  	memcg_check_events(memcg, folio_nid(new));
>  	local_irq_restore(flags);
> +
> +	css_put(&memcg->css);
>  }
>  
>  DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
> @@ -7100,6 +7117,10 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
>  	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
>  		return;
>  
> +	/*
> +	 * Interrupts should be disabled by the caller (see the comments below),
> +	 * which can serve as RCU read-side critical sections.
> +	 */
>  	memcg = folio_memcg(folio);
>  
>  	VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
> @@ -7165,15 +7186,16 @@ int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
>  	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
>  		return 0;
>  
> +	rcu_read_lock();
>  	memcg = page_memcg(page);
>  
>  	VM_WARN_ON_ONCE_PAGE(!memcg, page);
>  	if (!memcg)
> -		return 0;
> +		goto out;
>  
>  	if (!entry.val) {
>  		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
> -		return 0;
> +		goto out;
>  	}
>  
>  	memcg = mem_cgroup_id_get_online(memcg);
> @@ -7183,6 +7205,7 @@ int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
>  		memcg_memory_event(memcg, MEMCG_SWAP_MAX);
>  		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
>  		mem_cgroup_id_put(memcg);
> +		rcu_read_unlock();
>  		return -ENOMEM;

If you add the "out" label, please use it here too.

>  	}
>  
> @@ -7192,6 +7215,8 @@ int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
>  	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
>  	VM_BUG_ON_PAGE(oldid, page);
>  	mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
> +out:
> +	rcu_read_unlock();
>  
>  	return 0;
>  }
> @@ -7246,17 +7271,22 @@ bool mem_cgroup_swap_full(struct page *page)
>  	if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
>  		return false;
>  
> +	rcu_read_lock();
>  	memcg = page_memcg(page);
>  	if (!memcg)
> -		return false;
> +		goto out;
>  
>  	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
>  		unsigned long usage = page_counter_read(&memcg->swap);
>  
>  		if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
> -		    usage * 2 >= READ_ONCE(memcg->swap.max))
> +		    usage * 2 >= READ_ONCE(memcg->swap.max)) {
> +			rcu_read_unlock();
>  			return true;

Please, make something like
	ret = true;
	goto out;
here. It will be more consistent.

> +		}
>  	}
> +out:
> +	rcu_read_unlock();
>  
>  	return false;
>  }
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 6c31ee1e1c9b..59e97a8a64a0 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -430,6 +430,10 @@ int folio_migrate_mapping(struct address_space *mapping,
>  		struct lruvec *old_lruvec, *new_lruvec;
>  		struct mem_cgroup *memcg;
>  
> +		/*
> +		 * Irq is disabled, which can serve as RCU read-side critical
> +		 * sections.
> +		 */
>  		memcg = folio_memcg(folio);
>  		old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
>  		new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
> diff --git a/mm/page_io.c b/mm/page_io.c
> index 89fbf3cae30f..a0d9cd68e87a 100644
> --- a/mm/page_io.c
> +++ b/mm/page_io.c
> @@ -221,13 +221,14 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
>  	struct cgroup_subsys_state *css;
>  	struct mem_cgroup *memcg;
>  
> +	rcu_read_lock();
>  	memcg = page_memcg(page);
>  	if (!memcg)
> -		return;
> +		goto out;
>  
> -	rcu_read_lock();
>  	css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys);
>  	bio_associate_blkg_from_css(bio, css);
> +out:
>  	rcu_read_unlock();
>  }
>  #else
> -- 
> 2.11.0
>
Muchun Song May 25, 2022, 11:51 a.m. UTC | #2
On Tue, May 24, 2022 at 08:03:41PM -0700, Roman Gushchin wrote:
> On Tue, May 24, 2022 at 02:05:47PM +0800, Muchun Song wrote:
> > When we use objcg APIs to charge the LRU pages, the page will not hold
> > a reference to the memcg associated with the page. So the caller of the
> > {folio,page}_memcg() should hold an rcu read lock or obtain a reference
> > to the memcg associated with the page to protect memcg from being
> > released. So introduce get_mem_cgroup_from_{page,folio}() to obtain a
> > reference to the memory cgroup associated with the page.
> > 
> > In this patch, make all the callers hold an rcu read lock or obtain a
> > reference to the memcg to protect memcg from being released when the LRU
> > pages reparented.
> > 
> > We do not need to adjust the callers of {folio,page}_memcg() during
> > the whole process of mem_cgroup_move_task(). Because the cgroup migration
> > and memory cgroup offlining are serialized by @cgroup_mutex. In this
> > routine, the LRU pages cannot be reparented to its parent memory cgroup.
> > So {folio,page}_memcg() is stable and cannot be released.
> > 
> > This is a preparation for reparenting the LRU pages.
> > 
> > Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> 
> Overall the patch looks good to me (some nits below).
> 
> > ---
> >  fs/buffer.c                      |  4 +--
> >  fs/fs-writeback.c                | 23 ++++++++--------
> >  include/linux/memcontrol.h       | 51 ++++++++++++++++++++++++++++++++---
> >  include/trace/events/writeback.h |  5 ++++
> >  mm/memcontrol.c                  | 58 ++++++++++++++++++++++++++++++----------
> >  mm/migrate.c                     |  4 +++
> >  mm/page_io.c                     |  5 ++--
> >  7 files changed, 117 insertions(+), 33 deletions(-)
> > 
> > diff --git a/fs/buffer.c b/fs/buffer.c
> > index 2b5561ae5d0b..80975a457670 100644
> > --- a/fs/buffer.c
> > +++ b/fs/buffer.c
> > @@ -819,8 +819,7 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
> >  	if (retry)
> >  		gfp |= __GFP_NOFAIL;
> >  
> > -	/* The page lock pins the memcg */
> > -	memcg = page_memcg(page);
> > +	memcg = get_mem_cgroup_from_page(page);
> 
> Looking at these changes I wonder if we need to remove unsafe getters or
> at least add a BOLD comment on how/when it's safe to use them.
>

I am not clear here. Do you mean we should add some comments above
page_memcg(), or above get_mem_cgroup_from_page()?

> >  	old_memcg = set_active_memcg(memcg);
> >  
> >  	head = NULL;
> > @@ -840,6 +839,7 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
> >  		set_bh_page(bh, page, offset);
> >  	}
> >  out:
> > +	mem_cgroup_put(memcg);
> >  	set_active_memcg(old_memcg);
> >  	return head;
> >  /*
> > diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> > index 1fae0196292a..56612ace8778 100644
> > --- a/fs/fs-writeback.c
> > +++ b/fs/fs-writeback.c
> > @@ -243,15 +243,13 @@ void __inode_attach_wb(struct inode *inode, struct page *page)
> >  	if (inode_cgwb_enabled(inode)) {
> >  		struct cgroup_subsys_state *memcg_css;
> >  
> > -		if (page) {
> > -			memcg_css = mem_cgroup_css_from_page(page);
> > -			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
> > -		} else {
> > -			/* must pin memcg_css, see wb_get_create() */
> > +		/* must pin memcg_css, see wb_get_create() */
> > +		if (page)
> > +			memcg_css = get_mem_cgroup_css_from_page(page);
> > +		else
> >  			memcg_css = task_get_css(current, memory_cgrp_id);
> > -			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
> > -			css_put(memcg_css);
> > -		}
> > +		wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
> > +		css_put(memcg_css);
> >  	}
> >  
> >  	if (!wb)
> > @@ -868,16 +866,16 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
> >  	if (!wbc->wb || wbc->no_cgroup_owner)
> >  		return;
> >  
> > -	css = mem_cgroup_css_from_page(page);
> > +	css = get_mem_cgroup_css_from_page(page);
> >  	/* dead cgroups shouldn't contribute to inode ownership arbitration */
> >  	if (!(css->flags & CSS_ONLINE))
> > -		return;
> > +		goto out;
> >  
> >  	id = css->id;
> >  
> >  	if (id == wbc->wb_id) {
> >  		wbc->wb_bytes += bytes;
> > -		return;
> > +		goto out;
> >  	}
> >  
> >  	if (id == wbc->wb_lcand_id)
> > @@ -890,6 +888,9 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
> >  		wbc->wb_tcand_bytes += bytes;
> >  	else
> >  		wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
> > +
> > +out:
> > +	css_put(css);
> >  }
> >  EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);
> >  
> > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > index 8c2f1ba2f471..3a0e2592434e 100644
> > --- a/include/linux/memcontrol.h
> > +++ b/include/linux/memcontrol.h
> > @@ -373,7 +373,7 @@ static inline bool folio_memcg_kmem(struct folio *folio);
> >   * a valid memcg, but can be atomically swapped to the parent memcg.
> >   *
> >   * The caller must ensure that the returned memcg won't be released:
> > - * e.g. acquire the rcu_read_lock or css_set_lock.
> > + * e.g. acquire the rcu_read_lock or objcg_lock or cgroup_mutex.
> >   */
> >  static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
> >  {
> > @@ -454,7 +454,37 @@ static inline struct mem_cgroup *page_memcg(struct page *page)
> >  	return folio_memcg(page_folio(page));
> >  }
> >  
> > -/**
> > +/*
> > + * get_mem_cgroup_from_folio - Obtain a reference on the memory cgroup
> > + *			       associated with a folio.
> > + * @folio: Pointer to the folio.
> > + *
> > + * Returns a pointer to the memory cgroup (and obtain a reference on it)
> > + * associated with the folio, or NULL. This function assumes that the
> > + * folio is known to have a proper memory cgroup pointer. It's not safe
> > + * to call this function against some type of pages, e.g. slab pages or
> > + * ex-slab pages.
> > + */
> > +static inline struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
> > +{
> > +	struct mem_cgroup *memcg;
> > +
> > +	rcu_read_lock();
> > +retry:
> > +	memcg = folio_memcg(folio);
> > +	if (unlikely(memcg && !css_tryget(&memcg->css)))
> > +		goto retry;
> > +	rcu_read_unlock();
> > +
> > +	return memcg;
> > +}
> > +
> > +static inline struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
> > +{
> > +	return get_mem_cgroup_from_folio(page_folio(page));
> > +}
> > +
> > +/*
> >   * folio_memcg_rcu - Locklessly get the memory cgroup associated with a folio.
> >   * @folio: Pointer to the folio.
> >   *
> > @@ -873,7 +903,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
> >  	return match;
> >  }
> >  
> > -struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
> > +struct cgroup_subsys_state *get_mem_cgroup_css_from_page(struct page *page);
> >  ino_t page_cgroup_ino(struct page *page);
> >  
> >  static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
> > @@ -1047,10 +1077,13 @@ static inline void count_memcg_events(struct mem_cgroup *memcg,
> >  static inline void count_memcg_page_event(struct page *page,
> >  					  enum vm_event_item idx)
> >  {
> > -	struct mem_cgroup *memcg = page_memcg(page);
> > +	struct mem_cgroup *memcg;
> >  
> > +	rcu_read_lock();
> > +	memcg = page_memcg(page);
> >  	if (memcg)
> >  		count_memcg_events(memcg, idx, 1);
> > +	rcu_read_unlock();
> >  }
> >  
> >  static inline void count_memcg_event_mm(struct mm_struct *mm,
> > @@ -1129,6 +1162,16 @@ static inline struct mem_cgroup *page_memcg(struct page *page)
> >  	return NULL;
> >  }
> >  
> > +static inline struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
> > +{
> > +	return NULL;
> > +}
> > +
> > +static inline struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
> > +{
> > +	return NULL;
> > +}
> > +
> >  static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
> >  {
> >  	WARN_ON_ONCE(!rcu_read_lock_held());
> > diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
> > index 86b2a82da546..cdb822339f13 100644
> > --- a/include/trace/events/writeback.h
> > +++ b/include/trace/events/writeback.h
> > @@ -258,6 +258,11 @@ TRACE_EVENT(track_foreign_dirty,
> >  		__entry->ino		= inode ? inode->i_ino : 0;
> >  		__entry->memcg_id	= wb->memcg_css->id;
> >  		__entry->cgroup_ino	= __trace_wb_assign_cgroup(wb);
> > +		/*
> > +		 * TP_fast_assign() is under preemption disabled which can
> > +		 * serve as an RCU read-side critical section so that the
> > +		 * memcg returned by folio_memcg() cannot be freed.
> > +		 */
> >  		__entry->page_cgroup_ino = cgroup_ino(folio_memcg(folio)->css.cgroup);
> >  	),
> >  
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index b38a77f6696f..dcaf6cf5dc74 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -371,7 +371,7 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key);
> >  #endif
> >  
> >  /**
> > - * mem_cgroup_css_from_page - css of the memcg associated with a page
> > + * get_mem_cgroup_css_from_page - get css of the memcg associated with a page
> >   * @page: page of interest
> >   *
> >   * If memcg is bound to the default hierarchy, css of the memcg associated
> > @@ -381,13 +381,15 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key);
> >   * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
> >   * is returned.
> >   */
> > -struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
> > +struct cgroup_subsys_state *get_mem_cgroup_css_from_page(struct page *page)
> >  {
> >  	struct mem_cgroup *memcg;
> >  
> > -	memcg = page_memcg(page);
> > +	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
> > +		return &root_mem_cgroup->css;
> >  
> > -	if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
> > +	memcg = get_mem_cgroup_from_page(page);
> > +	if (!memcg)
> >  		memcg = root_mem_cgroup;
> >  
> >  	return &memcg->css;
> > @@ -770,13 +772,13 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
> >  void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
> >  			     int val)
> >  {
> > -	struct page *head = compound_head(page); /* rmap on tail pages */
> > +	struct folio *folio = page_folio(page); /* rmap on tail pages */
> >  	struct mem_cgroup *memcg;
> >  	pg_data_t *pgdat = page_pgdat(page);
> >  	struct lruvec *lruvec;
> >  
> >  	rcu_read_lock();
> > -	memcg = page_memcg(head);
> > +	memcg = folio_memcg(folio);
> >  	/* Untracked pages have no memcg, no lruvec. Update only the node */
> >  	if (!memcg) {
> >  		rcu_read_unlock();
> > @@ -2058,7 +2060,9 @@ void folio_memcg_lock(struct folio *folio)
> >  	 * The RCU lock is held throughout the transaction.  The fast
> >  	 * path can get away without acquiring the memcg->move_lock
> >  	 * because page moving starts with an RCU grace period.
> > -         */
> > +	 *
> > +	 * The RCU lock also protects the memcg from being freed.
> > +	 */
> >  	rcu_read_lock();
> >  
> >  	if (mem_cgroup_disabled())
> > @@ -3296,7 +3300,7 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
> >  void split_page_memcg(struct page *head, unsigned int nr)
> >  {
> >  	struct folio *folio = page_folio(head);
> > -	struct mem_cgroup *memcg = folio_memcg(folio);
> > +	struct mem_cgroup *memcg = get_mem_cgroup_from_folio(folio);
> >  	int i;
> >  
> >  	if (mem_cgroup_disabled() || !memcg)
> > @@ -3309,6 +3313,8 @@ void split_page_memcg(struct page *head, unsigned int nr)
> >  		obj_cgroup_get_many(__folio_objcg(folio), nr - 1);
> >  	else
> >  		css_get_many(&memcg->css, nr - 1);
> > +
> > +	css_put(&memcg->css);
> >  }
> >  
> >  #ifdef CONFIG_MEMCG_SWAP
> > @@ -4511,7 +4517,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
> >  void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
> >  					     struct bdi_writeback *wb)
> >  {
> > -	struct mem_cgroup *memcg = folio_memcg(folio);
> > +	struct mem_cgroup *memcg = get_mem_cgroup_from_folio(folio);
> >  	struct memcg_cgwb_frn *frn;
> >  	u64 now = get_jiffies_64();
> >  	u64 oldest_at = now;
> > @@ -4558,6 +4564,7 @@ void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
> >  		frn->memcg_id = wb->memcg_css->id;
> >  		frn->at = now;
> >  	}
> > +	css_put(&memcg->css);
> >  }
> >  
> >  /* issue foreign writeback flushes for recorded foreign dirtying events */
> > @@ -6092,6 +6099,14 @@ static void mem_cgroup_move_charge(void)
> >  	atomic_dec(&mc.from->moving_account);
> >  }
> >  
> > +/*
> > + * The cgroup migration and memory cgroup offlining are serialized by
> > + * @cgroup_mutex. If we reach here, it means that the LRU pages cannot
> > + * be reparented to its parent memory cgroup. So during the whole process
> > + * of mem_cgroup_move_task(), page_memcg(page) is stable. So we do not
> > + * need to worry about the memcg (returned from page_memcg()) being
> > + * released even if we do not hold an rcu read lock.
> > + */
> >  static void mem_cgroup_move_task(void)
> >  {
> >  	if (mc.to) {
> > @@ -6895,7 +6910,7 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
> >  	if (folio_memcg(new))
> >  		return;
> >  
> > -	memcg = folio_memcg(old);
> > +	memcg = get_mem_cgroup_from_folio(old);
> >  	VM_WARN_ON_ONCE_FOLIO(!memcg, old);
> >  	if (!memcg)
> >  		return;
> > @@ -6914,6 +6929,8 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
> >  	mem_cgroup_charge_statistics(memcg, nr_pages);
> >  	memcg_check_events(memcg, folio_nid(new));
> >  	local_irq_restore(flags);
> > +
> > +	css_put(&memcg->css);
> >  }
> >  
> >  DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
> > @@ -7100,6 +7117,10 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
> >  	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
> >  		return;
> >  
> > +	/*
> > +	 * Interrupts should be disabled by the caller (see the comments below),
> > +	 * which can serve as RCU read-side critical sections.
> > +	 */
> >  	memcg = folio_memcg(folio);
> >  
> >  	VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
> > @@ -7165,15 +7186,16 @@ int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
> >  	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
> >  		return 0;
> >  
> > +	rcu_read_lock();
> >  	memcg = page_memcg(page);
> >  
> >  	VM_WARN_ON_ONCE_PAGE(!memcg, page);
> >  	if (!memcg)
> > -		return 0;
> > +		goto out;
> >  
> >  	if (!entry.val) {
> >  		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
> > -		return 0;
> > +		goto out;
> >  	}
> >  
> >  	memcg = mem_cgroup_id_get_online(memcg);
> > @@ -7183,6 +7205,7 @@ int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
> >  		memcg_memory_event(memcg, MEMCG_SWAP_MAX);
> >  		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
> >  		mem_cgroup_id_put(memcg);
> > +		rcu_read_unlock();
> >  		return -ENOMEM;
> 
> If you add the "out" label, please use it here too.
>

Good point. Will do.
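
Something like the following then (untested; "ret" would be a new local
initialized to 0, the elided lines stay unchanged):

		memcg_memory_event(memcg, MEMCG_SWAP_MAX);
		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
		mem_cgroup_id_put(memcg);
		ret = -ENOMEM;
		goto out;
	}
	...
out:
	rcu_read_unlock();

	return ret;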

> >  	}
> >  
> > @@ -7192,6 +7215,8 @@ int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
> >  	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
> >  	VM_BUG_ON_PAGE(oldid, page);
> >  	mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
> > +out:
> > +	rcu_read_unlock();
> >  
> >  	return 0;
> >  }
> > @@ -7246,17 +7271,22 @@ bool mem_cgroup_swap_full(struct page *page)
> >  	if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
> >  		return false;
> >  
> > +	rcu_read_lock();
> >  	memcg = page_memcg(page);
> >  	if (!memcg)
> > -		return false;
> > +		goto out;
> >  
> >  	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
> >  		unsigned long usage = page_counter_read(&memcg->swap);
> >  
> >  		if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
> > -		    usage * 2 >= READ_ONCE(memcg->swap.max))
> > +		    usage * 2 >= READ_ONCE(memcg->swap.max)) {
> > +			rcu_read_unlock();
> >  			return true;
> 
> Please, make something like
> 	ret = true;
> 	goto out;
> here. It will be more consistent.
>

Will do.
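
So the tail of mem_cgroup_swap_full() would then look something like
(untested):

	bool ret = false;
	...
	rcu_read_lock();
	memcg = page_memcg(page);
	if (!memcg)
		goto out;

	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
		unsigned long usage = page_counter_read(&memcg->swap);

		if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
		    usage * 2 >= READ_ONCE(memcg->swap.max)) {
			ret = true;
			goto out;
		}
	}
out:
	rcu_read_unlock();

	return ret;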

Thanks.

> > +		}
> >  	}
> > +out:
> > +	rcu_read_unlock();
> >  
> >  	return false;
> >  }
> > diff --git a/mm/migrate.c b/mm/migrate.c
> > index 6c31ee1e1c9b..59e97a8a64a0 100644
> > --- a/mm/migrate.c
> > +++ b/mm/migrate.c
> > @@ -430,6 +430,10 @@ int folio_migrate_mapping(struct address_space *mapping,
> >  		struct lruvec *old_lruvec, *new_lruvec;
> >  		struct mem_cgroup *memcg;
> >  
> > +		/*
> > +		 * Irq is disabled, which can serve as RCU read-side critical
> > +		 * sections.
> > +		 */
> >  		memcg = folio_memcg(folio);
> >  		old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
> >  		new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
> > diff --git a/mm/page_io.c b/mm/page_io.c
> > index 89fbf3cae30f..a0d9cd68e87a 100644
> > --- a/mm/page_io.c
> > +++ b/mm/page_io.c
> > @@ -221,13 +221,14 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
> >  	struct cgroup_subsys_state *css;
> >  	struct mem_cgroup *memcg;
> >  
> > +	rcu_read_lock();
> >  	memcg = page_memcg(page);
> >  	if (!memcg)
> > -		return;
> > +		goto out;
> >  
> > -	rcu_read_lock();
> >  	css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys);
> >  	bio_associate_blkg_from_css(bio, css);
> > +out:
> >  	rcu_read_unlock();
> >  }
> >  #else
> > -- 
> > 2.11.0
> > 
>

Patch

diff --git a/fs/buffer.c b/fs/buffer.c
index 2b5561ae5d0b..80975a457670 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -819,8 +819,7 @@  struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 	if (retry)
 		gfp |= __GFP_NOFAIL;
 
-	/* The page lock pins the memcg */
-	memcg = page_memcg(page);
+	memcg = get_mem_cgroup_from_page(page);
 	old_memcg = set_active_memcg(memcg);
 
 	head = NULL;
@@ -840,6 +839,7 @@  struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 		set_bh_page(bh, page, offset);
 	}
 out:
+	mem_cgroup_put(memcg);
 	set_active_memcg(old_memcg);
 	return head;
 /*
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1fae0196292a..56612ace8778 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -243,15 +243,13 @@  void __inode_attach_wb(struct inode *inode, struct page *page)
 	if (inode_cgwb_enabled(inode)) {
 		struct cgroup_subsys_state *memcg_css;
 
-		if (page) {
-			memcg_css = mem_cgroup_css_from_page(page);
-			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
-		} else {
-			/* must pin memcg_css, see wb_get_create() */
+		/* must pin memcg_css, see wb_get_create() */
+		if (page)
+			memcg_css = get_mem_cgroup_css_from_page(page);
+		else
 			memcg_css = task_get_css(current, memory_cgrp_id);
-			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
-			css_put(memcg_css);
-		}
+		wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+		css_put(memcg_css);
 	}
 
 	if (!wb)
@@ -868,16 +866,16 @@  void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
 	if (!wbc->wb || wbc->no_cgroup_owner)
 		return;
 
-	css = mem_cgroup_css_from_page(page);
+	css = get_mem_cgroup_css_from_page(page);
 	/* dead cgroups shouldn't contribute to inode ownership arbitration */
 	if (!(css->flags & CSS_ONLINE))
-		return;
+		goto out;
 
 	id = css->id;
 
 	if (id == wbc->wb_id) {
 		wbc->wb_bytes += bytes;
-		return;
+		goto out;
 	}
 
 	if (id == wbc->wb_lcand_id)
@@ -890,6 +888,9 @@  void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
 		wbc->wb_tcand_bytes += bytes;
 	else
 		wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
+
+out:
+	css_put(css);
 }
 EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);
 
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8c2f1ba2f471..3a0e2592434e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -373,7 +373,7 @@  static inline bool folio_memcg_kmem(struct folio *folio);
  * a valid memcg, but can be atomically swapped to the parent memcg.
  *
  * The caller must ensure that the returned memcg won't be released:
- * e.g. acquire the rcu_read_lock or css_set_lock.
+ * e.g. acquire the rcu_read_lock or objcg_lock or cgroup_mutex.
  */
 static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
 {
@@ -454,7 +454,37 @@  static inline struct mem_cgroup *page_memcg(struct page *page)
 	return folio_memcg(page_folio(page));
 }
 
-/**
+/*
+ * get_mem_cgroup_from_folio - Obtain a reference on the memory cgroup
+ *			       associated with a folio.
+ * @folio: Pointer to the folio.
+ *
+ * Returns a pointer to the memory cgroup (and obtain a reference on it)
+ * associated with the folio, or NULL. This function assumes that the
+ * folio is known to have a proper memory cgroup pointer. It's not safe
+ * to call this function against some type of pages, e.g. slab pages or
+ * ex-slab pages.
+ */
+static inline struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
+{
+	struct mem_cgroup *memcg;
+
+	rcu_read_lock();
+retry:
+	memcg = folio_memcg(folio);
+	if (unlikely(memcg && !css_tryget(&memcg->css)))
+		goto retry;
+	rcu_read_unlock();
+
+	return memcg;
+}
+
+static inline struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
+{
+	return get_mem_cgroup_from_folio(page_folio(page));
+}
+
+/*
  * folio_memcg_rcu - Locklessly get the memory cgroup associated with a folio.
  * @folio: Pointer to the folio.
  *
@@ -873,7 +903,7 @@  static inline bool mm_match_cgroup(struct mm_struct *mm,
 	return match;
 }
 
-struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
+struct cgroup_subsys_state *get_mem_cgroup_css_from_page(struct page *page);
 ino_t page_cgroup_ino(struct page *page);
 
 static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
@@ -1047,10 +1077,13 @@  static inline void count_memcg_events(struct mem_cgroup *memcg,
 static inline void count_memcg_page_event(struct page *page,
 					  enum vm_event_item idx)
 {
-	struct mem_cgroup *memcg = page_memcg(page);
+	struct mem_cgroup *memcg;
 
+	rcu_read_lock();
+	memcg = page_memcg(page);
 	if (memcg)
 		count_memcg_events(memcg, idx, 1);
+	rcu_read_unlock();
 }
 
 static inline void count_memcg_event_mm(struct mm_struct *mm,
@@ -1129,6 +1162,16 @@  static inline struct mem_cgroup *page_memcg(struct page *page)
 	return NULL;
 }
 
+static inline struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
+{
+	return NULL;
+}
+
+static inline struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
+{
+	return NULL;
+}
+
 static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
 {
 	WARN_ON_ONCE(!rcu_read_lock_held());
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 86b2a82da546..cdb822339f13 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -258,6 +258,11 @@  TRACE_EVENT(track_foreign_dirty,
 		__entry->ino		= inode ? inode->i_ino : 0;
 		__entry->memcg_id	= wb->memcg_css->id;
 		__entry->cgroup_ino	= __trace_wb_assign_cgroup(wb);
+		/*
+		 * TP_fast_assign() is under preemption disabled which can
+		 * serve as an RCU read-side critical section so that the
+		 * memcg returned by folio_memcg() cannot be freed.
+		 */
 		__entry->page_cgroup_ino = cgroup_ino(folio_memcg(folio)->css.cgroup);
 	),
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b38a77f6696f..dcaf6cf5dc74 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -371,7 +371,7 @@  EXPORT_SYMBOL(memcg_kmem_enabled_key);
 #endif
 
 /**
- * mem_cgroup_css_from_page - css of the memcg associated with a page
+ * get_mem_cgroup_css_from_page - get css of the memcg associated with a page
  * @page: page of interest
  *
  * If memcg is bound to the default hierarchy, css of the memcg associated
@@ -381,13 +381,15 @@  EXPORT_SYMBOL(memcg_kmem_enabled_key);
  * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
  * is returned.
  */
-struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
+struct cgroup_subsys_state *get_mem_cgroup_css_from_page(struct page *page)
 {
 	struct mem_cgroup *memcg;
 
-	memcg = page_memcg(page);
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+		return &root_mem_cgroup->css;
 
-	if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+	memcg = get_mem_cgroup_from_page(page);
+	if (!memcg)
 		memcg = root_mem_cgroup;
 
 	return &memcg->css;
@@ -770,13 +772,13 @@  void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
 			     int val)
 {
-	struct page *head = compound_head(page); /* rmap on tail pages */
+	struct folio *folio = page_folio(page); /* rmap on tail pages */
 	struct mem_cgroup *memcg;
 	pg_data_t *pgdat = page_pgdat(page);
 	struct lruvec *lruvec;
 
 	rcu_read_lock();
-	memcg = page_memcg(head);
+	memcg = folio_memcg(folio);
 	/* Untracked pages have no memcg, no lruvec. Update only the node */
 	if (!memcg) {
 		rcu_read_unlock();
@@ -2058,7 +2060,9 @@  void folio_memcg_lock(struct folio *folio)
 	 * The RCU lock is held throughout the transaction.  The fast
 	 * path can get away without acquiring the memcg->move_lock
 	 * because page moving starts with an RCU grace period.
-         */
+	 *
+	 * The RCU lock also protects the memcg from being freed.
+	 */
 	rcu_read_lock();
 
 	if (mem_cgroup_disabled())
@@ -3296,7 +3300,7 @@  void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
 void split_page_memcg(struct page *head, unsigned int nr)
 {
 	struct folio *folio = page_folio(head);
-	struct mem_cgroup *memcg = folio_memcg(folio);
+	struct mem_cgroup *memcg = get_mem_cgroup_from_folio(folio);
 	int i;
 
 	if (mem_cgroup_disabled() || !memcg)
@@ -3309,6 +3313,8 @@  void split_page_memcg(struct page *head, unsigned int nr)
 		obj_cgroup_get_many(__folio_objcg(folio), nr - 1);
 	else
 		css_get_many(&memcg->css, nr - 1);
+
+	css_put(&memcg->css);
 }
 
 #ifdef CONFIG_MEMCG_SWAP
@@ -4511,7 +4517,7 @@  void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
 					     struct bdi_writeback *wb)
 {
-	struct mem_cgroup *memcg = folio_memcg(folio);
+	struct mem_cgroup *memcg = get_mem_cgroup_from_folio(folio);
 	struct memcg_cgwb_frn *frn;
 	u64 now = get_jiffies_64();
 	u64 oldest_at = now;
@@ -4558,6 +4564,7 @@  void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
 		frn->memcg_id = wb->memcg_css->id;
 		frn->at = now;
 	}
+	css_put(&memcg->css);
 }
 
 /* issue foreign writeback flushes for recorded foreign dirtying events */
@@ -6092,6 +6099,14 @@  static void mem_cgroup_move_charge(void)
 	atomic_dec(&mc.from->moving_account);
 }
 
+/*
+ * The cgroup migration and memory cgroup offlining are serialized by
+ * @cgroup_mutex. If we reach here, it means that the LRU pages cannot
+ * be reparented to its parent memory cgroup. So during the whole process
+ * of mem_cgroup_move_task(), page_memcg(page) is stable. So we do not
+ * need to worry about the memcg (returned from page_memcg()) being
+ * released even if we do not hold an rcu read lock.
+ */
 static void mem_cgroup_move_task(void)
 {
 	if (mc.to) {
@@ -6895,7 +6910,7 @@  void mem_cgroup_migrate(struct folio *old, struct folio *new)
 	if (folio_memcg(new))
 		return;
 
-	memcg = folio_memcg(old);
+	memcg = get_mem_cgroup_from_folio(old);
 	VM_WARN_ON_ONCE_FOLIO(!memcg, old);
 	if (!memcg)
 		return;
@@ -6914,6 +6929,8 @@  void mem_cgroup_migrate(struct folio *old, struct folio *new)
 	mem_cgroup_charge_statistics(memcg, nr_pages);
 	memcg_check_events(memcg, folio_nid(new));
 	local_irq_restore(flags);
+
+	css_put(&memcg->css);
 }
 
 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
@@ -7100,6 +7117,10 @@  void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
 	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
 		return;
 
+	/*
+	 * Interrupts should be disabled by the caller (see the comments below),
+	 * which can serve as RCU read-side critical sections.
+	 */
 	memcg = folio_memcg(folio);
 
 	VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
@@ -7165,15 +7186,16 @@  int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
 		return 0;
 
+	rcu_read_lock();
 	memcg = page_memcg(page);
 
 	VM_WARN_ON_ONCE_PAGE(!memcg, page);
 	if (!memcg)
-		return 0;
+		goto out;
 
 	if (!entry.val) {
 		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
-		return 0;
+		goto out;
 	}
 
 	memcg = mem_cgroup_id_get_online(memcg);
@@ -7183,6 +7205,7 @@  int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
 		memcg_memory_event(memcg, MEMCG_SWAP_MAX);
 		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
 		mem_cgroup_id_put(memcg);
+		rcu_read_unlock();
 		return -ENOMEM;
 	}
 
@@ -7192,6 +7215,8 @@  int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
 	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
 	VM_BUG_ON_PAGE(oldid, page);
 	mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
+out:
+	rcu_read_unlock();
 
 	return 0;
 }
@@ -7246,17 +7271,22 @@  bool mem_cgroup_swap_full(struct page *page)
 	if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
 		return false;
 
+	rcu_read_lock();
 	memcg = page_memcg(page);
 	if (!memcg)
-		return false;
+		goto out;
 
 	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
 		unsigned long usage = page_counter_read(&memcg->swap);
 
 		if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
-		    usage * 2 >= READ_ONCE(memcg->swap.max))
+		    usage * 2 >= READ_ONCE(memcg->swap.max)) {
+			rcu_read_unlock();
 			return true;
+		}
 	}
+out:
+	rcu_read_unlock();
 
 	return false;
 }
diff --git a/mm/migrate.c b/mm/migrate.c
index 6c31ee1e1c9b..59e97a8a64a0 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -430,6 +430,10 @@  int folio_migrate_mapping(struct address_space *mapping,
 		struct lruvec *old_lruvec, *new_lruvec;
 		struct mem_cgroup *memcg;
 
+		/*
+		 * Irq is disabled, which can serve as RCU read-side critical
+		 * sections.
+		 */
 		memcg = folio_memcg(folio);
 		old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
 		new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
diff --git a/mm/page_io.c b/mm/page_io.c
index 89fbf3cae30f..a0d9cd68e87a 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -221,13 +221,14 @@  static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
 	struct cgroup_subsys_state *css;
 	struct mem_cgroup *memcg;
 
+	rcu_read_lock();
 	memcg = page_memcg(page);
 	if (!memcg)
-		return;
+		goto out;
 
-	rcu_read_lock();
 	css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys);
 	bio_associate_blkg_from_css(bio, css);
+out:
 	rcu_read_unlock();
 }
 #else