
[v3,3/5] mm/memcg: Protect per-CPU counter by disabling preemption on PREEMPT_RT where needed.

Message ID 20220217094802.3644569-4-bigeasy@linutronix.de (mailing list archive)
State New
Series mm/memcg: Address PREEMPT_RT problems instead of disabling it.

Commit Message

Sebastian Andrzej Siewior Feb. 17, 2022, 9:48 a.m. UTC
The per-CPU counters are modified with non-atomic operations. Consistency
is ensured by disabling interrupts for the update.
On non-PREEMPT_RT configurations this works because acquiring a
spinlock_t typed lock with the _irq() suffix disables interrupts. On
PREEMPT_RT configurations the RMW operation can be interrupted.

Another problem is that mem_cgroup_swapout() expects to be invoked with
interrupts disabled because its caller has to acquire a spinlock_t, which
is acquired with interrupts disabled. Since spinlock_t never disables
interrupts on PREEMPT_RT, interrupts are never disabled at this
point.

The code is never called from in_irq() context on PREEMPT_RT, therefore
disabling preemption during the update is sufficient on PREEMPT_RT.
The sections which explicitly disable interrupts can remain on
PREEMPT_RT because they stay short and do not involve
sleeping locks (memcg_check_events() does nothing on PREEMPT_RT).

Disable preemption during update of the per-CPU variables which do not
explicitly disable interrupts.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Roman Gushchin <guro@fb.com>
---
 mm/memcontrol.c | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)
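
To make the failure mode concrete, here is a minimal user-space analogy (an
editorial illustration, not kernel code and not part of the patch): two
contexts doing a plain read-modify-write on the same counter without mutual
exclusion lose increments, which is what can happen to a per-CPU counter once
the update can be preempted or interrupted half-way through.

/*
 * Editorial illustration only: a non-atomic read-modify-write that can be
 * interleaved loses updates. In the kernel, the interleaving partner is
 * whatever preempts or interrupts the updater on the same CPU.
 */
#include <pthread.h>
#include <stdio.h>

static long counter;			/* stand-in for a per-CPU counter */

static void *updater(void *unused)
{
	for (int i = 0; i < 1000000; i++)
		counter++;		/* load, add, store: not atomic */
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, updater, NULL);
	pthread_create(&b, NULL, updater, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);

	/* Almost always prints less than 2000000: increments were lost. */
	printf("counter = %ld\n", counter);
	return 0;
}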

Comments

Shakeel Butt Feb. 18, 2022, 5:25 p.m. UTC | #1
On Thu, Feb 17, 2022 at 1:48 AM Sebastian Andrzej Siewior
<bigeasy@linutronix.de> wrote:
>
> The per-CPU counters are modified with non-atomic operations. Consistency
> is ensured by disabling interrupts for the update.
> On non-PREEMPT_RT configurations this works because acquiring a
> spinlock_t typed lock with the _irq() suffix disables interrupts. On
> PREEMPT_RT configurations the RMW operation can be interrupted.
>
> Another problem is that mem_cgroup_swapout() expects to be invoked with
> interrupts disabled because its caller has to acquire a spinlock_t, which
> is acquired with interrupts disabled. Since spinlock_t never disables
> interrupts on PREEMPT_RT, interrupts are never disabled at this
> point.
>
> The code is never called from in_irq() context on PREEMPT_RT, therefore
> disabling preemption during the update is sufficient on PREEMPT_RT.
> The sections which explicitly disable interrupts can remain on
> PREEMPT_RT because they stay short and do not involve
> sleeping locks (memcg_check_events() does nothing on PREEMPT_RT).
>
> Disable preemption during update of the per-CPU variables which do not
> explicitly disable interrupts.
>
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> Acked-by: Roman Gushchin <guro@fb.com>
> ---
>  mm/memcontrol.c | 29 ++++++++++++++++++++++++++++-
>  1 file changed, 28 insertions(+), 1 deletion(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 0b5117ed2ae08..36ab3660f2c6d 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -630,6 +630,28 @@ static DEFINE_SPINLOCK(stats_flush_lock);
>  static DEFINE_PER_CPU(unsigned int, stats_updates);
>  static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
>
> +/*
> + * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can
> + * not rely on this as part of an acquired spinlock_t lock. These functions are
> + * never used in hardirq context on PREEMPT_RT and therefore disabling preemption
> + * is sufficient.
> + */
> +static void memcg_stats_lock(void)
> +{
> +#ifdef CONFIG_PREEMPT_RT
> +      preempt_disable();
> +#else
> +      VM_BUG_ON(!irqs_disabled());
> +#endif
> +}
> +
> +static void memcg_stats_unlock(void)
> +{
> +#ifdef CONFIG_PREEMPT_RT
> +      preempt_enable();
> +#endif
> +}
> +
>  static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
>  {
>         unsigned int x;
> @@ -706,6 +728,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
>         pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
>         memcg = pn->memcg;
>
> +       memcg_stats_lock();

The call chains from rmap.c have not really disabled irqs. Actually
there is a comment in do_page_add_anon_rmap() "We use the irq-unsafe
__{inc|mod}_zone_page_stat because these counters are not modified in
interrupt context, and pte lock(a spinlock) is held, which implies
preemption disabled".

VM_BUG_ON(!irqs_disabled()) within memcg_stats_lock() would be giving
false error reports for CONFIG_PREEMPT_NONE kernels.
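
For orientation, the path in question can be sketched as follows (an editorial
sketch with a hypothetical helper and simplified arguments, based only on the
call chain and the pte-lock comment cited in this thread):

/*
 * Editorial sketch, not actual kernel code: the rmap-side update runs with
 * the pte spinlock held. The lock is taken with plain spin_lock(), so on a
 * !PREEMPT_RT kernel preemption is disabled but interrupts stay enabled.
 */
static void rmap_style_update(struct page *page, spinlock_t *ptl, int nr)
{
	spin_lock(ptl);		/* no _irq() suffix: IRQs remain enabled */
	__mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
	/*
	 * -> __mod_lruvec_state() -> __mod_memcg_lruvec_state()
	 *    -> memcg_stats_lock(): on !PREEMPT_RT with CONFIG_DEBUG_VM the
	 *       VM_BUG_ON(!irqs_disabled()) added above would trip here, even
	 *       though these counters are never modified from IRQ context.
	 */
	spin_unlock(ptl);
}
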
Sebastian Andrzej Siewior Feb. 21, 2022, 11:31 a.m. UTC | #2
On 2022-02-18 09:25:29 [-0800], Shakeel Butt wrote:
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index 0b5117ed2ae08..36ab3660f2c6d 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -630,6 +630,28 @@ static DEFINE_SPINLOCK(stats_flush_lock);
> >  static DEFINE_PER_CPU(unsigned int, stats_updates);
> >  static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
> >
> > +/*
> > + * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can
> > + * not rely on this as part of an acquired spinlock_t lock. These functions are
> > + * never used in hardirq context on PREEMPT_RT and therefore disabling preemption
> > + * is sufficient.
> > + */
> > +static void memcg_stats_lock(void)
> > +{
> > +#ifdef CONFIG_PREEMPT_RT
> > +      preempt_disable();
> > +#else
> > +      VM_BUG_ON(!irqs_disabled());
> > +#endif
> > +}
> > +
> > +static void memcg_stats_unlock(void)
> > +{
> > +#ifdef CONFIG_PREEMPT_RT
> > +      preempt_enable();
> > +#endif
> > +}
> > +
> >  static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
> >  {
> >         unsigned int x;
> > @@ -706,6 +728,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
> >         pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> >         memcg = pn->memcg;
> >
> > +       memcg_stats_lock();
> 
> The call chains from rmap.c have not really disabled irqs. Actually
> there is a comment in do_page_add_anon_rmap() "We use the irq-unsafe
> __{inc|mod}_zone_page_stat because these counters are not modified in
> interrupt context, and pte lock(a spinlock) is held, which implies
> preemption disabled".
> 
> VM_BUG_ON(!irqs_disabled()) within memcg_stats_lock() would be giving
> false error reports for CONFIG_PREEMPT_NONE kernels.

So three callers, including do_page_add_anon_rmap():
   __mod_lruvec_page_state() -> __mod_lruvec_state() -> __mod_memcg_lruvec_state()

are affected. Here we get false warnings because interrupts may not be
disabled, and that is intended. Hmmm.
The odd part is that this only affects certain idx values, so any kind of
additional debugging would need to take this into account.
What about memcg_rstat_updated()? It does:

|         x = __this_cpu_add_return(stats_updates, abs(val));
|         if (x > MEMCG_CHARGE_BATCH) {
|                 atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold);
|                 __this_cpu_write(stats_updates, 0);
|         }

The writes to stats_updates can happen from IRQ-context and with
disabled preemption only. So this is not good, right?

Sebastian
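
The interleaving behind that concern can be spelled out like this (an editorial
sketch of the scenario described above, not a confirmed bug; the names are
taken from the quoted memcg_rstat_updated() code):

  task, only preemption disabled          interrupt on the same CPU
  ------------------------------          -------------------------
  __this_cpu_add_return(stats_updates, abs(val))
      reads stats_updates == S
                                          __this_cpu_add_return(stats_updates, n)
                                          /* stats_updates == S + n */
      writes back S + abs(val)            /* the interrupt's n is overwritten */

  /* Likewise, the __this_cpu_write(stats_updates, 0) after the batch check
   * can discard a contribution added by an interrupt in between, so the
   * flush hint may undercount. Whether that matters is answered below. */
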
Sebastian Andrzej Siewior Feb. 21, 2022, 12:12 p.m. UTC | #3
On 2022-02-21 12:31:18 [+0100], To Shakeel Butt wrote:
> > The call chains from rmap.c have not really disabled irqs. Actually
> > there is a comment in do_page_add_anon_rmap() "We use the irq-unsafe
> > __{inc|mod}_zone_page_stat because these counters are not modified in
> > interrupt context, and pte lock(a spinlock) is held, which implies
> > preemption disabled".
> > 
> > VM_BUG_ON(!irqs_disabled()) within memcg_stats_lock() would be giving
> > false error reports for CONFIG_PREEMPT_NONE kernels.
> 
> So three callers, including do_page_add_anon_rmap():
>    __mod_lruvec_page_state() -> __mod_lruvec_state() -> __mod_memcg_lruvec_state()
> 
> are affected. Here we get false warnings because interrupts may not be
> disabled, and that is intended. Hmmm.
> The odd part is that this only affects certain idx values, so any kind of
> additional debugging would need to take this into account.
> What about memcg_rstat_updated()? It does:
> 
> |         x = __this_cpu_add_return(stats_updates, abs(val));
> |         if (x > MEMCG_CHARGE_BATCH) {
> |                 atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold);
> |                 __this_cpu_write(stats_updates, 0);
> |         }
> 
> The writes to stats_updates can happen from IRQ-context and with
> disabled preemption only. So this is not good, right?

So I made the following to avoid the wrong assert. Still not sure how
bad the hunk above is.

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 97a88b63ee983..1bac4798b72ba 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -645,6 +645,13 @@ static void memcg_stats_lock(void)
 #endif
 }
 
+static void __memcg_stats_lock(void)
+{
+#ifdef CONFIG_PREEMPT_RT
+      preempt_disable();
+#endif
+}
+
 static void memcg_stats_unlock(void)
 {
 #ifdef CONFIG_PREEMPT_RT
@@ -728,7 +735,20 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
 	memcg = pn->memcg;
 
-	memcg_stats_lock();
+	/*
+	 * The callers from rmap rely on disabled preemption because they never
+	 * update their counters from interrupt context. For these two
+	 * counters we check that the update is never performed from an
+	 * interrupt context, while other callers need to have interrupts disabled.
+	 */
+	__memcg_stats_lock();
+	if (IS_ENABLED(CONFIG_DEBUG_VM)) {
+		if (idx == NR_ANON_MAPPED || idx == NR_FILE_MAPPED)
+			WARN_ON_ONCE(!in_task());
+		else
+			WARN_ON_ONCE(!irqs_disabled());
+	}
+
 	/* Update memcg */
 	__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
 

Sebastian
Michal Koutný Feb. 21, 2022, 1:18 p.m. UTC | #4
On Mon, Feb 21, 2022 at 12:31:17PM +0100, Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
> What about memcg_rstat_updated()? It does:
> 
> |         x = __this_cpu_add_return(stats_updates, abs(val));
> |         if (x > MEMCG_CHARGE_BATCH) {
> |                 atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold);
> |                 __this_cpu_write(stats_updates, 0);
> |         }
> 
> The writes to stats_updates can happen from IRQ-context and with
> disabled preemption only. So this is not good, right?

These counters serve as a hint for aggregating the per-CPU per-cgroup stats.
If they were systematically mis-updated, it could manifest as a missing
"refresh signal" from the given CPU. OTOH, this lagging is also
meant to be limited by elapsed time thanks to periodic flushing.

This could affect the freshness of the stats, not their accuracy, though.


HTH,
Michal
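
Michal's freshness-versus-accuracy point can be illustrated with a small
stand-alone sketch (editorial, plain user-space C, all names invented; it only
mirrors the threshold idea from the quoted code): the hint merely decides when
to aggregate, while the aggregation re-reads the real counters, so a lost hint
update can only postpone a flush, never corrupt the totals.

#include <stdio.h>

#define BATCH 64			/* stand-in for MEMCG_CHARGE_BATCH */

static long stat[2];			/* the real per-CPU stats, always updated */
static long hint;			/* the flush hint, may miss an update     */
static long last_flushed;

static void update(int cpu, long val, int lose_hint)
{
	stat[cpu] += val;		/* the stat itself is protected in the kernel */
	if (!lose_hint)
		hint += val;		/* losing this update is tolerable */
}

static void maybe_flush(void)
{
	if (hint <= BATCH)
		return;				/* flush merely postponed */
	last_flushed = stat[0] + stat[1];	/* aggregation reads the real data */
	hint = 0;
}

int main(void)
{
	for (int i = 0; i < 200; i++) {
		update(i & 1, 1, i % 10 == 0);	/* drop every 10th hint update */
		maybe_flush();
	}
	/* The totals stay exact; only the moment of flushing shifted. */
	printf("real total = %ld, last flushed = %ld\n",
	       stat[0] + stat[1], last_flushed);
	return 0;
}
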
Sebastian Andrzej Siewior Feb. 21, 2022, 1:58 p.m. UTC | #5
On 2022-02-21 14:18:25 [+0100], Michal Koutný wrote:
> On Mon, Feb 21, 2022 at 12:31:17PM +0100, Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
> > What about memcg_rstat_updated()? It does:
> > 
> > |         x = __this_cpu_add_return(stats_updates, abs(val));
> > |         if (x > MEMCG_CHARGE_BATCH) {
> > |                 atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold);
> > |                 __this_cpu_write(stats_updates, 0);
> > |         }
> > 
> > The writes to stats_updates can happen from IRQ-context and with
> > disabled preemption only. So this is not good, right?
> 
> These counters serve as a hint for aggregating the per-CPU per-cgroup stats.
> If they were systematically mis-updated, it could manifest as a missing
> "refresh signal" from the given CPU. OTOH, this lagging is also
> meant to be limited by elapsed time thanks to periodic flushing.
> 
> This could affect the freshness of the stats, not their accuracy, though.

Oki. Then let me update the code as suggested and ignore this case since
it is nothing to worry about.

> HTH,
> Michal

Sebastian

Patch

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0b5117ed2ae08..36ab3660f2c6d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -630,6 +630,28 @@  static DEFINE_SPINLOCK(stats_flush_lock);
 static DEFINE_PER_CPU(unsigned int, stats_updates);
 static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
 
+/*
+ * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can
+ * not rely on this as part of an acquired spinlock_t lock. These functions are
+ * never used in hardirq context on PREEMPT_RT and therefore disabling preemption
+ * is sufficient.
+ */
+static void memcg_stats_lock(void)
+{
+#ifdef CONFIG_PREEMPT_RT
+      preempt_disable();
+#else
+      VM_BUG_ON(!irqs_disabled());
+#endif
+}
+
+static void memcg_stats_unlock(void)
+{
+#ifdef CONFIG_PREEMPT_RT
+      preempt_enable();
+#endif
+}
+
 static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
 {
 	unsigned int x;
@@ -706,6 +728,7 @@  void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
 	memcg = pn->memcg;
 
+	memcg_stats_lock();
 	/* Update memcg */
 	__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
 
@@ -713,6 +736,7 @@  void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 	__this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
 
 	memcg_rstat_updated(memcg, val);
+	memcg_stats_unlock();
 }
 
 /**
@@ -795,8 +819,10 @@  void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 	if (mem_cgroup_disabled())
 		return;
 
+	memcg_stats_lock();
 	__this_cpu_add(memcg->vmstats_percpu->events[idx], count);
 	memcg_rstat_updated(memcg, count);
+	memcg_stats_unlock();
 }
 
 static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
@@ -7140,8 +7166,9 @@  void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 	 * important here to have the interrupts disabled because it is the
 	 * only synchronisation we have for updating the per-CPU variables.
 	 */
-	VM_BUG_ON(!irqs_disabled());
+	memcg_stats_lock();
 	mem_cgroup_charge_statistics(memcg, -nr_entries);
+	memcg_stats_unlock();
 	memcg_check_events(memcg, page_to_nid(page));
 
 	css_put(&memcg->css);