
[v5,3/6] mm/memcg: Protect per-CPU counter by disabling preemption on PREEMPT_RT where needed.

Message ID 20220226204144.1008339-4-bigeasy@linutronix.de (mailing list archive)
State New
Series mm/memcg: Address PREEMPT_RT problems instead of disabling it.

Commit Message

Sebastian Andrzej Siewior Feb. 26, 2022, 8:41 p.m. UTC
The per-CPU counters are modified with non-atomic operations. The
consistency is ensured by disabling interrupts for the update.
On non-PREEMPT_RT configurations this works because acquiring a
spinlock_t typed lock with the _irq() suffix disables interrupts. On
PREEMPT_RT configurations the RMW operation can be interrupted.

Another problem is that mem_cgroup_swapout() expects to be invoked with
interrupts disabled because the caller has to acquire a spinlock_t,
which is acquired with interrupts disabled. Since spinlock_t never
disables interrupts on PREEMPT_RT, interrupts are never disabled at this
point.

The code is never called from in_irq() context on PREEMPT_RT, therefore
disabling preemption during the update is sufficient on PREEMPT_RT.
The sections which explicitly disable interrupts can remain on
PREEMPT_RT because they are short and don't involve sleeping locks
(memcg_check_events() does nothing on PREEMPT_RT).

Disable preemption during the update of the per-CPU variables which do
not explicitly disable interrupts.
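
For illustration only (not part of the patch): __this_cpu_add() is a
plain read-modify-write of the per-CPU counter, so an update on the
same CPU can be lost if the sequence is interleaved:

	Context A (CPU 0)		Context B (same CPU; interrupt on
					!PREEMPT_RT, preempting task on RT)
	x = __this_cpu_read(cnt);	/* x = 5 */
					__this_cpu_add(cnt, 1);	/* cnt = 6 */
	__this_cpu_write(cnt, x + 1);	/* cnt = 6, B's update is lost */

On !PREEMPT_RT the _irq() lock rules this out by disabling interrupts
(and preemption); on PREEMPT_RT it does not, hence the explicit
preempt_disable().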

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
---
 mm/memcontrol.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 55 insertions(+), 1 deletion(-)

Comments

Michal Hocko Feb. 28, 2022, 8:05 a.m. UTC | #1
On Sat 26-02-22 21:41:41, Sebastian Andrzej Siewior wrote:
> The per-CPU counters are modified with non-atomic operations. The
> consistency is ensured by disabling interrupts for the update.
> On non-PREEMPT_RT configurations this works because acquiring a
> spinlock_t typed lock with the _irq() suffix disables interrupts. On
> PREEMPT_RT configurations the RMW operation can be interrupted.
> 
> Another problem is that mem_cgroup_swapout() expects to be invoked with
> interrupts disabled because the caller has to acquire a spinlock_t,
> which is acquired with interrupts disabled. Since spinlock_t never
> disables interrupts on PREEMPT_RT, interrupts are never disabled at this
> point.
> 
> The code is never called from in_irq() context on PREEMPT_RT, therefore
> disabling preemption during the update is sufficient on PREEMPT_RT.
> The sections which explicitly disable interrupts can remain on
> PREEMPT_RT because they are short and don't involve sleeping locks
> (memcg_check_events() does nothing on PREEMPT_RT).
> 
> Disable preemption during the update of the per-CPU variables which do
> not explicitly disable interrupts.
> 
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> Acked-by: Roman Gushchin <guro@fb.com>
> Reviewed-by: Shakeel Butt <shakeelb@google.com>

Acked-by: Michal Hocko <mhocko@suse.com>

TBH I am not a fan of the counter special casing for the debugging
enabled warnings but I do not feel strongly enough to push you through an
additional version round.

Thanks!

> ---
>  mm/memcontrol.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 55 insertions(+), 1 deletion(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 0b5117ed2ae08..238ea77aade5d 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -630,6 +630,35 @@ static DEFINE_SPINLOCK(stats_flush_lock);
>  static DEFINE_PER_CPU(unsigned int, stats_updates);
>  static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
>  
> +/*
> + * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can
> + * not rely on this as part of an acquired spinlock_t lock. These functions are
> + * never used in hardirq context on PREEMPT_RT and therefore disabling preemption
> + * is sufficient.
> + */
> +static void memcg_stats_lock(void)
> +{
> +#ifdef CONFIG_PREEMPT_RT
> +      preempt_disable();
> +#else
> +      VM_BUG_ON(!irqs_disabled());
> +#endif
> +}
> +
> +static void __memcg_stats_lock(void)
> +{
> +#ifdef CONFIG_PREEMPT_RT
> +      preempt_disable();
> +#endif
> +}
> +
> +static void memcg_stats_unlock(void)
> +{
> +#ifdef CONFIG_PREEMPT_RT
> +      preempt_enable();
> +#endif
> +}
> +
>  static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
>  {
>  	unsigned int x;
> @@ -706,6 +735,27 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
>  	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
>  	memcg = pn->memcg;
>  
> +	/*
> +	 * The callers from rmap rely on disabled preemption because they never
> +	 * update their counters from in-interrupt context. For these counters
> +	 * we check that the update is never performed from an interrupt
> +	 * context while other callers need to have interrupts disabled.
> +	 */
> +	__memcg_stats_lock();
> +	if (IS_ENABLED(CONFIG_DEBUG_VM) && !IS_ENABLED(CONFIG_PREEMPT_RT)) {
> +		switch (idx) {
> +		case NR_ANON_MAPPED:
> +		case NR_FILE_MAPPED:
> +		case NR_ANON_THPS:
> +		case NR_SHMEM_PMDMAPPED:
> +		case NR_FILE_PMDMAPPED:
> +			WARN_ON_ONCE(!in_task());
> +			break;
> +		default:
> +			WARN_ON_ONCE(!irqs_disabled());
> +		}
> +	}
> +
>  	/* Update memcg */
>  	__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
>  
> @@ -713,6 +763,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
>  	__this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
>  
>  	memcg_rstat_updated(memcg, val);
> +	memcg_stats_unlock();
>  }
>  
>  /**
> @@ -795,8 +846,10 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
>  	if (mem_cgroup_disabled())
>  		return;
>  
> +	memcg_stats_lock();
>  	__this_cpu_add(memcg->vmstats_percpu->events[idx], count);
>  	memcg_rstat_updated(memcg, count);
> +	memcg_stats_unlock();
>  }
>  
>  static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
> @@ -7140,8 +7193,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
>  	 * important here to have the interrupts disabled because it is the
>  	 * only synchronisation we have for updating the per-CPU variables.
>  	 */
> -	VM_BUG_ON(!irqs_disabled());
> +	memcg_stats_lock();
>  	mem_cgroup_charge_statistics(memcg, -nr_entries);
> +	memcg_stats_unlock();
>  	memcg_check_events(memcg, page_to_nid(page));
>  
>  	css_put(&memcg->css);
> -- 
> 2.35.1
Sebastian Andrzej Siewior Feb. 28, 2022, 11:08 a.m. UTC | #2
On 2022-02-28 09:05:45 [+0100], Michal Hocko wrote:
> Acked-by: Michal Hocko <mhocko@suse.com>
> 
> TBH I am not a fan of the counter special casing for the debugging
> enabled warnings but I do not feel strongly enough to push you through an
> additional version round.

do you want to get rid of the warnings completely? Since we had the
check in memcg_stats_lock(), it kinda felt useful to add something in
the __memcg_stats_lock() case, too.

> Thanks!

Sebastian
Michal Hocko Feb. 28, 2022, 11:23 a.m. UTC | #3
On Mon 28-02-22 12:08:40, Sebastian Andrzej Siewior wrote:
> On 2022-02-28 09:05:45 [+0100], Michal Hocko wrote:
> > Acked-by: Michal Hocko <mhocko@suse.com>
> > 
> > TBH I am not a fan of the counter special casing for the debugging
> > enabled warnings but I do not feel strongly enough to push you through an
> > additional version round.
> 
> do you want to get rid of the warnings completely? Since we had the
> check in memcg_stats_lock(), it kinda felt useful to add something in
> the __memcg_stats_lock() case, too.

The thing that I dislike is that there is no other way for potential
users of those counters to know these expectations. This is not
documented anywhere, so it mostly describes the _current_ state of the
code, which might change in the future. We can be more thorough and
document the counters wrt. the context they can be used in, which would
make this less of a concern of course. But this can be done on top, hence
I do not want to push you for another version. It is good to have your
current work merged in the next merge window as long as there are no
fallouts, and for that it would be good to have it in linux-next and
exposed to some more testing rather than dealing with these details.
Sebastian Andrzej Siewior Feb. 28, 2022, 12:35 p.m. UTC | #4
On 2022-02-28 12:23:12 [+0100], Michal Hocko wrote:
> On Mon 28-02-22 12:08:40, Sebastian Andrzej Siewior wrote:
> > On 2022-02-28 09:05:45 [+0100], Michal Hocko wrote:
> > > Acked-by: Michal Hocko <mhocko@suse.com>
> > > 
> > > TBH I am not a fan of the counter special casing for the debugging
> > > enabled warnings but I do not feel strongly enough to push you through an
> > > additional version round.
> > 
> > do you want to get rid of the warnings completely? Since we had the
> > check in memcg_stats_lock(), it kinda felt useful to add something in
> > the __memcg_stats_lock() case, too.
> 
> The thing that I dislike is that there is no other way for potential
> users of those counters to know these expectations. This is not
> documented anywhere, so it mostly describes the _current_ state of the
> code, which might change in the future. We can be more thorough and
> document the counters wrt. the context they can be used in, which would
> make this less of a concern of course. But this can be done on top, hence
> I do not want to push you for another version. It is good to have your
> current work merged in the next merge window as long as there are no
> fallouts, and for that it would be good to have it in linux-next and
> exposed to some more testing rather than dealing with these details.

Okay. Then I would suggest that I document the usage context of these
counters once things have settled down. Right now there is no validation
at all, so if someone uses the wrong context for a counter and you don't
spot it during review, then there will be no warning at runtime.
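
(A hypothetical sketch, added for illustration and not part of the
thread, of what such per-counter documentation could look like; the
grouping simply mirrors the checks in __mod_memcg_lruvec_state() above:)

	/*
	 * Hypothetical example of documenting the expected update context:
	 *
	 * NR_ANON_MAPPED, NR_FILE_MAPPED, NR_ANON_THPS,
	 * NR_SHMEM_PMDMAPPED, NR_FILE_PMDMAPPED:
	 *	updated from task context only (rmap callers), relying on
	 *	disabled preemption rather than disabled interrupts.
	 *
	 * all other counters updated via __mod_memcg_lruvec_state():
	 *	updated with interrupts disabled (!PREEMPT_RT) or under
	 *	memcg_stats_lock() (PREEMPT_RT).
	 */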

Sebastian

Patch

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0b5117ed2ae08..238ea77aade5d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -630,6 +630,35 @@  static DEFINE_SPINLOCK(stats_flush_lock);
 static DEFINE_PER_CPU(unsigned int, stats_updates);
 static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
 
+/*
+ * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can
+ * not rely on this as part of an acquired spinlock_t lock. These functions are
+ * never used in hardirq context on PREEMPT_RT and therefore disabling preemption
+ * is sufficient.
+ */
+static void memcg_stats_lock(void)
+{
+#ifdef CONFIG_PREEMPT_RT
+      preempt_disable();
+#else
+      VM_BUG_ON(!irqs_disabled());
+#endif
+}
+
+static void __memcg_stats_lock(void)
+{
+#ifdef CONFIG_PREEMPT_RT
+      preempt_disable();
+#endif
+}
+
+static void memcg_stats_unlock(void)
+{
+#ifdef CONFIG_PREEMPT_RT
+      preempt_enable();
+#endif
+}
+
 static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
 {
 	unsigned int x;
@@ -706,6 +735,27 @@  void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
 	memcg = pn->memcg;
 
+	/*
+	 * The callers from rmap rely on disabled preemption because they never
+	 * update their counters from in-interrupt context. For these counters
+	 * we check that the update is never performed from an interrupt
+	 * context while other callers need to have interrupts disabled.
+	 */
+	__memcg_stats_lock();
+	if (IS_ENABLED(CONFIG_DEBUG_VM) && !IS_ENABLED(CONFIG_PREEMPT_RT)) {
+		switch (idx) {
+		case NR_ANON_MAPPED:
+		case NR_FILE_MAPPED:
+		case NR_ANON_THPS:
+		case NR_SHMEM_PMDMAPPED:
+		case NR_FILE_PMDMAPPED:
+			WARN_ON_ONCE(!in_task());
+			break;
+		default:
+			WARN_ON_ONCE(!irqs_disabled());
+		}
+	}
+
 	/* Update memcg */
 	__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
 
@@ -713,6 +763,7 @@  void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 	__this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
 
 	memcg_rstat_updated(memcg, val);
+	memcg_stats_unlock();
 }
 
 /**
@@ -795,8 +846,10 @@  void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 	if (mem_cgroup_disabled())
 		return;
 
+	memcg_stats_lock();
 	__this_cpu_add(memcg->vmstats_percpu->events[idx], count);
 	memcg_rstat_updated(memcg, count);
+	memcg_stats_unlock();
 }
 
 static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
@@ -7140,8 +7193,9 @@  void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 	 * important here to have the interrupts disabled because it is the
 	 * only synchronisation we have for updating the per-CPU variables.
 	 */
-	VM_BUG_ON(!irqs_disabled());
+	memcg_stats_lock();
 	mem_cgroup_charge_statistics(memcg, -nr_entries);
+	memcg_stats_unlock();
 	memcg_check_events(memcg, page_to_nid(page));
 
 	css_put(&memcg->css);