
[RFC,v2,2/8] mm: make zone_reclaim_stat updates thread-safe

Message ID 20180911004240.4758-3-daniel.m.jordan@oracle.com (mailing list archive)
State New, archived
Series lru_lock scalability and SMP list functions

Commit Message

Daniel Jordan Sept. 11, 2018, 12:42 a.m. UTC
lru_lock needs to be held to update the zone_reclaim_stat statistics.
As in the previous patch, this requirement arises naturally because
callers already hold lru_lock.

In preparation for allowing concurrent adds and removes from the LRU,
however, make concurrent updates to these statistics safe without
lru_lock.  The lock continues to be held until later in the series, when
it is replaced with an rwlock that also disables preemption, maintaining
the assumption in the comment above __update_page_reclaim_stat, which is
introduced here.

Use a combination of per-cpu counters and atomics.

Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
---
 include/linux/mmzone.h | 50 ++++++++++++++++++++++++++++++++++++++++++
 init/main.c            |  1 +
 mm/memcontrol.c        | 20 ++++++++---------
 mm/memory_hotplug.c    |  1 +
 mm/mmzone.c            | 14 ++++++++++++
 mm/swap.c              | 14 ++++++++----
 mm/vmscan.c            | 42 ++++++++++++++++++++---------------
 7 files changed, 110 insertions(+), 32 deletions(-)
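
The heart of the change is __update_page_reclaim_stat(): each CPU accumulates counts in a plain per-cpu counter and folds them into the shared atomic only once the local value exceeds RECLAIM_STAT_BATCH, so the shared cacheline is written roughly once per batch rather than on every update. Below is a minimal userspace sketch of that batching pattern; the demo_* names, and the use of a per-thread counter where the kernel uses a per-cpu counter with preemption disabled, are inventions of this illustration and not part of the patch.

#include <stdatomic.h>
#include <stdio.h>

#define DEMO_BATCH	32UL			/* mirrors RECLAIM_STAT_BATCH */

static atomic_long demo_total;			/* shared total, like the atomic_long_t in zone_reclaim_stat */
static _Thread_local unsigned long demo_local;	/* private counter, standing in for the per-cpu one */

static void demo_update(unsigned long count)
{
	unsigned long val = demo_local + count;

	if (val > DEMO_BATCH) {			/* spill the accumulated batch into the shared total */
		atomic_fetch_add(&demo_total, val);
		val = 0;
	}
	demo_local = val;			/* otherwise keep accumulating privately */
}

int main(void)
{
	for (int i = 0; i < 1000; i++)
		demo_update(1);

	/* 1000 updates spill in batches of 33 (30 * 33 = 990); the last 10 stay local. */
	printf("total=%ld pending=%lu\n", atomic_load(&demo_total), demo_local);
	return 0;
}

Readers of the shared total (get_scan_count() in the patch) therefore see only whole batches; up to RECLAIM_STAT_BATCH updates per CPU may still be pending locally and are not yet reflected in the atomic counters.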

Comments

Laurent Dufour Sept. 11, 2018, 4:40 p.m. UTC | #1
On 11/09/2018 02:42, Daniel Jordan wrote:
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 32699b2dc52a..6d4c23a3069d 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -229,6 +229,12 @@ struct zone_reclaim_stat {
>  	 *
>  	 * The anon LRU stats live in [0], file LRU stats in [1]
>  	 */
> +	atomic_long_t		recent_rotated[2];
> +	atomic_long_t		recent_scanned[2];

It might be better to use a slightly different name for these fields to
distinguish them from the ones in the zone_reclaim_stat_cpu structure.

Daniel Jordan Sept. 12, 2018, 1:30 p.m. UTC | #2
On 9/11/18 12:40 PM, Laurent Dufour wrote:
> On 11/09/2018 02:42, Daniel Jordan wrote:
>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>> index 32699b2dc52a..6d4c23a3069d 100644
>> --- a/include/linux/mmzone.h
>> +++ b/include/linux/mmzone.h
>> @@ -229,6 +229,12 @@ struct zone_reclaim_stat {
>>   	 *
>>   	 * The anon LRU stats live in [0], file LRU stats in [1]
>>   	 */
>> +	atomic_long_t		recent_rotated[2];
>> +	atomic_long_t		recent_scanned[2];
> 
> It might be better to use a slightly different name for these fields to
> distinguish them from the ones in the zone_reclaim_stat_cpu structure.

Sure, these are now named recent_rotated_cpu and recent_scanned_cpu, absent better names.

Thanks for your comments.

Patch

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 32699b2dc52a..6d4c23a3069d 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -229,6 +229,12 @@  struct zone_reclaim_stat {
 	 *
 	 * The anon LRU stats live in [0], file LRU stats in [1]
 	 */
+	atomic_long_t		recent_rotated[2];
+	atomic_long_t		recent_scanned[2];
+};
+
+/* These spill into the counters in struct zone_reclaim_stat beyond a cutoff. */
+struct zone_reclaim_stat_cpu {
 	unsigned long		recent_rotated[2];
 	unsigned long		recent_scanned[2];
 };
@@ -236,6 +242,7 @@  struct zone_reclaim_stat {
 struct lruvec {
 	struct list_head		lists[NR_LRU_LISTS];
 	struct zone_reclaim_stat	reclaim_stat;
+	struct zone_reclaim_stat_cpu __percpu *reclaim_stat_cpu;
 	/* Evictions & activations on the inactive file list */
 	atomic_long_t			inactive_age;
 	/* Refaults at the time of last reclaim cycle */
@@ -245,6 +252,47 @@  struct lruvec {
 #endif
 };
 
+#define	RECLAIM_STAT_BATCH	32U	/* From SWAP_CLUSTER_MAX */
+
+/*
+ * Callers of the below three functions that update reclaim stats must hold
+ * lru_lock and have preemption disabled.  Use percpu counters that spill into
+ * atomics to allow concurrent updates when multiple readers hold lru_lock.
+ */
+
+static inline void __update_page_reclaim_stat(unsigned long count,
+					      unsigned long *percpu_stat,
+					      atomic_long_t *stat)
+{
+	unsigned long val = *percpu_stat + count;
+
+	if (unlikely(val > RECLAIM_STAT_BATCH)) {
+		atomic_long_add(val, stat);
+		val = 0;
+	}
+	*percpu_stat = val;
+}
+
+static inline void update_reclaim_stat_scanned(struct lruvec *lruvec, int file,
+					       unsigned long count)
+{
+	struct zone_reclaim_stat_cpu __percpu *percpu_stat =
+					 this_cpu_ptr(lruvec->reclaim_stat_cpu);
+
+	__update_page_reclaim_stat(count, &percpu_stat->recent_scanned[file],
+				   &lruvec->reclaim_stat.recent_scanned[file]);
+}
+
+static inline void update_reclaim_stat_rotated(struct lruvec *lruvec, int file,
+					       unsigned long count)
+{
+	struct zone_reclaim_stat_cpu __percpu *percpu_stat =
+					 this_cpu_ptr(lruvec->reclaim_stat_cpu);
+
+	__update_page_reclaim_stat(count, &percpu_stat->recent_rotated[file],
+				   &lruvec->reclaim_stat.recent_rotated[file]);
+}
+
 /* Mask used at gathering information at once (see memcontrol.c) */
 #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
 #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
@@ -795,6 +843,8 @@  extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn
 				     unsigned long size);
 
 extern void lruvec_init(struct lruvec *lruvec);
+extern void lruvec_init_late(struct lruvec *lruvec);
+extern void lruvecs_init_late(void);
 
 static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
 {
diff --git a/init/main.c b/init/main.c
index 3b4ada11ed52..80ad02fe99de 100644
--- a/init/main.c
+++ b/init/main.c
@@ -526,6 +526,7 @@  static void __init mm_init(void)
 	init_espfix_bsp();
 	/* Should be run after espfix64 is set up. */
 	pti_init();
+	lruvecs_init_late();
 }
 
 asmlinkage __visible void __init start_kernel(void)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5463ad160e10..f7f9682482cd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3152,22 +3152,22 @@  static int memcg_stat_show(struct seq_file *m, void *v)
 		pg_data_t *pgdat;
 		struct mem_cgroup_per_node *mz;
 		struct zone_reclaim_stat *rstat;
-		unsigned long recent_rotated[2] = {0, 0};
-		unsigned long recent_scanned[2] = {0, 0};
+		unsigned long rota[2] = {0, 0};
+		unsigned long scan[2] = {0, 0};
 
 		for_each_online_pgdat(pgdat) {
 			mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
 			rstat = &mz->lruvec.reclaim_stat;
 
-			recent_rotated[0] += rstat->recent_rotated[0];
-			recent_rotated[1] += rstat->recent_rotated[1];
-			recent_scanned[0] += rstat->recent_scanned[0];
-			recent_scanned[1] += rstat->recent_scanned[1];
+			rota[0] += atomic_long_read(&rstat->recent_rotated[0]);
+			rota[1] += atomic_long_read(&rstat->recent_rotated[1]);
+			scan[0] += atomic_long_read(&rstat->recent_scanned[0]);
+			scan[1] += atomic_long_read(&rstat->recent_scanned[1]);
 		}
-		seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
-		seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
-		seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
-		seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
+		seq_printf(m, "recent_rotated_anon %lu\n", rota[0]);
+		seq_printf(m, "recent_rotated_file %lu\n", rota[1]);
+		seq_printf(m, "recent_scanned_anon %lu\n", scan[0]);
+		seq_printf(m, "recent_scanned_file %lu\n", scan[1]);
 	}
 #endif
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 25982467800b..d3ebb11c3f9f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1009,6 +1009,7 @@  static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
 	/* init node's zones as empty zones, we don't have any present pages.*/
 	free_area_init_node(nid, zones_size, start_pfn, zholes_size);
 	pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
+	lruvec_init_late(node_lruvec(pgdat));
 
 	/*
 	 * The node we allocated has no zone fallback lists. For avoiding
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 4686fdc23bb9..090cd4f7effb 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -9,6 +9,7 @@ 
 #include <linux/stddef.h>
 #include <linux/mm.h>
 #include <linux/mmzone.h>
+#include <linux/percpu.h>
 
 struct pglist_data *first_online_pgdat(void)
 {
@@ -96,6 +97,19 @@  void lruvec_init(struct lruvec *lruvec)
 		INIT_LIST_HEAD(&lruvec->lists[lru]);
 }
 
+void lruvec_init_late(struct lruvec *lruvec)
+{
+	lruvec->reclaim_stat_cpu = alloc_percpu(struct zone_reclaim_stat_cpu);
+}
+
+void lruvecs_init_late(void)
+{
+	pg_data_t *pgdat;
+
+	for_each_online_pgdat(pgdat)
+		lruvec_init_late(node_lruvec(pgdat));
+}
+
 #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
 int page_cpupid_xchg_last(struct page *page, int cpupid)
 {
diff --git a/mm/swap.c b/mm/swap.c
index 3dd518832096..219c234d632f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -34,6 +34,7 @@ 
 #include <linux/uio.h>
 #include <linux/hugetlb.h>
 #include <linux/page_idle.h>
+#include <linux/mmzone.h>
 
 #include "internal.h"
 
@@ -260,14 +261,19 @@  void rotate_reclaimable_page(struct page *page)
 	}
 }
 
+/*
+ * Updates page reclaim statistics using per-cpu counters that spill into
+ * atomics above a threshold.
+ *
+ * Assumes that the caller has disabled preemption.  IRQs may be enabled
+ * because this function is not called from irq context.
+ */
 static void update_page_reclaim_stat(struct lruvec *lruvec,
 				     int file, int rotated)
 {
-	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
-
-	reclaim_stat->recent_scanned[file]++;
+	update_reclaim_stat_scanned(lruvec, file, 1);
 	if (rotated)
-		reclaim_stat->recent_rotated[file]++;
+		update_reclaim_stat_rotated(lruvec, file, 1);
 }
 
 static void __activate_page(struct page *page, struct lruvec *lruvec,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9270a4370d54..730b6d0c6c61 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1655,7 +1655,6 @@  static int too_many_isolated(struct pglist_data *pgdat, int file,
 static noinline_for_stack void
 putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
 {
-	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 	LIST_HEAD(pages_to_free);
 
@@ -1684,7 +1683,7 @@  putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
 		if (is_active_lru(lru)) {
 			int file = is_file_lru(lru);
 			int numpages = hpage_nr_pages(page);
-			reclaim_stat->recent_rotated[file] += numpages;
+			update_reclaim_stat_rotated(lruvec, file, numpages);
 		}
 		if (put_page_testzero(page)) {
 			__ClearPageLRU(page);
@@ -1736,7 +1735,6 @@  shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	isolate_mode_t isolate_mode = 0;
 	int file = is_file_lru(lru);
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
-	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
 	bool stalled = false;
 
 	while (unlikely(too_many_isolated(pgdat, file, sc))) {
@@ -1763,7 +1761,7 @@  shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 				     &nr_scanned, sc, isolate_mode, lru);
 
 	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
-	reclaim_stat->recent_scanned[file] += nr_taken;
+	update_reclaim_stat_scanned(lruvec, file, nr_taken);
 
 	if (current_is_kswapd()) {
 		if (global_reclaim(sc))
@@ -1914,7 +1912,6 @@  static void shrink_active_list(unsigned long nr_to_scan,
 	LIST_HEAD(l_active);
 	LIST_HEAD(l_inactive);
 	struct page *page;
-	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
 	unsigned nr_deactivate, nr_activate;
 	unsigned nr_rotated = 0;
 	isolate_mode_t isolate_mode = 0;
@@ -1932,7 +1929,7 @@  static void shrink_active_list(unsigned long nr_to_scan,
 				     &nr_scanned, sc, isolate_mode, lru);
 
 	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
-	reclaim_stat->recent_scanned[file] += nr_taken;
+	update_reclaim_stat_scanned(lruvec, file, nr_taken);
 
 	__count_vm_events(PGREFILL, nr_scanned);
 	count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
@@ -1989,7 +1986,7 @@  static void shrink_active_list(unsigned long nr_to_scan,
 	 * helps balance scan pressure between file and anonymous pages in
 	 * get_scan_count.
 	 */
-	reclaim_stat->recent_rotated[file] += nr_rotated;
+	update_reclaim_stat_rotated(lruvec, file, nr_rotated);
 
 	nr_activate = move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
 	nr_deactivate = move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
@@ -2116,7 +2113,7 @@  static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 			   unsigned long *lru_pages)
 {
 	int swappiness = mem_cgroup_swappiness(memcg);
-	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
+	struct zone_reclaim_stat *rstat = &lruvec->reclaim_stat;
 	u64 fraction[2];
 	u64 denominator = 0;	/* gcc */
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
@@ -2125,6 +2122,7 @@  static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	unsigned long anon, file;
 	unsigned long ap, fp;
 	enum lru_list lru;
+	long recent_scanned[2], recent_rotated[2];
 
 	/* If we have no swap space, do not bother scanning anon pages. */
 	if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
@@ -2238,14 +2236,22 @@  static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 		lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);
 
 	spin_lock_irq(&pgdat->lru_lock);
-	if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
-		reclaim_stat->recent_scanned[0] /= 2;
-		reclaim_stat->recent_rotated[0] /= 2;
+	recent_scanned[0] = atomic_long_read(&rstat->recent_scanned[0]);
+	recent_rotated[0] = atomic_long_read(&rstat->recent_rotated[0]);
+	if (unlikely(recent_scanned[0] > anon / 4)) {
+		recent_scanned[0] /= 2;
+		recent_rotated[0] /= 2;
+		atomic_long_set(&rstat->recent_scanned[0], recent_scanned[0]);
+		atomic_long_set(&rstat->recent_rotated[0], recent_rotated[0]);
 	}
 
-	if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
-		reclaim_stat->recent_scanned[1] /= 2;
-		reclaim_stat->recent_rotated[1] /= 2;
+	recent_scanned[1] = atomic_long_read(&rstat->recent_scanned[1]);
+	recent_rotated[1] = atomic_long_read(&rstat->recent_rotated[1]);
+	if (unlikely(recent_scanned[1] > file / 4)) {
+		recent_scanned[1] /= 2;
+		recent_rotated[1] /= 2;
+		atomic_long_set(&rstat->recent_scanned[1], recent_scanned[1]);
+		atomic_long_set(&rstat->recent_rotated[1], recent_rotated[1]);
 	}
 
 	/*
@@ -2253,11 +2259,11 @@  static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	 * proportional to the fraction of recently scanned pages on
 	 * each list that were recently referenced and in active use.
 	 */
-	ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
-	ap /= reclaim_stat->recent_rotated[0] + 1;
+	ap = anon_prio * (recent_scanned[0] + 1);
+	ap /= recent_rotated[0] + 1;
 
-	fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
-	fp /= reclaim_stat->recent_rotated[1] + 1;
+	fp = file_prio * (recent_scanned[1] + 1);
+	fp /= recent_rotated[1] + 1;
 	spin_unlock_irq(&pgdat->lru_lock);
 
 	fraction[0] = ap;