@@ -105,14 +105,6 @@ struct mem_cgroup_reclaim_iter {
unsigned int generation;
};
-struct lruvec_stat {
- long count[NR_VM_NODE_STAT_ITEMS];
-};
-
-struct batched_lruvec_stat {
- s32 count[NR_VM_NODE_STAT_ITEMS];
-};
-
/*
* Bitmap and deferred work of shrinker::id corresponding to memcg-aware
* shrinkers, which have elements charged to this memcg.
@@ -123,24 +115,30 @@ struct shrinker_info {
unsigned long *map;
};
+struct lruvec_stats_percpu {
+ /* Local (CPU and cgroup) state */
+ long state[NR_VM_NODE_STAT_ITEMS];
+
+ /* Delta calculation for lockless upward propagation */
+ long state_prev[NR_VM_NODE_STAT_ITEMS];
+};
+
+struct lruvec_stats {
+ /* Aggregated (CPU and subtree) state */
+ long state[NR_VM_NODE_STAT_ITEMS];
+
+ /* Pending child counts during tree propagation */
+ long state_pending[NR_VM_NODE_STAT_ITEMS];
+};
+
/*
* per-node information in memory controller.
*/
struct mem_cgroup_per_node {
struct lruvec lruvec;
- /*
- * Legacy local VM stats. This should be struct lruvec_stat and
- * cannot be optimized to struct batched_lruvec_stat. Because
- * the threshold of the lruvec_stat_cpu can be as big as
- * MEMCG_CHARGE_BATCH * PAGE_SIZE. It can fit into s32. But this
- * filed has no upper limit.
- */
- struct lruvec_stat __percpu *lruvec_stat_local;
-
- /* Subtree VM stats (batched updates) */
- struct batched_lruvec_stat __percpu *lruvec_stat_cpu;
- atomic_long_t lruvec_stat[NR_VM_NODE_STAT_ITEMS];
+ struct lruvec_stats_percpu __percpu *lruvec_stats_percpu;
+ struct lruvec_stats lruvec_stats;
unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
@@ -997,7 +995,7 @@ static inline unsigned long lruvec_page_
return node_page_state(lruvec_pgdat(lruvec), idx);
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
- x = atomic_long_read(&pn->lruvec_stat[idx]);
+ x = READ_ONCE(pn->lruvec_stats.state[idx]);
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
@@ -1017,7 +1015,7 @@ static inline unsigned long lruvec_page_
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
for_each_possible_cpu(cpu)
- x += per_cpu(pn->lruvec_stat_local->count[idx], cpu);
+ x += per_cpu(pn->lruvec_stats_percpu->state[idx], cpu);
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
@@ -660,23 +660,11 @@ static unsigned long memcg_page_state_lo
return x;
}
-static struct mem_cgroup_per_node *
-parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
-{
- struct mem_cgroup *parent;
-
- parent = parent_mem_cgroup(pn->memcg);
- if (!parent)
- return NULL;
- return parent->nodeinfo[nid];
-}
-
void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
int val)
{
struct mem_cgroup_per_node *pn;
struct mem_cgroup *memcg;
- long x, threshold = MEMCG_CHARGE_BATCH;
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
memcg = pn->memcg;
@@ -685,21 +673,7 @@ void __mod_memcg_lruvec_state(struct lru
__mod_memcg_state(memcg, idx, val);
/* Update lruvec */
- __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
-
- if (vmstat_item_in_bytes(idx))
- threshold <<= PAGE_SHIFT;
-
- x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
- if (unlikely(abs(x) > threshold)) {
- pg_data_t *pgdat = lruvec_pgdat(lruvec);
- struct mem_cgroup_per_node *pi;
-
- for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
- atomic_long_add(x, &pi->lruvec_stat[idx]);
- x = 0;
- }
- __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
+ __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
}
/**
@@ -2278,40 +2252,13 @@ static void drain_all_stock(struct mem_c
mutex_unlock(&percpu_charge_mutex);
}
-static void memcg_flush_lruvec_page_state(struct mem_cgroup *memcg, int cpu)
-{
- int nid;
-
- for_each_node(nid) {
- struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
- unsigned long stat[NR_VM_NODE_STAT_ITEMS];
- struct batched_lruvec_stat *lstatc;
- int i;
-
- lstatc = per_cpu_ptr(pn->lruvec_stat_cpu, cpu);
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
- stat[i] = lstatc->count[i];
- lstatc->count[i] = 0;
- }
-
- do {
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
- atomic_long_add(stat[i], &pn->lruvec_stat[i]);
- } while ((pn = parent_nodeinfo(pn, nid)));
- }
-}
-
static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
struct memcg_stock_pcp *stock;
- struct mem_cgroup *memcg;
stock = &per_cpu(memcg_stock, cpu);
drain_stock(stock);
- for_each_mem_cgroup(memcg)
- memcg_flush_lruvec_page_state(memcg, cpu);
-
return 0;
}
@@ -5118,17 +5065,9 @@ static int alloc_mem_cgroup_per_node_inf
if (!pn)
return 1;
- pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat,
- GFP_KERNEL_ACCOUNT);
- if (!pn->lruvec_stat_local) {
- kfree(pn);
- return 1;
- }
-
- pn->lruvec_stat_cpu = alloc_percpu_gfp(struct batched_lruvec_stat,
- GFP_KERNEL_ACCOUNT);
- if (!pn->lruvec_stat_cpu) {
- free_percpu(pn->lruvec_stat_local);
+ pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu,
+ GFP_KERNEL_ACCOUNT);
+ if (!pn->lruvec_stats_percpu) {
kfree(pn);
return 1;
}
@@ -5149,8 +5088,7 @@ static void free_mem_cgroup_per_node_inf
if (!pn)
return;
- free_percpu(pn->lruvec_stat_cpu);
- free_percpu(pn->lruvec_stat_local);
+ free_percpu(pn->lruvec_stats_percpu);
kfree(pn);
}
@@ -5166,15 +5104,7 @@ static void __mem_cgroup_free(struct mem
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
- int cpu;
-
memcg_wb_domain_exit(memcg);
- /*
- * Flush percpu lruvec stats to guarantee the value
- * correctness on parent's and all ancestor levels.
- */
- for_each_online_cpu(cpu)
- memcg_flush_lruvec_page_state(memcg, cpu);
__mem_cgroup_free(memcg);
}
@@ -5407,7 +5337,7 @@ static void mem_cgroup_css_rstat_flush(s
struct mem_cgroup *parent = parent_mem_cgroup(memcg);
struct memcg_vmstats_percpu *statc;
long delta, v;
- int i;
+ int i, nid;
statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
@@ -5455,6 +5385,36 @@ static void mem_cgroup_css_rstat_flush(s
if (parent)
parent->vmstats.events_pending[i] += delta;
}
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+ struct mem_cgroup_per_node *ppn = NULL;
+ struct lruvec_stats_percpu *lstatc;
+
+ if (parent)
+ ppn = parent->nodeinfo[nid];
+
+ lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
+
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+ delta = pn->lruvec_stats.state_pending[i];
+ if (delta)
+ pn->lruvec_stats.state_pending[i] = 0;
+
+ v = READ_ONCE(lstatc->state[i]);
+ if (v != lstatc->state_prev[i]) {
+ delta += v - lstatc->state_prev[i];
+ lstatc->state_prev[i] = v;
+ }
+
+ if (!delta)
+ continue;
+
+ pn->lruvec_stats.state[i] += delta;
+ if (ppn)
+ ppn->lruvec_stats.state_pending[i] += delta;
+ }
+ }
}
#ifdef CONFIG_MMU
@@ -6388,6 +6348,8 @@ static int memory_numa_stat_show(struct
int i;
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+ cgroup_rstat_flush(memcg->css.cgroup);
+
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
int nid;