@@ -1333,11 +1333,14 @@ The following nested keys are defined.
all the existing limitations and potential future extensions.
memory.peak
- A read-only single value file which exists on non-root
- cgroups.
+ A read-write single value file which exists on non-root cgroups.
+
+ The max memory usage recorded for the cgroup and its descendants since
+ either the creation of the cgroup or the most recent reset for that FD.
- The max memory usage recorded for the cgroup and its
- descendants since the creation of the cgroup.
+ A write of any non-empty string to this file resets it to the
+ current memory usage for subsequent reads through the same
+ file descriptor.
memory.oom.group
A read-write single value file which exists on non-root
@@ -1663,11 +1666,14 @@ The following nested keys are defined.
Healthy workloads are not expected to reach this limit.
memory.swap.peak
- A read-only single value file which exists on non-root
- cgroups.
+ A read-write single value file which exists on non-root cgroups.
+
+ The max swap usage recorded for the cgroup and its descendants since
+ the creation of the cgroup or the most recent reset for that FD.
- The max swap usage recorded for the cgroup and its
- descendants since the creation of the cgroup.
+ A write of any non-empty string to this file resets it to the
+ current memory usage for subsequent reads through the same
+ file descriptor.
memory.swap.max
A read-write single value file which exists on non-root
@@ -775,6 +775,11 @@ struct cgroup_subsys {
extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+struct cgroup_of_peak {
+ unsigned long value;
+ struct list_head list;
+};
+
/**
* cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups
* @tsk: target task
@@ -11,6 +11,7 @@
#include <linux/sched.h>
#include <linux/nodemask.h>
+#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/cgroupstats.h>
#include <linux/fs.h>
@@ -854,4 +855,6 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id);
+struct cgroup_of_peak *of_peak(struct kernfs_open_file *of);
+
#endif /* _LINUX_CGROUP_H */
@@ -193,6 +193,11 @@ struct mem_cgroup {
struct page_counter memsw; /* v1 only */
};
+ /* registered local peak watchers */
+ struct list_head memory_peaks;
+ struct list_head swap_peaks;
+ spinlock_t peaks_lock;
+
/* Range enforcement for interrupt charges */
struct work_struct high_work;
@@ -26,6 +26,8 @@ struct page_counter {
atomic_long_t children_low_usage;
unsigned long watermark;
+ /* Latest cg2 reset watermark */
+ unsigned long local_watermark;
unsigned long failcnt;
/* Keep all the read most fields in a separete cacheline. */
@@ -78,7 +80,14 @@ int page_counter_memparse(const char *buf, const char *max,
static inline void page_counter_reset_watermark(struct page_counter *counter)
{
- counter->watermark = page_counter_read(counter);
+ unsigned long usage = page_counter_read(counter);
+
+ /*
+ * Update local_watermark first, so it's always <= watermark
+ * (modulo CPU/compiler re-ordering)
+ */
+ counter->local_watermark = usage;
+ counter->watermark = usage;
}
void page_counter_calculate_protection(struct page_counter *root,
@@ -81,6 +81,8 @@ struct cgroup_file_ctx {
struct {
struct cgroup_pidlist *pidlist;
} procs1;
+
+ struct cgroup_of_peak peak;
};
/*
@@ -1972,6 +1972,13 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
return -EINVAL;
}
+struct cgroup_of_peak *of_peak(struct kernfs_open_file *of)
+{
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ return &ctx->peak;
+}
+
static void apply_cgroup_root_flags(unsigned int root_flags)
{
if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
@@ -25,6 +25,7 @@
* Copyright (C) 2020 Alibaba, Inc, Alex Shi
*/
+#include <linux/cgroup-defs.h>
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
@@ -41,6 +42,7 @@
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
+#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
@@ -3558,6 +3560,9 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
INIT_WORK(&memcg->high_work, high_work_func);
vmpressure_init(&memcg->vmpressure);
+ INIT_LIST_HEAD(&memcg->memory_peaks);
+ INIT_LIST_HEAD(&memcg->swap_peaks);
+ spin_lock_init(&memcg->peaks_lock);
memcg->socket_pressure = jiffies;
memcg1_memcg_init(memcg);
memcg->kmemcg_id = -1;
@@ -3950,14 +3955,91 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
}
-static u64 memory_peak_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
+#define OFP_PEAK_UNSET (((-1UL)))
+
+static int peak_show(struct seq_file *sf, void *v, struct page_counter *pc)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct cgroup_of_peak *ofp = of_peak(sf->private);
+ u64 fd_peak = READ_ONCE(ofp->value), peak;
+
+ /* User wants global or local peak? */
+ if (fd_peak == OFP_PEAK_UNSET)
+ peak = pc->watermark;
+ else
+ peak = max(fd_peak, READ_ONCE(pc->local_watermark));
+
+ seq_printf(sf, "%llu\n", peak * PAGE_SIZE);
+ return 0;
+}
+
+static int memory_peak_show(struct seq_file *sf, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
+
+ return peak_show(sf, v, &memcg->memory);
+}
+
+static int peak_open(struct kernfs_open_file *of)
+{
+ struct cgroup_of_peak *ofp = of_peak(of);
+
+ ofp->value = OFP_PEAK_UNSET;
+ return 0;
+}
+
+static void peak_release(struct kernfs_open_file *of)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ struct cgroup_of_peak *ofp = of_peak(of);
+
+ if (ofp->value == OFP_PEAK_UNSET) {
+ /* fast path (no writes on this fd) */
+ return;
+ }
+ spin_lock(&memcg->peaks_lock);
+ list_del(&ofp->list);
+ spin_unlock(&memcg->peaks_lock);
+}
+
+static ssize_t peak_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
+ loff_t off, struct page_counter *pc,
+ struct list_head *watchers)
+{
+ unsigned long usage;
+ struct cgroup_of_peak *peer_ctx;
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ struct cgroup_of_peak *ofp = of_peak(of);
+
+ spin_lock(&memcg->peaks_lock);
+
+ usage = page_counter_read(pc);
+ WRITE_ONCE(pc->local_watermark, usage);
+
+ list_for_each_entry(peer_ctx, watchers, list)
+ if (usage > peer_ctx->value)
+ WRITE_ONCE(peer_ctx->value, usage);
+
+ /* initial write, register watcher */
+ if (ofp->value == -1)
+ list_add(&ofp->list, watchers);
+
+ WRITE_ONCE(ofp->value, usage);
+ spin_unlock(&memcg->peaks_lock);
+
+ return nbytes;
+}
+
+static ssize_t memory_peak_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
- return (u64)memcg->memory.watermark * PAGE_SIZE;
+ return peak_write(of, buf, nbytes, off, &memcg->memory,
+ &memcg->memory_peaks);
}
+#undef OFP_PEAK_UNSET
+
static int memory_min_show(struct seq_file *m, void *v)
{
return seq_puts_memcg_tunable(m,
@@ -4307,7 +4389,10 @@ static struct cftype memory_files[] = {
{
.name = "peak",
.flags = CFTYPE_NOT_ON_ROOT,
- .read_u64 = memory_peak_read,
+ .open = peak_open,
+ .release = peak_release,
+ .seq_show = memory_peak_show,
+ .write = memory_peak_write,
},
{
.name = "min",
@@ -5099,12 +5184,20 @@ static u64 swap_current_read(struct cgroup_subsys_state *css,
return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
}
-static u64 swap_peak_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
+static int swap_peak_show(struct seq_file *sf, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
+
+ return peak_show(sf, v, &memcg->swap);
+}
+
+static ssize_t swap_peak_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
- return (u64)memcg->swap.watermark * PAGE_SIZE;
+ return peak_write(of, buf, nbytes, off, &memcg->swap,
+ &memcg->swap_peaks);
}
static int swap_high_show(struct seq_file *m, void *v)
@@ -5188,7 +5281,10 @@ static struct cftype swap_files[] = {
{
.name = "swap.peak",
.flags = CFTYPE_NOT_ON_ROOT,
- .read_u64 = swap_peak_read,
+ .open = peak_open,
+ .release = peak_release,
+ .seq_show = swap_peak_show,
+ .write = swap_peak_write,
},
{
.name = "swap.events",
@@ -79,9 +79,22 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
/*
* This is indeed racy, but we can live with some
* inaccuracy in the watermark.
+ *
+ * Notably, we have two watermarks to allow for both a globally
+ * visible peak and one that can be reset at a smaller scope.
+ *
+ * Since we reset both watermarks when the global reset occurs,
+ * we can guarantee that watermark >= local_watermark, so we
+ * don't need to do both comparisons every time.
+ *
+ * On systems with branch predictors, the inner condition should
+ * be almost free.
*/
- if (new > READ_ONCE(c->watermark))
- WRITE_ONCE(c->watermark, new);
+ if (new > READ_ONCE(c->local_watermark)) {
+ WRITE_ONCE(c->local_watermark, new);
+ if (new > READ_ONCE(c->watermark))
+ WRITE_ONCE(c->watermark, new);
+ }
}
}
@@ -129,12 +142,13 @@ bool page_counter_try_charge(struct page_counter *counter,
goto failed;
}
propagate_protected_usage(c, new);
- /*
- * Just like with failcnt, we can live with some
- * inaccuracy in the watermark.
- */
- if (new > READ_ONCE(c->watermark))
- WRITE_ONCE(c->watermark, new);
+
+ /* see comment on page_counter_charge */
+ if (new > READ_ONCE(c->local_watermark)) {
+ WRITE_ONCE(c->local_watermark, new);
+ if (new > READ_ONCE(c->watermark))
+ WRITE_ONCE(c->watermark, new);
+ }
}
return true;