diff mbox series

[RFC,2/3] proc/kpagecgroup: report also inode numbers of offline cgroups

Message ID 153414348994.737150.10057219558779418929.stgit@buzz (mailing list archive)
State New, archived
Headers show
Series [RFC,1/3] cgroup: list all subsystem states in debugfs files | expand

Commit Message

Konstantin Khlebnikov Aug. 13, 2018, 6:58 a.m. UTC
By default this interface reports inode number of closest online ancestor
if cgroups is offline (removed). Information about real owner is required
for detecting which pages keep removed cgroup.

This patch adds per-file mode which is changed by writing 64-bit flags
into opened /proc/kpagecgroup. For now only first bit is used.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
---
 Documentation/admin-guide/mm/pagemap.rst |    3 +++
 fs/proc/page.c                           |   24 ++++++++++++++++++++++--
 include/linux/memcontrol.h               |    2 +-
 mm/memcontrol.c                          |    5 +++--
 mm/memory-failure.c                      |    2 +-
 5 files changed, 30 insertions(+), 6 deletions(-)

Comments

Tejun Heo Aug. 22, 2018, 2:58 p.m. UTC | #1
Hello,

On Mon, Aug 13, 2018 at 09:58:10AM +0300, Konstantin Khlebnikov wrote:
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 19a4348974a4..7ef6ea9d5e4a 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -333,6 +333,7 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
>  /**
>   * page_cgroup_ino - return inode number of the memcg a page is charged to
>   * @page: the page
> + * @online: return closest online ancestor
>   *
>   * Look up the closest online ancestor of the memory cgroup @page is charged to
>   * and return its inode number or 0 if @page is not charged to any cgroup. It
> @@ -343,14 +344,14 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
>   * after page_cgroup_ino() returns, so it only should be used by callers that
>   * do not care (such as procfs interfaces).
>   */
> -ino_t page_cgroup_ino(struct page *page)
> +ino_t page_cgroup_ino(struct page *page, bool online)
>  {
>  	struct mem_cgroup *memcg;
>  	unsigned long ino = 0;
>  
>  	rcu_read_lock();
>  	memcg = READ_ONCE(page->mem_cgroup);
> -	while (memcg && !(memcg->css.flags & CSS_ONLINE))
> +	while (memcg && online && !(memcg->css.flags & CSS_ONLINE))
>  		memcg = parent_mem_cgroup(memcg);
>  	if (memcg)
>  		ino = cgroup_ino(memcg->css.cgroup);

We pin the ino till the cgroup is actually released now but that's an
implementation detail which may change in the future, so I'm not sure
this is a good idea.  Can you instead use the 64bit filehandle exposed
by kernfs?  That's currently also based on ino (+gen) but it's
something guarnateed to stay unique per cgroup and you can easily get
to the cgroup using the fh too.

Thanks.
diff mbox series

Patch

diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst
index 577af85beb41..b39d841ac560 100644
--- a/Documentation/admin-guide/mm/pagemap.rst
+++ b/Documentation/admin-guide/mm/pagemap.rst
@@ -80,6 +80,9 @@  There are four components to pagemap:
    memory cgroup each page is charged to, indexed by PFN. Only available when
    CONFIG_MEMCG is set.
 
+   For offline (removed) cgroup this returnes inode number of closest online
+   ancestor. Write 64-bit flag 1 into opened file for getting real owners.
+
 Short descriptions to the page flags
 ====================================
 
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 792c78a49174..337f526fcc27 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -248,6 +248,7 @@  static const struct file_operations proc_kpageflags_operations = {
 static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
 				size_t count, loff_t *ppos)
 {
+	unsigned long flags = (unsigned long)file->private_data;
 	u64 __user *out = (u64 __user *)buf;
 	struct page *ppage;
 	unsigned long src = *ppos;
@@ -267,7 +268,7 @@  static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
 			ppage = NULL;
 
 		if (ppage)
-			ino = page_cgroup_ino(ppage);
+			ino = page_cgroup_ino(ppage, !(flags & 1));
 		else
 			ino = 0;
 
@@ -289,9 +290,28 @@  static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
 	return ret;
 }
 
+static ssize_t kpagecgroup_write(struct file *file, const char __user *buf,
+				 size_t count, loff_t *ppos)
+{
+	u64 flags;
+
+	if (count != 8)
+		return -EINVAL;
+
+	if (get_user(flags, buf))
+		return -EFAULT;
+
+	if (flags > 1)
+		return -EINVAL;
+
+	file->private_data = (void *)(unsigned long)flags;
+	return count;
+}
+
 static const struct file_operations proc_kpagecgroup_operations = {
 	.llseek = mem_lseek,
 	.read = kpagecgroup_read,
+	.write = kpagecgroup_write,
 };
 #endif /* CONFIG_MEMCG */
 
@@ -300,7 +320,7 @@  static int __init proc_page_init(void)
 	proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations);
 	proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
 #ifdef CONFIG_MEMCG
-	proc_create("kpagecgroup", S_IRUSR, NULL, &proc_kpagecgroup_operations);
+	proc_create("kpagecgroup", 0600, NULL, &proc_kpagecgroup_operations);
 #endif
 	return 0;
 }
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6c6fb116e925..a7c40522bef0 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -444,7 +444,7 @@  static inline bool mm_match_cgroup(struct mm_struct *mm,
 }
 
 struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
-ino_t page_cgroup_ino(struct page *page);
+ino_t page_cgroup_ino(struct page *page, bool online);
 
 static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 19a4348974a4..7ef6ea9d5e4a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -333,6 +333,7 @@  struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
 /**
  * page_cgroup_ino - return inode number of the memcg a page is charged to
  * @page: the page
+ * @online: return closest online ancestor
  *
  * Look up the closest online ancestor of the memory cgroup @page is charged to
  * and return its inode number or 0 if @page is not charged to any cgroup. It
@@ -343,14 +344,14 @@  struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
  * after page_cgroup_ino() returns, so it only should be used by callers that
  * do not care (such as procfs interfaces).
  */
-ino_t page_cgroup_ino(struct page *page)
+ino_t page_cgroup_ino(struct page *page, bool online)
 {
 	struct mem_cgroup *memcg;
 	unsigned long ino = 0;
 
 	rcu_read_lock();
 	memcg = READ_ONCE(page->mem_cgroup);
-	while (memcg && !(memcg->css.flags & CSS_ONLINE))
+	while (memcg && online && !(memcg->css.flags & CSS_ONLINE))
 		memcg = parent_mem_cgroup(memcg);
 	if (memcg)
 		ino = cgroup_ino(memcg->css.cgroup);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9d142b9b86dc..bd09c447e0ec 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -139,7 +139,7 @@  static int hwpoison_filter_task(struct page *p)
 	if (!hwpoison_filter_memcg)
 		return 0;
 
-	if (page_cgroup_ino(p) != hwpoison_filter_memcg)
+	if (page_cgroup_ino(p, true) != hwpoison_filter_memcg)
 		return -EINVAL;
 
 	return 0;