diff mbox

[RESEND,7/8] pipe: account to kmemcg

Message ID 2c2545563b6201f118946f96dd8cfc90e564aff6.1464079538.git.vdavydov@virtuozzo.com (mailing list archive)
State New, archived
Headers show

Commit Message

Vladimir Davydov May 24, 2016, 8:49 a.m. UTC
Pipes can consume a significant amount of system memory, hence they
should be accounted to kmemcg.

This patch marks pipe_inode_info and anonymous pipe buffer page
allocations as __GFP_ACCOUNT so that they would be charged to kmemcg.
Note, since a pipe buffer page can be "stolen" and get reused for other
purposes, including mapping to userspace, we clear PageKmemcg thus
resetting page->_mapcount and uncharge it in anon_pipe_buf_steal, which
is introduced by this patch.

Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
---
 fs/pipe.c | 32 ++++++++++++++++++++++++++------
 1 file changed, 26 insertions(+), 6 deletions(-)

Comments

Eric Dumazet May 24, 2016, 12:59 p.m. UTC | #1
On Tue, 2016-05-24 at 11:49 +0300, Vladimir Davydov wrote:
> Pipes can consume a significant amount of system memory, hence they
> should be accounted to kmemcg.
> 
> This patch marks pipe_inode_info and anonymous pipe buffer page
> allocations as __GFP_ACCOUNT so that they would be charged to kmemcg.
> Note, since a pipe buffer page can be "stolen" and get reused for other
> purposes, including mapping to userspace, we clear PageKmemcg thus
> resetting page->_mapcount and uncharge it in anon_pipe_buf_steal, which
> is introduced by this patch.
> 
> Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> ---
>  fs/pipe.c | 32 ++++++++++++++++++++++++++------
>  1 file changed, 26 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/pipe.c b/fs/pipe.c
> index 0d3f5165cb0b..4b32928f5426 100644
> --- a/fs/pipe.c
> +++ b/fs/pipe.c
> @@ -21,6 +21,7 @@
>  #include <linux/audit.h>
>  #include <linux/syscalls.h>
>  #include <linux/fcntl.h>
> +#include <linux/memcontrol.h>
>  
>  #include <asm/uaccess.h>
>  #include <asm/ioctls.h>
> @@ -137,6 +138,22 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
>  		put_page(page);
>  }
>  
> +static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
> +			       struct pipe_buffer *buf)
> +{
> +	struct page *page = buf->page;
> +
> +	if (page_count(page) == 1) {

This looks racy : some cpu could have temporarily elevated page count.

> +		if (memcg_kmem_enabled()) {
> +			memcg_kmem_uncharge(page, 0);
> +			__ClearPageKmemcg(page);
> +		}
> +		__SetPageLocked(page);
> +		return 0;
> +	}
> +	return 1;
> +}
> +




--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vladimir Davydov May 24, 2016, 4:13 p.m. UTC | #2
On Tue, May 24, 2016 at 05:59:02AM -0700, Eric Dumazet wrote:
...
> > +static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
> > +			       struct pipe_buffer *buf)
> > +{
> > +	struct page *page = buf->page;
> > +
> > +	if (page_count(page) == 1) {
> 
> This looks racy : some cpu could have temporarily elevated page count.

All pipe operations (pipe_buf_operations->get, ->release, ->steal) are
supposed to be called under pipe_lock. So, if we see a pipe_buffer->page
with refcount of 1 in ->steal, that means that we are the only its user
and it can't be spliced to another pipe.

In fact, I just copied the code from generic_pipe_buf_steal, adding
kmemcg related checks along the way, so it should be fine.

Thanks,
Vladimir

> 
> > +		if (memcg_kmem_enabled()) {
> > +			memcg_kmem_uncharge(page, 0);
> > +			__ClearPageKmemcg(page);
> > +		}
> > +		__SetPageLocked(page);
> > +		return 0;
> > +	}
> > +	return 1;
> > +}
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet May 24, 2016, 8:04 p.m. UTC | #3
On Tue, 2016-05-24 at 19:13 +0300, Vladimir Davydov wrote:
> On Tue, May 24, 2016 at 05:59:02AM -0700, Eric Dumazet wrote:
> ...
> > > +static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
> > > +			       struct pipe_buffer *buf)
> > > +{
> > > +	struct page *page = buf->page;
> > > +
> > > +	if (page_count(page) == 1) {
> > 
> > This looks racy : some cpu could have temporarily elevated page count.
> 
> All pipe operations (pipe_buf_operations->get, ->release, ->steal) are
> supposed to be called under pipe_lock. So, if we see a pipe_buffer->page
> with refcount of 1 in ->steal, that means that we are the only its user
> and it can't be spliced to another pipe.
> 
> In fact, I just copied the code from generic_pipe_buf_steal, adding
> kmemcg related checks along the way, so it should be fine.

So you guarantee that no other cpu might have done
get_page_unless_zero() right before this test ?



--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vladimir Davydov May 25, 2016, 10:30 a.m. UTC | #4
On Tue, May 24, 2016 at 01:04:33PM -0700, Eric Dumazet wrote:
> On Tue, 2016-05-24 at 19:13 +0300, Vladimir Davydov wrote:
> > On Tue, May 24, 2016 at 05:59:02AM -0700, Eric Dumazet wrote:
> > ...
> > > > +static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
> > > > +			       struct pipe_buffer *buf)
> > > > +{
> > > > +	struct page *page = buf->page;
> > > > +
> > > > +	if (page_count(page) == 1) {
> > > 
> > > This looks racy : some cpu could have temporarily elevated page count.
> > 
> > All pipe operations (pipe_buf_operations->get, ->release, ->steal) are
> > supposed to be called under pipe_lock. So, if we see a pipe_buffer->page
> > with refcount of 1 in ->steal, that means that we are the only its user
> > and it can't be spliced to another pipe.
> > 
> > In fact, I just copied the code from generic_pipe_buf_steal, adding
> > kmemcg related checks along the way, so it should be fine.
> 
> So you guarantee that no other cpu might have done
> get_page_unless_zero() right before this test ?

Each pipe_buffer holds a reference to its page. If we find page's
refcount to be 1 here, then it can be referenced only by our
pipe_buffer. And the refcount cannot be increased by a parallel thread,
because we hold pipe_lock, which rules out splice, and otherwise it's
impossible to reach the page as it is not on lru. That said, I think I
guarantee that this should be safe.

Thanks,
Vladimir
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Minchan Kim May 26, 2016, 7:04 a.m. UTC | #5
On Wed, May 25, 2016 at 01:30:11PM +0300, Vladimir Davydov wrote:
> On Tue, May 24, 2016 at 01:04:33PM -0700, Eric Dumazet wrote:
> > On Tue, 2016-05-24 at 19:13 +0300, Vladimir Davydov wrote:
> > > On Tue, May 24, 2016 at 05:59:02AM -0700, Eric Dumazet wrote:
> > > ...
> > > > > +static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
> > > > > +			       struct pipe_buffer *buf)
> > > > > +{
> > > > > +	struct page *page = buf->page;
> > > > > +
> > > > > +	if (page_count(page) == 1) {
> > > > 
> > > > This looks racy : some cpu could have temporarily elevated page count.
> > > 
> > > All pipe operations (pipe_buf_operations->get, ->release, ->steal) are
> > > supposed to be called under pipe_lock. So, if we see a pipe_buffer->page
> > > with refcount of 1 in ->steal, that means that we are the only its user
> > > and it can't be spliced to another pipe.
> > > 
> > > In fact, I just copied the code from generic_pipe_buf_steal, adding
> > > kmemcg related checks along the way, so it should be fine.
> > 
> > So you guarantee that no other cpu might have done
> > get_page_unless_zero() right before this test ?
> 
> Each pipe_buffer holds a reference to its page. If we find page's
> refcount to be 1 here, then it can be referenced only by our
> pipe_buffer. And the refcount cannot be increased by a parallel thread,
> because we hold pipe_lock, which rules out splice, and otherwise it's
> impossible to reach the page as it is not on lru. That said, I think I
> guarantee that this should be safe.

I don't know kmemcg internal and pipe stuff so my comment might be
totally crap.

No one cannot guarantee any CPU cannot held a reference of a page.
Look at get_page_unless_zero usecases.

1. balloon_page_isolate

It can hold a reference in random page and then verify the page
is balloon page. Otherwise, just put.

2. page_idle_get_page

It has PageLRU check but it's racy so it can hold a reference
of randome page and then verify within zone->lru_lock. If it's
not LRU page, just put.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vladimir Davydov May 26, 2016, 1:59 p.m. UTC | #6
On Thu, May 26, 2016 at 04:04:55PM +0900, Minchan Kim wrote:
> On Wed, May 25, 2016 at 01:30:11PM +0300, Vladimir Davydov wrote:
> > On Tue, May 24, 2016 at 01:04:33PM -0700, Eric Dumazet wrote:
> > > On Tue, 2016-05-24 at 19:13 +0300, Vladimir Davydov wrote:
> > > > On Tue, May 24, 2016 at 05:59:02AM -0700, Eric Dumazet wrote:
> > > > ...
> > > > > > +static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
> > > > > > +			       struct pipe_buffer *buf)
> > > > > > +{
> > > > > > +	struct page *page = buf->page;
> > > > > > +
> > > > > > +	if (page_count(page) == 1) {
> > > > > 
> > > > > This looks racy : some cpu could have temporarily elevated page count.
> > > > 
> > > > All pipe operations (pipe_buf_operations->get, ->release, ->steal) are
> > > > supposed to be called under pipe_lock. So, if we see a pipe_buffer->page
> > > > with refcount of 1 in ->steal, that means that we are the only its user
> > > > and it can't be spliced to another pipe.
> > > > 
> > > > In fact, I just copied the code from generic_pipe_buf_steal, adding
> > > > kmemcg related checks along the way, so it should be fine.
> > > 
> > > So you guarantee that no other cpu might have done
> > > get_page_unless_zero() right before this test ?
> > 
> > Each pipe_buffer holds a reference to its page. If we find page's
> > refcount to be 1 here, then it can be referenced only by our
> > pipe_buffer. And the refcount cannot be increased by a parallel thread,
> > because we hold pipe_lock, which rules out splice, and otherwise it's
> > impossible to reach the page as it is not on lru. That said, I think I
> > guarantee that this should be safe.
> 
> I don't know kmemcg internal and pipe stuff so my comment might be
> totally crap.
> 
> No one cannot guarantee any CPU cannot held a reference of a page.
> Look at get_page_unless_zero usecases.
> 
> 1. balloon_page_isolate
> 
> It can hold a reference in random page and then verify the page
> is balloon page. Otherwise, just put.
> 
> 2. page_idle_get_page
> 
> It has PageLRU check but it's racy so it can hold a reference
> of randome page and then verify within zone->lru_lock. If it's
> not LRU page, just put.

Well, I see your concern now - even if a page is not on lru and we
locked all structs pointing to it, it can always get accessed by pfn in
a completely unrelated thread, like in examples you gave above. That's a
fair point.

However, I still think that it's OK in case of pipe buffers. What can
happen if somebody takes a transient reference to a pipe buffer page? At
worst, we'll see page_count > 1 due to temporary ref and abort stealing,
falling back on copying instead. That's OK, because stealing is not
guaranteed. Can a function that takes a transient ref to page by pfn
mistakenly assume that this is a page it's interested in? I don't think
so, because this page has no marks on it except special _mapcount value,
which should only be set on kmemcg pages.

Thanks,
Vladimir
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet May 26, 2016, 2:15 p.m. UTC | #7
On Thu, 2016-05-26 at 16:59 +0300, Vladimir Davydov wrote:
> On Thu, May 26, 2016 at 04:04:55PM +0900, Minchan Kim wrote:
> > On Wed, May 25, 2016 at 01:30:11PM +0300, Vladimir Davydov wrote:
> > > On Tue, May 24, 2016 at 01:04:33PM -0700, Eric Dumazet wrote:
> > > > On Tue, 2016-05-24 at 19:13 +0300, Vladimir Davydov wrote:
> > > > > On Tue, May 24, 2016 at 05:59:02AM -0700, Eric Dumazet wrote:
> > > > > ...
> > > > > > > +static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
> > > > > > > +			       struct pipe_buffer *buf)
> > > > > > > +{
> > > > > > > +	struct page *page = buf->page;
> > > > > > > +
> > > > > > > +	if (page_count(page) == 1) {
> > > > > > 
> > > > > > This looks racy : some cpu could have temporarily elevated page count.
> > > > > 
> > > > > All pipe operations (pipe_buf_operations->get, ->release, ->steal) are
> > > > > supposed to be called under pipe_lock. So, if we see a pipe_buffer->page
> > > > > with refcount of 1 in ->steal, that means that we are the only its user
> > > > > and it can't be spliced to another pipe.
> > > > > 
> > > > > In fact, I just copied the code from generic_pipe_buf_steal, adding
> > > > > kmemcg related checks along the way, so it should be fine.
> > > > 
> > > > So you guarantee that no other cpu might have done
> > > > get_page_unless_zero() right before this test ?
> > > 
> > > Each pipe_buffer holds a reference to its page. If we find page's
> > > refcount to be 1 here, then it can be referenced only by our
> > > pipe_buffer. And the refcount cannot be increased by a parallel thread,
> > > because we hold pipe_lock, which rules out splice, and otherwise it's
> > > impossible to reach the page as it is not on lru. That said, I think I
> > > guarantee that this should be safe.
> > 
> > I don't know kmemcg internal and pipe stuff so my comment might be
> > totally crap.
> > 
> > No one cannot guarantee any CPU cannot held a reference of a page.
> > Look at get_page_unless_zero usecases.
> > 
> > 1. balloon_page_isolate
> > 
> > It can hold a reference in random page and then verify the page
> > is balloon page. Otherwise, just put.
> > 
> > 2. page_idle_get_page
> > 
> > It has PageLRU check but it's racy so it can hold a reference
> > of randome page and then verify within zone->lru_lock. If it's
> > not LRU page, just put.
> 
> Well, I see your concern now - even if a page is not on lru and we
> locked all structs pointing to it, it can always get accessed by pfn in
> a completely unrelated thread, like in examples you gave above. That's a
> fair point.
> 
> However, I still think that it's OK in case of pipe buffers. What can
> happen if somebody takes a transient reference to a pipe buffer page? At
> worst, we'll see page_count > 1 due to temporary ref and abort stealing,
> falling back on copying instead. That's OK, because stealing is not
> guaranteed. Can a function that takes a transient ref to page by pfn
> mistakenly assume that this is a page it's interested in? I don't think
> so, because this page has no marks on it except special _mapcount value,
> which should only be set on kmemcg pages.

Well, all this information deserve to be in the changelog.

Maybe in 6 months, this will be incredibly useful for bug hunting.

pipes can be used to exchange data (or pages) between processes in
different domains.

If kmemcg is not precise, this could be used by some attackers to force
some processes to consume all their budget and eventually not be able to
allocate new pages.



--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/pipe.c b/fs/pipe.c
index 0d3f5165cb0b..4b32928f5426 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -21,6 +21,7 @@ 
 #include <linux/audit.h>
 #include <linux/syscalls.h>
 #include <linux/fcntl.h>
+#include <linux/memcontrol.h>
 
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
@@ -137,6 +138,22 @@  static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
 		put_page(page);
 }
 
+static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
+			       struct pipe_buffer *buf)
+{
+	struct page *page = buf->page;
+
+	if (page_count(page) == 1) {
+		if (memcg_kmem_enabled()) {
+			memcg_kmem_uncharge(page, 0);
+			__ClearPageKmemcg(page);
+		}
+		__SetPageLocked(page);
+		return 0;
+	}
+	return 1;
+}
+
 /**
  * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
  * @pipe:	the pipe that the buffer belongs to
@@ -219,7 +236,7 @@  static const struct pipe_buf_operations anon_pipe_buf_ops = {
 	.can_merge = 1,
 	.confirm = generic_pipe_buf_confirm,
 	.release = anon_pipe_buf_release,
-	.steal = generic_pipe_buf_steal,
+	.steal = anon_pipe_buf_steal,
 	.get = generic_pipe_buf_get,
 };
 
@@ -227,7 +244,7 @@  static const struct pipe_buf_operations packet_pipe_buf_ops = {
 	.can_merge = 0,
 	.confirm = generic_pipe_buf_confirm,
 	.release = anon_pipe_buf_release,
-	.steal = generic_pipe_buf_steal,
+	.steal = anon_pipe_buf_steal,
 	.get = generic_pipe_buf_get,
 };
 
@@ -405,7 +422,7 @@  pipe_write(struct kiocb *iocb, struct iov_iter *from)
 			int copied;
 
 			if (!page) {
-				page = alloc_page(GFP_HIGHUSER);
+				page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
 				if (unlikely(!page)) {
 					ret = ret ? : -ENOMEM;
 					break;
@@ -611,7 +628,7 @@  struct pipe_inode_info *alloc_pipe_info(void)
 {
 	struct pipe_inode_info *pipe;
 
-	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
+	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
 	if (pipe) {
 		unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
 		struct user_struct *user = get_current_user();
@@ -619,7 +636,9 @@  struct pipe_inode_info *alloc_pipe_info(void)
 		if (!too_many_pipe_buffers_hard(user)) {
 			if (too_many_pipe_buffers_soft(user))
 				pipe_bufs = 1;
-			pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * pipe_bufs, GFP_KERNEL);
+			pipe->bufs = kcalloc(pipe_bufs,
+					     sizeof(struct pipe_buffer),
+					     GFP_KERNEL_ACCOUNT);
 		}
 
 		if (pipe->bufs) {
@@ -1010,7 +1029,8 @@  static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
 	if (nr_pages < pipe->nrbufs)
 		return -EBUSY;
 
-	bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN);
+	bufs = kcalloc(nr_pages, sizeof(*bufs),
+		       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
 	if (unlikely(!bufs))
 		return -ENOMEM;