diff mbox

[7/9] blkio-cgroup-v9: Page tracking hooks

Message ID 20090721.232316.183036419.ryov@valinux.co.jp (mailing list archive)
State Superseded, archived
Headers show

Commit Message

Ryo Tsuruta July 21, 2009, 2:23 p.m. UTC
This patch contains several hooks that let the blkio-cgroup framework to know
which blkio-cgroup is the owner of a page before starting I/O against the page.

Signed-off-by: Hirokazu Takahashi <taka@valinux.co.jp>
Signed-off-by: Ryo Tsuruta <ryov@valinux.co.jp>

---
 fs/buffer.c         |    2 ++
 fs/direct-io.c      |    2 ++
 mm/bounce.c         |    2 ++
 mm/filemap.c        |    2 ++
 mm/memory.c         |    5 +++++
 mm/page-writeback.c |    2 ++
 mm/swap_state.c     |    2 ++
 7 files changed, 17 insertions(+)


--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel

Comments

KAMEZAWA Hiroyuki July 22, 2009, 2:17 a.m. UTC | #1
On Tue, 21 Jul 2009 23:23:16 +0900 (JST)
Ryo Tsuruta <ryov@valinux.co.jp> wrote:

> This patch contains several hooks that let the blkio-cgroup framework to know
> which blkio-cgroup is the owner of a page before starting I/O against the page.

> @@ -464,6 +465,7 @@ int add_to_page_cache_locked(struct page
>  					gfp_mask & GFP_RECLAIM_MASK);
>  	if (error)
>  		goto out;
> +	blkio_cgroup_set_owner(page, current->mm);
>  

This part is doubtful...Is this necessary ? 
I recommend you that the caller should attach owner by itself.


>  	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
>  	if (error == 0) {
> Index: linux-2.6.31-rc3/mm/memory.c
> ===================================================================
> --- linux-2.6.31-rc3.orig/mm/memory.c
> +++ linux-2.6.31-rc3/mm/memory.c
> @@ -51,6 +51,7 @@
>  #include <linux/init.h>
>  #include <linux/writeback.h>
>  #include <linux/memcontrol.h>
> +#include <linux/biotrack.h>
>  #include <linux/mmu_notifier.h>
>  #include <linux/kallsyms.h>
>  #include <linux/swapops.h>
> @@ -2115,6 +2116,7 @@ gotten:
>  		 */
>  		ptep_clear_flush_notify(vma, address, page_table);
>  		page_add_new_anon_rmap(new_page, vma, address);
> +		blkio_cgroup_set_owner(new_page, mm);

plz do this in swap-out code.

>  		set_pte_at(mm, address, page_table, entry);
>  		update_mmu_cache(vma, address, entry);
>  		if (old_page) {
> @@ -2580,6 +2582,7 @@ static int do_swap_page(struct mm_struct
>  	flush_icache_page(vma, page);
>  	set_pte_at(mm, address, page_table, pte);
>  	page_add_anon_rmap(page, vma, address);
> +	blkio_cgroup_reset_owner(page, mm);

and this.


>  	/* It's better to call commit-charge after rmap is established */
>  	mem_cgroup_commit_charge_swapin(page, ptr);
>  
> @@ -2644,6 +2647,7 @@ static int do_anonymous_page(struct mm_s
>  		goto release;
>  	inc_mm_counter(mm, anon_rss);
>  	page_add_new_anon_rmap(page, vma, address);
> +	blkio_cgroup_set_owner(page, mm);
>  	set_pte_at(mm, address, page_table, entry);
>  
and this.

>  	/* No need to invalidate - it was non-present before */
> @@ -2791,6 +2795,7 @@ static int __do_fault(struct mm_struct *
>  		if (anon) {
>  			inc_mm_counter(mm, anon_rss);
>  			page_add_new_anon_rmap(page, vma, address);
> +			blkio_cgroup_set_owner(page, mm);
and this.

IMHO, later io for swap-out is caused by the caller of swapout, not page's
owner. plz charge to them or,
  - add special BLOCK CGROUP ID for the kernel's swap out. 

Bye,
-Kame

>  		} else {
>  			inc_mm_counter(mm, file_rss);
>  			page_add_file_rmap(page);
> Index: linux-2.6.31-rc3/mm/page-writeback.c
> ===================================================================
> --- linux-2.6.31-rc3.orig/mm/page-writeback.c
> +++ linux-2.6.31-rc3/mm/page-writeback.c
> @@ -23,6 +23,7 @@
>  #include <linux/init.h>
>  #include <linux/backing-dev.h>
>  #include <linux/task_io_accounting_ops.h>
> +#include <linux/biotrack.h>
>  #include <linux/blkdev.h>
>  #include <linux/mpage.h>
>  #include <linux/rmap.h>
> @@ -1247,6 +1248,7 @@ int __set_page_dirty_nobuffers(struct pa
>  			BUG_ON(mapping2 != mapping);
>  			WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
>  			account_page_dirtied(page, mapping);
> +			blkio_cgroup_reset_owner_pagedirty(page, current->mm);
>  			radix_tree_tag_set(&mapping->page_tree,
>  				page_index(page), PAGECACHE_TAG_DIRTY);
>  		}
> Index: linux-2.6.31-rc3/mm/swap_state.c
> ===================================================================
> --- linux-2.6.31-rc3.orig/mm/swap_state.c
> +++ linux-2.6.31-rc3/mm/swap_state.c
> @@ -18,6 +18,7 @@
>  #include <linux/pagevec.h>
>  #include <linux/migrate.h>
>  #include <linux/page_cgroup.h>
> +#include <linux/biotrack.h>
>  
>  #include <asm/pgtable.h>
>  
> @@ -307,6 +308,7 @@ struct page *read_swap_cache_async(swp_e
>  		 */
>  		__set_page_locked(new_page);
>  		SetPageSwapBacked(new_page);
> +		blkio_cgroup_set_owner(new_page, current->mm);
>  		err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
>  		if (likely(!err)) {
>  			/*
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
Ryo Tsuruta July 22, 2009, 9:40 a.m. UTC | #2
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Tue, 21 Jul 2009 23:23:16 +0900 (JST)
> Ryo Tsuruta <ryov@valinux.co.jp> wrote:
> 
> > This patch contains several hooks that let the blkio-cgroup framework to know
> > which blkio-cgroup is the owner of a page before starting I/O against the page.
> 
> > @@ -464,6 +465,7 @@ int add_to_page_cache_locked(struct page
> >  					gfp_mask & GFP_RECLAIM_MASK);
> >  	if (error)
> >  		goto out;
> > +	blkio_cgroup_set_owner(page, current->mm);
> >  
> 
> This part is doubtful...Is this necessary ? 
> I recommend you that the caller should attach owner by itself.

I think that it is reasonable to add the hook right here rather than
to add many hooks to a variety of places.
 
> >  	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
> >  	if (error == 0) {
> > Index: linux-2.6.31-rc3/mm/memory.c
> > ===================================================================
> > --- linux-2.6.31-rc3.orig/mm/memory.c
> > +++ linux-2.6.31-rc3/mm/memory.c
> > @@ -51,6 +51,7 @@
> >  #include <linux/init.h>
> >  #include <linux/writeback.h>
> >  #include <linux/memcontrol.h>
> > +#include <linux/biotrack.h>
> >  #include <linux/mmu_notifier.h>
> >  #include <linux/kallsyms.h>
> >  #include <linux/swapops.h>
> > @@ -2115,6 +2116,7 @@ gotten:
> >  		 */
> >  		ptep_clear_flush_notify(vma, address, page_table);
> >  		page_add_new_anon_rmap(new_page, vma, address);
> > +		blkio_cgroup_set_owner(new_page, mm);
> 
> plz do this in swap-out code.
> 
> >  		set_pte_at(mm, address, page_table, entry);
> >  		update_mmu_cache(vma, address, entry);
> >  		if (old_page) {
> > @@ -2580,6 +2582,7 @@ static int do_swap_page(struct mm_struct
> >  	flush_icache_page(vma, page);
> >  	set_pte_at(mm, address, page_table, pte);
> >  	page_add_anon_rmap(page, vma, address);
> > +	blkio_cgroup_reset_owner(page, mm);
> 
> and this.
> 
> 
> >  	/* It's better to call commit-charge after rmap is established */
> >  	mem_cgroup_commit_charge_swapin(page, ptr);
> >  
> > @@ -2644,6 +2647,7 @@ static int do_anonymous_page(struct mm_s
> >  		goto release;
> >  	inc_mm_counter(mm, anon_rss);
> >  	page_add_new_anon_rmap(page, vma, address);
> > +	blkio_cgroup_set_owner(page, mm);
> >  	set_pte_at(mm, address, page_table, entry);
> >  
> and this.
> 
> >  	/* No need to invalidate - it was non-present before */
> > @@ -2791,6 +2795,7 @@ static int __do_fault(struct mm_struct *
> >  		if (anon) {
> >  			inc_mm_counter(mm, anon_rss);
> >  			page_add_new_anon_rmap(page, vma, address);
> > +			blkio_cgroup_set_owner(page, mm);
> and this.
> 
> IMHO, later io for swap-out is caused by the caller of swapout, not page's
> owner. plz charge to them or,
>   - add special BLOCK CGROUP ID for the kernel's swap out. 

I think that it is not too bad to charge the owner of a page for
swap-out. From another perspective, it can be considered that swap-out
is caused by a process which uses a large amount of memory.

Thanks,
Ryo Tsuruta

> 
> Bye,
> -Kame
> 
> >  		} else {
> >  			inc_mm_counter(mm, file_rss);
> >  			page_add_file_rmap(page);
> > Index: linux-2.6.31-rc3/mm/page-writeback.c
> > ===================================================================
> > --- linux-2.6.31-rc3.orig/mm/page-writeback.c
> > +++ linux-2.6.31-rc3/mm/page-writeback.c
> > @@ -23,6 +23,7 @@
> >  #include <linux/init.h>
> >  #include <linux/backing-dev.h>
> >  #include <linux/task_io_accounting_ops.h>
> > +#include <linux/biotrack.h>
> >  #include <linux/blkdev.h>
> >  #include <linux/mpage.h>
> >  #include <linux/rmap.h>
> > @@ -1247,6 +1248,7 @@ int __set_page_dirty_nobuffers(struct pa
> >  			BUG_ON(mapping2 != mapping);
> >  			WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
> >  			account_page_dirtied(page, mapping);
> > +			blkio_cgroup_reset_owner_pagedirty(page, current->mm);
> >  			radix_tree_tag_set(&mapping->page_tree,
> >  				page_index(page), PAGECACHE_TAG_DIRTY);
> >  		}
> > Index: linux-2.6.31-rc3/mm/swap_state.c
> > ===================================================================
> > --- linux-2.6.31-rc3.orig/mm/swap_state.c
> > +++ linux-2.6.31-rc3/mm/swap_state.c
> > @@ -18,6 +18,7 @@
> >  #include <linux/pagevec.h>
> >  #include <linux/migrate.h>
> >  #include <linux/page_cgroup.h>
> > +#include <linux/biotrack.h>
> >  
> >  #include <asm/pgtable.h>
> >  
> > @@ -307,6 +308,7 @@ struct page *read_swap_cache_async(swp_e
> >  		 */
> >  		__set_page_locked(new_page);
> >  		SetPageSwapBacked(new_page);
> > +		blkio_cgroup_set_owner(new_page, current->mm);
> >  		err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
> >  		if (likely(!err)) {
> >  			/*
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > Please read the FAQ at  http://www.tux.org/lkml/
> > 

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
KAMEZAWA Hiroyuki July 23, 2009, 12:18 a.m. UTC | #3
On Wed, 22 Jul 2009 18:40:55 +0900 (JST)
Ryo Tsuruta <ryov@valinux.co.jp> wrote:

> KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > On Tue, 21 Jul 2009 23:23:16 +0900 (JST)
> > Ryo Tsuruta <ryov@valinux.co.jp> wrote:
> > 
> > > This patch contains several hooks that let the blkio-cgroup framework to know
> > > which blkio-cgroup is the owner of a page before starting I/O against the page.
> > 
> > > @@ -464,6 +465,7 @@ int add_to_page_cache_locked(struct page
> > >  					gfp_mask & GFP_RECLAIM_MASK);
> > >  	if (error)
> > >  		goto out;
> > > +	blkio_cgroup_set_owner(page, current->mm);
> > >  
> > 
> > This part is doubtful...Is this necessary ? 
> > I recommend you that the caller should attach owner by itself.
> 
> I think that it is reasonable to add the hook right here rather than
> to add many hooks to a variety of places.
> 
Why ? at writing, it's will be overwriten soon, IIUC. Then, this information
is misleading. plz add a hook like this when it means something. In this case,
read/write callers.
IMO, you just increase patch's readbility but decrease easiness of maintaince.


> > IMHO, later io for swap-out is caused by the caller of swapout, not page's
> > owner. plz charge to them or,
> >   - add special BLOCK CGROUP ID for the kernel's swap out. 
> 
> I think that it is not too bad to charge the owner of a page for
> swap-out. From another perspective, it can be considered that swap-out
> is caused by a process which uses a large amount of memory.
> 
No. swap-out is caused by a thread who requests memory even while memory is
in short. IMHO, I/O by memory reqraim should work in priority of memory requester.

Consider following situation.

- A process "A" has big memory. When several threads requests memory, all 
  of them are caught by a blockio cgroup of "A".
- A process "B" has read big file caches. When several threads requests memory, 
  all  of them are caught by a blockio cgroup of "B".

If "A" and "B" 's threshold is small, you'll see big slow down.
But it's not _planned_ behavior in many cases.

If you charges agaisnt memory owner, the admin has to set _big_ priority of I/O
controller to "A" and "B" if it uses much memory. I think the admin can't design
his system. It's nonsense to say "plz set I/O limit propotional to memory usage of
your apps even if it never do I/O in usual."


If this blockio cgroup is introduced, people will see *unexpected* very
terrible slow down and the user will see heartbeat warnings/failover by cluster
management software. Please do I/O at the priority of memory reclaiming requester.


Thanks,
-Kame




--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
Ryo Tsuruta July 23, 2009, 6:38 a.m. UTC | #4
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Wed, 22 Jul 2009 18:40:55 +0900 (JST)
> Ryo Tsuruta <ryov@valinux.co.jp> wrote:
> 
> > KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > > On Tue, 21 Jul 2009 23:23:16 +0900 (JST)
> > > Ryo Tsuruta <ryov@valinux.co.jp> wrote:
> > > 
> > > > This patch contains several hooks that let the blkio-cgroup framework to know
> > > > which blkio-cgroup is the owner of a page before starting I/O against the page.
> > > 
> > > > @@ -464,6 +465,7 @@ int add_to_page_cache_locked(struct page
> > > >  					gfp_mask & GFP_RECLAIM_MASK);
> > > >  	if (error)
> > > >  		goto out;
> > > > +	blkio_cgroup_set_owner(page, current->mm);
> > > >  
> > > 
> > > This part is doubtful...Is this necessary ? 
> > > I recommend you that the caller should attach owner by itself.
> > 
> > I think that it is reasonable to add the hook right here rather than
> > to add many hooks to a variety of places.
> > 
> Why ? at writing, it's will be overwriten soon, IIUC. Then, this information
> is misleading. plz add a hook like this when it means something. In this case,
> read/write callers.
> IMO, you just increase patch's readbility but decrease easiness of maintaince.

Even though the owner is overwritten soon at writing, I'm not sure why
inserting the hook here causes the misleading. I think that it is easy
to understand when and where the owner is set by blkio-cgroup, and it
does not decrease maintainability, rather than put many hooks to each
caller.
 
> > > IMHO, later io for swap-out is caused by the caller of swapout, not page's
> > > owner. plz charge to them or,
> > >   - add special BLOCK CGROUP ID for the kernel's swap out. 
> > 
> > I think that it is not too bad to charge the owner of a page for
> > swap-out. From another perspective, it can be considered that swap-out
> > is caused by a process which uses a large amount of memory.
> > 
> No. swap-out is caused by a thread who requests memory even while memory is
> in short. IMHO, I/O by memory reqraim should work in priority of memory requester.


> 
> Consider following situation.
> 
> - A process "A" has big memory. When several threads requests memory, all 
>   of them are caught by a blockio cgroup of "A".
> - A process "B" has read big file caches. When several threads requests memory, 
>   all  of them are caught by a blockio cgroup of "B".
> 
> If "A" and "B" 's threshold is small, you'll see big slow down.
> But it's not _planned_ behavior in many cases.
> 
> If you charges agaisnt memory owner, the admin has to set _big_ priority of I/O
> controller to "A" and "B" if it uses much memory. I think the admin can't design
> his system. It's nonsense to say "plz set I/O limit propotional to memory usage of
> your apps even if it never do I/O in usual."
>
> If this blockio cgroup is introduced, people will see *unexpected* very
> terrible slow down and the user will see heartbeat warnings/failover by cluster
> management software. Please do I/O at the priority of memory reclaiming requester.

dm-ioband gives high priority to I/O for swap-out by checking whether
PG_swapcache flag is set on the I/O page, regardless of the assigned
I/O bandwidth, and the bandwidth consumed for swap-out is charged to
the owner of the pages as a debt.
How about this approach?

Thanks,
Ryo Tsuruta

> 
> 
> Thanks,
> -Kame
> 
> 
> 
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
KAMEZAWA Hiroyuki July 23, 2009, 7:49 a.m. UTC | #5
On Thu, 23 Jul 2009 15:38:43 +0900 (JST)
Ryo Tsuruta <ryov@valinux.co.jp> wrote:

> KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > On Wed, 22 Jul 2009 18:40:55 +0900 (JST)
> > Ryo Tsuruta <ryov@valinux.co.jp> wrote:
> > 
> > > KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > > > On Tue, 21 Jul 2009 23:23:16 +0900 (JST)
> > > > Ryo Tsuruta <ryov@valinux.co.jp> wrote:
> > > > 
> > > > > This patch contains several hooks that let the blkio-cgroup framework to know
> > > > > which blkio-cgroup is the owner of a page before starting I/O against the page.
> > > > 
> > > > > @@ -464,6 +465,7 @@ int add_to_page_cache_locked(struct page
> > > > >  					gfp_mask & GFP_RECLAIM_MASK);
> > > > >  	if (error)
> > > > >  		goto out;
> > > > > +	blkio_cgroup_set_owner(page, current->mm);
> > > > >  
> > > > 
> > > > This part is doubtful...Is this necessary ? 
> > > > I recommend you that the caller should attach owner by itself.
> > > 
> > > I think that it is reasonable to add the hook right here rather than
> > > to add many hooks to a variety of places.
> > > 
> > Why ? at writing, it's will be overwriten soon, IIUC. Then, this information
> > is misleading. plz add a hook like this when it means something. In this case,
> > read/write callers.
> > IMO, you just increase patch's readbility but decrease easiness of maintaince.
> 
> Even though the owner is overwritten soon at writing, I'm not sure why
> inserting the hook here causes the misleading. I think that it is easy
> to understand when and where the owner is set by blkio-cgroup, and it
> does not decrease maintainability, rather than put many hooks to each
> caller.
>  
Are there _many_ callers ? I don't think so. 
But okay, I don't say strong objections more if other ones say ok.

BTW, a sad information for you.
you can't call lock_page_cgroup() under radix_tree->lock.

http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=e767e0561d7fd2333df1921f1ab4176211f9036b

plz update.

> > 
> > Consider following situation.
> > 
> > - A process "A" has big memory. When several threads requests memory, all 
> >   of them are caught by a blockio cgroup of "A".
> > - A process "B" has read big file caches. When several threads requests memory, 
> >   all  of them are caught by a blockio cgroup of "B".
> > 
> > If "A" and "B" 's threshold is small, you'll see big slow down.
> > But it's not _planned_ behavior in many cases.
> > 
> > If you charges agaisnt memory owner, the admin has to set _big_ priority of I/O
> > controller to "A" and "B" if it uses much memory. I think the admin can't design
> > his system. It's nonsense to say "plz set I/O limit propotional to memory usage of
> > your apps even if it never do I/O in usual."
> >
> > If this blockio cgroup is introduced, people will see *unexpected* very
> > terrible slow down and the user will see heartbeat warnings/failover by cluster
> > management software. Please do I/O at the priority of memory reclaiming requester.
> 
> dm-ioband gives high priority to I/O for swap-out by checking whether
> PG_swapcache flag is set on the I/O page, regardless of the assigned
> I/O bandwidth, and the bandwidth consumed for swap-out is charged to
> the owner of the pages as a debt.
> How about this approach?

I don't think it's reasonable. Why I/O device, scheduler should know about
such mm-related information ? I think layering is wrong.
And your approatch cannot be a workaround.

In follwing _typical_ case,
 
  - A process does small logging to /var/log/mylog, once in a sec.
    but it uses some amount of cold memory or shmem.

This process's logging will be delayed _unexpectedly_ by some buggy process
which does memory leak.


Thanks,
-Kame

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
Ryo Tsuruta July 23, 2009, 10:02 a.m. UTC | #6
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Thu, 23 Jul 2009 15:38:43 +0900 (JST)
> Ryo Tsuruta <ryov@valinux.co.jp> wrote:
> 
> > KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > > On Wed, 22 Jul 2009 18:40:55 +0900 (JST)
> > > Ryo Tsuruta <ryov@valinux.co.jp> wrote:
> > > 
> > > > KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > > > > On Tue, 21 Jul 2009 23:23:16 +0900 (JST)
> > > > > Ryo Tsuruta <ryov@valinux.co.jp> wrote:
> > > > > 
> > > > > > This patch contains several hooks that let the blkio-cgroup framework to know
> > > > > > which blkio-cgroup is the owner of a page before starting I/O against the page.
> > > > > 
> > > > > > @@ -464,6 +465,7 @@ int add_to_page_cache_locked(struct page
> > > > > >  					gfp_mask & GFP_RECLAIM_MASK);
> > > > > >  	if (error)
> > > > > >  		goto out;
> > > > > > +	blkio_cgroup_set_owner(page, current->mm);
> > > > > >  
> > > > > 
> > > > > This part is doubtful...Is this necessary ? 
> > > > > I recommend you that the caller should attach owner by itself.
> > > > 
> > > > I think that it is reasonable to add the hook right here rather than
> > > > to add many hooks to a variety of places.
> > > > 
> > > Why ? at writing, it's will be overwriten soon, IIUC. Then, this information
> > > is misleading. plz add a hook like this when it means something. In this case,
> > > read/write callers.
> > > IMO, you just increase patch's readbility but decrease easiness of maintaince.
> > 
> > Even though the owner is overwritten soon at writing, I'm not sure why
> > inserting the hook here causes the misleading. I think that it is easy
> > to understand when and where the owner is set by blkio-cgroup, and it
> > does not decrease maintainability, rather than put many hooks to each
> > caller.
> >  
> Are there _many_ callers ? I don't think so. 
> But okay, I don't say strong objections more if other ones say ok.
> 
> BTW, a sad information for you.
> you can't call lock_page_cgroup() under radix_tree->lock.
> 
> http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=e767e0561d7fd2333df1921f1ab4176211f9036b
> 
> plz update.

Thank you for letting me know. I'll fix it.

> 
> > > 
> > > Consider following situation.
> > > 
> > > - A process "A" has big memory. When several threads requests memory, all 
> > >   of them are caught by a blockio cgroup of "A".
> > > - A process "B" has read big file caches. When several threads requests memory, 
> > >   all  of them are caught by a blockio cgroup of "B".
> > > 
> > > If "A" and "B" 's threshold is small, you'll see big slow down.
> > > But it's not _planned_ behavior in many cases.
> > > 
> > > If you charges agaisnt memory owner, the admin has to set _big_ priority of I/O
> > > controller to "A" and "B" if it uses much memory. I think the admin can't design
> > > his system. It's nonsense to say "plz set I/O limit propotional to memory usage of
> > > your apps even if it never do I/O in usual."
> > >
> > > If this blockio cgroup is introduced, people will see *unexpected* very
> > > terrible slow down and the user will see heartbeat warnings/failover by cluster
> > > management software. Please do I/O at the priority of memory reclaiming requester.
> > 
> > dm-ioband gives high priority to I/O for swap-out by checking whether
> > PG_swapcache flag is set on the I/O page, regardless of the assigned
> > I/O bandwidth, and the bandwidth consumed for swap-out is charged to
> > the owner of the pages as a debt.
> > How about this approach?
> 
> I don't think it's reasonable. Why I/O device, scheduler should know about
> such mm-related information ? I think layering is wrong.

I think that urgent I/O requests such as swap-out should be notified
by setting a special flag in the struct bio, but there is no such
mechanism at this time. That is why dm-ioband uses this approach.

> And your approatch cannot be a workaround.
> 
> In follwing _typical_ case,
>  
>   - A process does small logging to /var/log/mylog, once in a sec.
>     but it uses some amount of cold memory or shmem.
> 
> This process's logging will be delayed _unexpectedly_ by some buggy process
> which does memory leak.

Do you mean that the delay in logging is caused since the small process
is swapped out unexpectedly by the buggy processes?
How about using memory cgroup to prevent the small process from swap-out?
I would appreciate it if you could tell me more about this.

Thanks,
Ryo Tsuruta

> 
> Thanks,
> -Kame

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
KAMEZAWA Hiroyuki July 23, 2009, 12:02 p.m. UTC | #7
Ryo Tsuruta wrote:
> KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

>> > dm-ioband gives high priority to I/O for swap-out by checking whether
>> > PG_swapcache flag is set on the I/O page, regardless of the assigned
>> > I/O bandwidth, and the bandwidth consumed for swap-out is charged to
>> > the owner of the pages as a debt.
>> > How about this approach?
>>
>> I don't think it's reasonable. Why I/O device, scheduler should know
>> about
>> such mm-related information ? I think layering is wrong.
>
> I think that urgent I/O requests such as swap-out should be notified
> by setting a special flag in the struct bio, but there is no such
> mechanism at this time. That is why dm-ioband uses this approach.
>
>> And your approatch cannot be a workaround.
>>
>> In follwing _typical_ case,
>>
>>   - A process does small logging to /var/log/mylog, once in a sec.
>>     but it uses some amount of cold memory or shmem.
>>
>> This process's logging will be delayed _unexpectedly_ by some buggy
>> process
>> which does memory leak.
>
> Do you mean that the delay in logging is caused since the small process
> is swapped out unexpectedly by the buggy processes?
I don't write "small process", "small logging".
Buggy process does swap-out and cosumes someone else's bandwidth, then,
loggind will be delayed. Important here is throttle bandwidth consumed by
buggy prorcess, not other's.

> How about using memory cgroup to prevent the small process from swap-out?
It never be help if memcg is not configured.

My point is "don't allow anyone to use bandwidth of others."
Considering job isolation, a thread who requests swap-out should be charged
against bandwidth.

Thanks,
-Kame



--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
Ryo Tsuruta July 24, 2009, 5:44 a.m. UTC | #8
"KAMEZAWA Hiroyuki" <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> Ryo Tsuruta wrote:
> > KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> 
> >> > dm-ioband gives high priority to I/O for swap-out by checking whethe=
> r
> >> > PG_swapcache flag is set on the I/O page, regardless of the assigned
> >> > I/O bandwidth, and the bandwidth consumed for swap-out is charged to
> >> > the owner of the pages as a debt.
> >> > How about this approach?
> >>
> >> I don't think it's reasonable. Why I/O device, scheduler should know
> >> about
> >> such mm-related information ? I think layering is wrong.
> >
> > I think that urgent I/O requests such as swap-out should be notified
> > by setting a special flag in the struct bio, but there is no such
> > mechanism at this time. That is why dm-ioband uses this approach.
> >
> >> And your approatch cannot be a workaround.
> >>
> >> In follwing _typical_ case,
> >>
> >>   - A process does small logging to /var/log/mylog, once in a sec.
> >>     but it uses some amount of cold memory or shmem.
> >>
> >> This process's logging will be delayed _unexpectedly_ by some buggy
> >> process
> >> which does memory leak.
> >
> > Do you mean that the delay in logging is caused since the small process
> > is swapped out unexpectedly by the buggy processes?
> I don't write "small process", "small logging".
> Buggy process does swap-out and cosumes someone else's bandwidth, then,
> loggind will be delayed. Important here is throttle bandwidth consumed by
> buggy prorcess, not other's.

Thank you for explaining it.

> > How about using memory cgroup to prevent the small process from swap-ou=
> t?
> It never be help if memcg is not configured.

blkio-cgroup is recommended to use with memcg. I think that it can be
a good solution to resolve such problem.

> My point is "don't allow anyone to use bandwidth of others."
> Considering job isolation, a thread who requests swap-out should be charg=
> ed
> against bandwidth.

>From another perspective, the swap-out is caused since the buggy
process uses a large amount of memory, so it can be considered as 
the bandwidth of logging process is used due to the buggy process.

Please consider the following case. If a thread who requests swap-out
is charged, the thread is charged other threads' I/O.

   (1)                          --------      (2)
   Process A                   |        |     Process B
   mmaps a large area in   --> | memory | <-- tries to allocate a page.
   the memory and writes       |        |
   data to there.               --------     (3)
                                   |         To get a free page,
                                   |         the data written by Proc.A
                                   |         is written out to the disk.
                                   V         The I/O is done by using
                                ---------    Proc.B's bandwidth.
                               |  disk   |   
                                ---------

Thus I think that page owners should be charged against bandwidth.

Thanks,
Ryo Tsuruta

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
KAMEZAWA Hiroyuki July 24, 2009, 6:19 a.m. UTC | #9
On Fri, 24 Jul 2009 14:44:16 +0900 (JST)
Ryo Tsuruta <ryov@valinux.co.jp> wrote:
 good solution to resolve such problem.
> 
> > My point is "don't allow anyone to use bandwidth of others."
> > Considering job isolation, a thread who requests swap-out should be charg=
> > ed
> > against bandwidth.
> 
> From another perspective, the swap-out is caused since the buggy
> process uses a large amount of memory, so it can be considered as 
> the bandwidth of logging process is used due to the buggy process.
> 
> Please consider the following case. If a thread who requests swap-out
> is charged, the thread is charged other threads' I/O.
> 
>    (1)                          --------      (2)
>    Process A                   |        |     Process B
>    mmaps a large area in   --> | memory | <-- tries to allocate a page.
>    the memory and writes       |        |
>    data to there.               --------     (3)
>                                    |         To get a free page,
>                                    |         the data written by Proc.A
>                                    |         is written out to the disk.
>                                    V         The I/O is done by using
>                                 ---------    Proc.B's bandwidth.
>                                |  disk   |   
>                                 ---------
> 
> Thus I think that page owners should be charged against bandwidth.
> 
Ok, no good way. yours is wrong, mine is wrong, too.
plz find 3rd way, reasonable.

Below is brief thinking.

"Why process A should be charged to I/O when it just maps anon memory ?"
I can't answer this.

Even in yorr case, Process B requests memory and get penalty. It's
very natural, I think.

In usual case, 
 - if process A maps ANON, there will be no I/O.
 - if process A maps FILE, it will be charged to process A.
ok ?

Under memory pressure,
 - if process A maps ANON, swap I/O should be charged to process B.
 - if process A maps FILE, I/O should be charged to process A.
maybe. 


Anyway, there will be ineraction with dirty_ratio of memcg (not implemeted yet)
and _Owner should be charged_ issue will be handled in this dirty_ratio layer.
More consideration is necessary, I think.


Bye,
-Kame


--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
Ryo Tsuruta July 24, 2009, 8:48 a.m. UTC | #10
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Fri, 24 Jul 2009 14:44:16 +0900 (JST)
> Ryo Tsuruta <ryov@valinux.co.jp> wrote:
>  good solution to resolve such problem.
> > 
> > > My point is "don't allow anyone to use bandwidth of others."
> > > Considering job isolation, a thread who requests swap-out should be charg=
> > > ed
> > > against bandwidth.
> > 
> > From another perspective, the swap-out is caused since the buggy
> > process uses a large amount of memory, so it can be considered as 
> > the bandwidth of logging process is used due to the buggy process.
> > 
> > Please consider the following case. If a thread who requests swap-out
> > is charged, the thread is charged other threads' I/O.
> > 
> >    (1)                          --------      (2)
> >    Process A                   |        |     Process B
> >    mmaps a large area in   --> | memory | <-- tries to allocate a page.
> >    the memory and writes       |        |
> >    data to there.               --------     (3)
> >                                    |         To get a free page,
> >                                    |         the data written by Proc.A
> >                                    |         is written out to the disk.
> >                                    V         The I/O is done by using
> >                                 ---------    Proc.B's bandwidth.
> >                                |  disk   |   
> >                                 ---------
> > 
> > Thus I think that page owners should be charged against bandwidth.
> > 
> Ok, no good way. yours is wrong, mine is wrong, too.
> plz find 3rd way, reasonable.
> 
> Below is brief thinking.
> 
> "Why process A should be charged to I/O when it just maps anon memory ?"
> I can't answer this.
> 
> Even in yorr case, Process B requests memory and get penalty. It's
> very natural, I think.
> 
> In usual case, 
>  - if process A maps ANON, there will be no I/O.
>  - if process A maps FILE, it will be charged to process A.
> ok ?
> 
> Under memory pressure,
>  - if process A maps ANON, swap I/O should be charged to process B.
>  - if process A maps FILE, I/O should be charged to process A.
> maybe. 

I think that even process A maps ANON, it should be charged to process A
because the memory pressure is caused by process A. It seems natual
for me that a process which consumes more resources is more likely to
get penalty.

> Anyway, there will be ineraction with dirty_ratio of memcg (not implemeted yet)
> and _Owner should be charged_ issue will be handled in this dirty_ratio layer.
> More consideration is necessary, I think.

I'll keep thinking how it should be done.

Thanks,
Ryo Tsuruta

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
diff mbox

Patch

Index: linux-2.6.31-rc3/fs/buffer.c
===================================================================
--- linux-2.6.31-rc3.orig/fs/buffer.c
+++ linux-2.6.31-rc3/fs/buffer.c
@@ -36,6 +36,7 @@ 
 #include <linux/buffer_head.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/bio.h>
+#include <linux/biotrack.h>
 #include <linux/notifier.h>
 #include <linux/cpu.h>
 #include <linux/bitops.h>
@@ -668,6 +669,7 @@  static void __set_page_dirty(struct page
 	if (page->mapping) {	/* Race with truncate? */
 		WARN_ON_ONCE(warn && !PageUptodate(page));
 		account_page_dirtied(page, mapping);
+		blkio_cgroup_reset_owner_pagedirty(page, current->mm);
 		radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
 	}
Index: linux-2.6.31-rc3/fs/direct-io.c
===================================================================
--- linux-2.6.31-rc3.orig/fs/direct-io.c
+++ linux-2.6.31-rc3/fs/direct-io.c
@@ -33,6 +33,7 @@ 
 #include <linux/err.h>
 #include <linux/blkdev.h>
 #include <linux/buffer_head.h>
+#include <linux/biotrack.h>
 #include <linux/rwsem.h>
 #include <linux/uio.h>
 #include <asm/atomic.h>
@@ -797,6 +798,7 @@  static int do_direct_IO(struct dio *dio)
 			ret = PTR_ERR(page);
 			goto out;
 		}
+		blkio_cgroup_reset_owner(page, current->mm);
 
 		while (block_in_page < blocks_per_page) {
 			unsigned offset_in_page = block_in_page << blkbits;
Index: linux-2.6.31-rc3/mm/bounce.c
===================================================================
--- linux-2.6.31-rc3.orig/mm/bounce.c
+++ linux-2.6.31-rc3/mm/bounce.c
@@ -13,6 +13,7 @@ 
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
+#include <linux/biotrack.h>
 #include <asm/tlbflush.h>
 
 #include <trace/events/block.h>
@@ -210,6 +211,7 @@  static void __blk_queue_bounce(struct re
 		to->bv_len = from->bv_len;
 		to->bv_offset = from->bv_offset;
 		inc_zone_page_state(to->bv_page, NR_BOUNCE);
+		blkio_cgroup_copy_owner(to->bv_page, page);
 
 		if (rw == WRITE) {
 			char *vto, *vfrom;
Index: linux-2.6.31-rc3/mm/filemap.c
===================================================================
--- linux-2.6.31-rc3.orig/mm/filemap.c
+++ linux-2.6.31-rc3/mm/filemap.c
@@ -33,6 +33,7 @@ 
 #include <linux/cpuset.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
+#include <linux/biotrack.h>
 #include <linux/mm_inline.h> /* for page_is_file_cache() */
 #include "internal.h"
 
@@ -464,6 +465,7 @@  int add_to_page_cache_locked(struct page
 					gfp_mask & GFP_RECLAIM_MASK);
 	if (error)
 		goto out;
+	blkio_cgroup_set_owner(page, current->mm);
 
 	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 	if (error == 0) {
Index: linux-2.6.31-rc3/mm/memory.c
===================================================================
--- linux-2.6.31-rc3.orig/mm/memory.c
+++ linux-2.6.31-rc3/mm/memory.c
@@ -51,6 +51,7 @@ 
 #include <linux/init.h>
 #include <linux/writeback.h>
 #include <linux/memcontrol.h>
+#include <linux/biotrack.h>
 #include <linux/mmu_notifier.h>
 #include <linux/kallsyms.h>
 #include <linux/swapops.h>
@@ -2115,6 +2116,7 @@  gotten:
 		 */
 		ptep_clear_flush_notify(vma, address, page_table);
 		page_add_new_anon_rmap(new_page, vma, address);
+		blkio_cgroup_set_owner(new_page, mm);
 		set_pte_at(mm, address, page_table, entry);
 		update_mmu_cache(vma, address, entry);
 		if (old_page) {
@@ -2580,6 +2582,7 @@  static int do_swap_page(struct mm_struct
 	flush_icache_page(vma, page);
 	set_pte_at(mm, address, page_table, pte);
 	page_add_anon_rmap(page, vma, address);
+	blkio_cgroup_reset_owner(page, mm);
 	/* It's better to call commit-charge after rmap is established */
 	mem_cgroup_commit_charge_swapin(page, ptr);
 
@@ -2644,6 +2647,7 @@  static int do_anonymous_page(struct mm_s
 		goto release;
 	inc_mm_counter(mm, anon_rss);
 	page_add_new_anon_rmap(page, vma, address);
+	blkio_cgroup_set_owner(page, mm);
 	set_pte_at(mm, address, page_table, entry);
 
 	/* No need to invalidate - it was non-present before */
@@ -2791,6 +2795,7 @@  static int __do_fault(struct mm_struct *
 		if (anon) {
 			inc_mm_counter(mm, anon_rss);
 			page_add_new_anon_rmap(page, vma, address);
+			blkio_cgroup_set_owner(page, mm);
 		} else {
 			inc_mm_counter(mm, file_rss);
 			page_add_file_rmap(page);
Index: linux-2.6.31-rc3/mm/page-writeback.c
===================================================================
--- linux-2.6.31-rc3.orig/mm/page-writeback.c
+++ linux-2.6.31-rc3/mm/page-writeback.c
@@ -23,6 +23,7 @@ 
 #include <linux/init.h>
 #include <linux/backing-dev.h>
 #include <linux/task_io_accounting_ops.h>
+#include <linux/biotrack.h>
 #include <linux/blkdev.h>
 #include <linux/mpage.h>
 #include <linux/rmap.h>
@@ -1247,6 +1248,7 @@  int __set_page_dirty_nobuffers(struct pa
 			BUG_ON(mapping2 != mapping);
 			WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
 			account_page_dirtied(page, mapping);
+			blkio_cgroup_reset_owner_pagedirty(page, current->mm);
 			radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
 		}
Index: linux-2.6.31-rc3/mm/swap_state.c
===================================================================
--- linux-2.6.31-rc3.orig/mm/swap_state.c
+++ linux-2.6.31-rc3/mm/swap_state.c
@@ -18,6 +18,7 @@ 
 #include <linux/pagevec.h>
 #include <linux/migrate.h>
 #include <linux/page_cgroup.h>
+#include <linux/biotrack.h>
 
 #include <asm/pgtable.h>
 
@@ -307,6 +308,7 @@  struct page *read_swap_cache_async(swp_e
 		 */
 		__set_page_locked(new_page);
 		SetPageSwapBacked(new_page);
+		blkio_cgroup_set_owner(new_page, current->mm);
 		err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
 		if (likely(!err)) {
 			/*