Message ID | 20090721.232316.183036419.ryov@valinux.co.jp (mailing list archive) |
---|---|
State | Superseded, archived |
Headers | show |
On Tue, 21 Jul 2009 23:23:16 +0900 (JST) Ryo Tsuruta <ryov@valinux.co.jp> wrote: > This patch contains several hooks that let the blkio-cgroup framework to know > which blkio-cgroup is the owner of a page before starting I/O against the page. > @@ -464,6 +465,7 @@ int add_to_page_cache_locked(struct page > gfp_mask & GFP_RECLAIM_MASK); > if (error) > goto out; > + blkio_cgroup_set_owner(page, current->mm); > This part is doubtful...Is this necessary ? I recommend you that the caller should attach owner by itself. > error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); > if (error == 0) { > Index: linux-2.6.31-rc3/mm/memory.c > =================================================================== > --- linux-2.6.31-rc3.orig/mm/memory.c > +++ linux-2.6.31-rc3/mm/memory.c > @@ -51,6 +51,7 @@ > #include <linux/init.h> > #include <linux/writeback.h> > #include <linux/memcontrol.h> > +#include <linux/biotrack.h> > #include <linux/mmu_notifier.h> > #include <linux/kallsyms.h> > #include <linux/swapops.h> > @@ -2115,6 +2116,7 @@ gotten: > */ > ptep_clear_flush_notify(vma, address, page_table); > page_add_new_anon_rmap(new_page, vma, address); > + blkio_cgroup_set_owner(new_page, mm); plz do this in swap-out code. > set_pte_at(mm, address, page_table, entry); > update_mmu_cache(vma, address, entry); > if (old_page) { > @@ -2580,6 +2582,7 @@ static int do_swap_page(struct mm_struct > flush_icache_page(vma, page); > set_pte_at(mm, address, page_table, pte); > page_add_anon_rmap(page, vma, address); > + blkio_cgroup_reset_owner(page, mm); and this. > /* It's better to call commit-charge after rmap is established */ > mem_cgroup_commit_charge_swapin(page, ptr); > > @@ -2644,6 +2647,7 @@ static int do_anonymous_page(struct mm_s > goto release; > inc_mm_counter(mm, anon_rss); > page_add_new_anon_rmap(page, vma, address); > + blkio_cgroup_set_owner(page, mm); > set_pte_at(mm, address, page_table, entry); > and this. > /* No need to invalidate - it was non-present before */ > @@ -2791,6 +2795,7 @@ static int __do_fault(struct mm_struct * > if (anon) { > inc_mm_counter(mm, anon_rss); > page_add_new_anon_rmap(page, vma, address); > + blkio_cgroup_set_owner(page, mm); and this. IMHO, later io for swap-out is caused by the caller of swapout, not page's owner. plz charge to them or, - add special BLOCK CGROUP ID for the kernel's swap out. Bye, -Kame > } else { > inc_mm_counter(mm, file_rss); > page_add_file_rmap(page); > Index: linux-2.6.31-rc3/mm/page-writeback.c > =================================================================== > --- linux-2.6.31-rc3.orig/mm/page-writeback.c > +++ linux-2.6.31-rc3/mm/page-writeback.c > @@ -23,6 +23,7 @@ > #include <linux/init.h> > #include <linux/backing-dev.h> > #include <linux/task_io_accounting_ops.h> > +#include <linux/biotrack.h> > #include <linux/blkdev.h> > #include <linux/mpage.h> > #include <linux/rmap.h> > @@ -1247,6 +1248,7 @@ int __set_page_dirty_nobuffers(struct pa > BUG_ON(mapping2 != mapping); > WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); > account_page_dirtied(page, mapping); > + blkio_cgroup_reset_owner_pagedirty(page, current->mm); > radix_tree_tag_set(&mapping->page_tree, > page_index(page), PAGECACHE_TAG_DIRTY); > } > Index: linux-2.6.31-rc3/mm/swap_state.c > =================================================================== > --- linux-2.6.31-rc3.orig/mm/swap_state.c > +++ linux-2.6.31-rc3/mm/swap_state.c > @@ -18,6 +18,7 @@ > #include <linux/pagevec.h> > #include <linux/migrate.h> > #include <linux/page_cgroup.h> > +#include <linux/biotrack.h> > > #include <asm/pgtable.h> > > @@ -307,6 +308,7 @@ struct page *read_swap_cache_async(swp_e > */ > __set_page_locked(new_page); > SetPageSwapBacked(new_page); > + blkio_cgroup_set_owner(new_page, current->mm); > err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); > if (likely(!err)) { > /* > -- > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > Please read the FAQ at http://www.tux.org/lkml/ > -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote: > On Tue, 21 Jul 2009 23:23:16 +0900 (JST) > Ryo Tsuruta <ryov@valinux.co.jp> wrote: > > > This patch contains several hooks that let the blkio-cgroup framework to know > > which blkio-cgroup is the owner of a page before starting I/O against the page. > > > @@ -464,6 +465,7 @@ int add_to_page_cache_locked(struct page > > gfp_mask & GFP_RECLAIM_MASK); > > if (error) > > goto out; > > + blkio_cgroup_set_owner(page, current->mm); > > > > This part is doubtful...Is this necessary ? > I recommend you that the caller should attach owner by itself. I think that it is reasonable to add the hook right here rather than to add many hooks to a variety of places. > > error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); > > if (error == 0) { > > Index: linux-2.6.31-rc3/mm/memory.c > > =================================================================== > > --- linux-2.6.31-rc3.orig/mm/memory.c > > +++ linux-2.6.31-rc3/mm/memory.c > > @@ -51,6 +51,7 @@ > > #include <linux/init.h> > > #include <linux/writeback.h> > > #include <linux/memcontrol.h> > > +#include <linux/biotrack.h> > > #include <linux/mmu_notifier.h> > > #include <linux/kallsyms.h> > > #include <linux/swapops.h> > > @@ -2115,6 +2116,7 @@ gotten: > > */ > > ptep_clear_flush_notify(vma, address, page_table); > > page_add_new_anon_rmap(new_page, vma, address); > > + blkio_cgroup_set_owner(new_page, mm); > > plz do this in swap-out code. > > > set_pte_at(mm, address, page_table, entry); > > update_mmu_cache(vma, address, entry); > > if (old_page) { > > @@ -2580,6 +2582,7 @@ static int do_swap_page(struct mm_struct > > flush_icache_page(vma, page); > > set_pte_at(mm, address, page_table, pte); > > page_add_anon_rmap(page, vma, address); > > + blkio_cgroup_reset_owner(page, mm); > > and this. > > > > /* It's better to call commit-charge after rmap is established */ > > mem_cgroup_commit_charge_swapin(page, ptr); > > > > @@ -2644,6 +2647,7 @@ static int do_anonymous_page(struct mm_s > > goto release; > > inc_mm_counter(mm, anon_rss); > > page_add_new_anon_rmap(page, vma, address); > > + blkio_cgroup_set_owner(page, mm); > > set_pte_at(mm, address, page_table, entry); > > > and this. > > > /* No need to invalidate - it was non-present before */ > > @@ -2791,6 +2795,7 @@ static int __do_fault(struct mm_struct * > > if (anon) { > > inc_mm_counter(mm, anon_rss); > > page_add_new_anon_rmap(page, vma, address); > > + blkio_cgroup_set_owner(page, mm); > and this. > > IMHO, later io for swap-out is caused by the caller of swapout, not page's > owner. plz charge to them or, > - add special BLOCK CGROUP ID for the kernel's swap out. I think that it is not too bad to charge the owner of a page for swap-out. From another perspective, it can be considered that swap-out is caused by a process which uses a large amount of memory. Thanks, Ryo Tsuruta > > Bye, > -Kame > > > } else { > > inc_mm_counter(mm, file_rss); > > page_add_file_rmap(page); > > Index: linux-2.6.31-rc3/mm/page-writeback.c > > =================================================================== > > --- linux-2.6.31-rc3.orig/mm/page-writeback.c > > +++ linux-2.6.31-rc3/mm/page-writeback.c > > @@ -23,6 +23,7 @@ > > #include <linux/init.h> > > #include <linux/backing-dev.h> > > #include <linux/task_io_accounting_ops.h> > > +#include <linux/biotrack.h> > > #include <linux/blkdev.h> > > #include <linux/mpage.h> > > #include <linux/rmap.h> > > @@ -1247,6 +1248,7 @@ int __set_page_dirty_nobuffers(struct pa > > BUG_ON(mapping2 != mapping); > > WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); > > account_page_dirtied(page, mapping); > > + blkio_cgroup_reset_owner_pagedirty(page, current->mm); > > radix_tree_tag_set(&mapping->page_tree, > > page_index(page), PAGECACHE_TAG_DIRTY); > > } > > Index: linux-2.6.31-rc3/mm/swap_state.c > > =================================================================== > > --- linux-2.6.31-rc3.orig/mm/swap_state.c > > +++ linux-2.6.31-rc3/mm/swap_state.c > > @@ -18,6 +18,7 @@ > > #include <linux/pagevec.h> > > #include <linux/migrate.h> > > #include <linux/page_cgroup.h> > > +#include <linux/biotrack.h> > > > > #include <asm/pgtable.h> > > > > @@ -307,6 +308,7 @@ struct page *read_swap_cache_async(swp_e > > */ > > __set_page_locked(new_page); > > SetPageSwapBacked(new_page); > > + blkio_cgroup_set_owner(new_page, current->mm); > > err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); > > if (likely(!err)) { > > /* > > -- > > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in > > the body of a message to majordomo@vger.kernel.org > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > Please read the FAQ at http://www.tux.org/lkml/ > > -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel
On Wed, 22 Jul 2009 18:40:55 +0900 (JST) Ryo Tsuruta <ryov@valinux.co.jp> wrote: > KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote: > > On Tue, 21 Jul 2009 23:23:16 +0900 (JST) > > Ryo Tsuruta <ryov@valinux.co.jp> wrote: > > > > > This patch contains several hooks that let the blkio-cgroup framework to know > > > which blkio-cgroup is the owner of a page before starting I/O against the page. > > > > > @@ -464,6 +465,7 @@ int add_to_page_cache_locked(struct page > > > gfp_mask & GFP_RECLAIM_MASK); > > > if (error) > > > goto out; > > > + blkio_cgroup_set_owner(page, current->mm); > > > > > > > This part is doubtful...Is this necessary ? > > I recommend you that the caller should attach owner by itself. > > I think that it is reasonable to add the hook right here rather than > to add many hooks to a variety of places. > Why ? at writing, it's will be overwriten soon, IIUC. Then, this information is misleading. plz add a hook like this when it means something. In this case, read/write callers. IMO, you just increase patch's readbility but decrease easiness of maintaince. > > IMHO, later io for swap-out is caused by the caller of swapout, not page's > > owner. plz charge to them or, > > - add special BLOCK CGROUP ID for the kernel's swap out. > > I think that it is not too bad to charge the owner of a page for > swap-out. From another perspective, it can be considered that swap-out > is caused by a process which uses a large amount of memory. > No. swap-out is caused by a thread who requests memory even while memory is in short. IMHO, I/O by memory reqraim should work in priority of memory requester. Consider following situation. - A process "A" has big memory. When several threads requests memory, all of them are caught by a blockio cgroup of "A". - A process "B" has read big file caches. When several threads requests memory, all of them are caught by a blockio cgroup of "B". If "A" and "B" 's threshold is small, you'll see big slow down. But it's not _planned_ behavior in many cases. If you charges agaisnt memory owner, the admin has to set _big_ priority of I/O controller to "A" and "B" if it uses much memory. I think the admin can't design his system. It's nonsense to say "plz set I/O limit propotional to memory usage of your apps even if it never do I/O in usual." If this blockio cgroup is introduced, people will see *unexpected* very terrible slow down and the user will see heartbeat warnings/failover by cluster management software. Please do I/O at the priority of memory reclaiming requester. Thanks, -Kame -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote: > On Wed, 22 Jul 2009 18:40:55 +0900 (JST) > Ryo Tsuruta <ryov@valinux.co.jp> wrote: > > > KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote: > > > On Tue, 21 Jul 2009 23:23:16 +0900 (JST) > > > Ryo Tsuruta <ryov@valinux.co.jp> wrote: > > > > > > > This patch contains several hooks that let the blkio-cgroup framework to know > > > > which blkio-cgroup is the owner of a page before starting I/O against the page. > > > > > > > @@ -464,6 +465,7 @@ int add_to_page_cache_locked(struct page > > > > gfp_mask & GFP_RECLAIM_MASK); > > > > if (error) > > > > goto out; > > > > + blkio_cgroup_set_owner(page, current->mm); > > > > > > > > > > This part is doubtful...Is this necessary ? > > > I recommend you that the caller should attach owner by itself. > > > > I think that it is reasonable to add the hook right here rather than > > to add many hooks to a variety of places. > > > Why ? at writing, it's will be overwriten soon, IIUC. Then, this information > is misleading. plz add a hook like this when it means something. In this case, > read/write callers. > IMO, you just increase patch's readbility but decrease easiness of maintaince. Even though the owner is overwritten soon at writing, I'm not sure why inserting the hook here causes the misleading. I think that it is easy to understand when and where the owner is set by blkio-cgroup, and it does not decrease maintainability, rather than put many hooks to each caller. > > > IMHO, later io for swap-out is caused by the caller of swapout, not page's > > > owner. plz charge to them or, > > > - add special BLOCK CGROUP ID for the kernel's swap out. > > > > I think that it is not too bad to charge the owner of a page for > > swap-out. From another perspective, it can be considered that swap-out > > is caused by a process which uses a large amount of memory. > > > No. swap-out is caused by a thread who requests memory even while memory is > in short. IMHO, I/O by memory reqraim should work in priority of memory requester. > > Consider following situation. > > - A process "A" has big memory. When several threads requests memory, all > of them are caught by a blockio cgroup of "A". > - A process "B" has read big file caches. When several threads requests memory, > all of them are caught by a blockio cgroup of "B". > > If "A" and "B" 's threshold is small, you'll see big slow down. > But it's not _planned_ behavior in many cases. > > If you charges agaisnt memory owner, the admin has to set _big_ priority of I/O > controller to "A" and "B" if it uses much memory. I think the admin can't design > his system. It's nonsense to say "plz set I/O limit propotional to memory usage of > your apps even if it never do I/O in usual." > > If this blockio cgroup is introduced, people will see *unexpected* very > terrible slow down and the user will see heartbeat warnings/failover by cluster > management software. Please do I/O at the priority of memory reclaiming requester. dm-ioband gives high priority to I/O for swap-out by checking whether PG_swapcache flag is set on the I/O page, regardless of the assigned I/O bandwidth, and the bandwidth consumed for swap-out is charged to the owner of the pages as a debt. How about this approach? Thanks, Ryo Tsuruta > > > Thanks, > -Kame > > > > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xensource.com > http://lists.xensource.com/xen-devel -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel
On Thu, 23 Jul 2009 15:38:43 +0900 (JST) Ryo Tsuruta <ryov@valinux.co.jp> wrote: > KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote: > > On Wed, 22 Jul 2009 18:40:55 +0900 (JST) > > Ryo Tsuruta <ryov@valinux.co.jp> wrote: > > > > > KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote: > > > > On Tue, 21 Jul 2009 23:23:16 +0900 (JST) > > > > Ryo Tsuruta <ryov@valinux.co.jp> wrote: > > > > > > > > > This patch contains several hooks that let the blkio-cgroup framework to know > > > > > which blkio-cgroup is the owner of a page before starting I/O against the page. > > > > > > > > > @@ -464,6 +465,7 @@ int add_to_page_cache_locked(struct page > > > > > gfp_mask & GFP_RECLAIM_MASK); > > > > > if (error) > > > > > goto out; > > > > > + blkio_cgroup_set_owner(page, current->mm); > > > > > > > > > > > > > This part is doubtful...Is this necessary ? > > > > I recommend you that the caller should attach owner by itself. > > > > > > I think that it is reasonable to add the hook right here rather than > > > to add many hooks to a variety of places. > > > > > Why ? at writing, it's will be overwriten soon, IIUC. Then, this information > > is misleading. plz add a hook like this when it means something. In this case, > > read/write callers. > > IMO, you just increase patch's readbility but decrease easiness of maintaince. > > Even though the owner is overwritten soon at writing, I'm not sure why > inserting the hook here causes the misleading. I think that it is easy > to understand when and where the owner is set by blkio-cgroup, and it > does not decrease maintainability, rather than put many hooks to each > caller. > Are there _many_ callers ? I don't think so. But okay, I don't say strong objections more if other ones say ok. BTW, a sad information for you. you can't call lock_page_cgroup() under radix_tree->lock. http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=e767e0561d7fd2333df1921f1ab4176211f9036b plz update. > > > > Consider following situation. > > > > - A process "A" has big memory. When several threads requests memory, all > > of them are caught by a blockio cgroup of "A". > > - A process "B" has read big file caches. When several threads requests memory, > > all of them are caught by a blockio cgroup of "B". > > > > If "A" and "B" 's threshold is small, you'll see big slow down. > > But it's not _planned_ behavior in many cases. > > > > If you charges agaisnt memory owner, the admin has to set _big_ priority of I/O > > controller to "A" and "B" if it uses much memory. I think the admin can't design > > his system. It's nonsense to say "plz set I/O limit propotional to memory usage of > > your apps even if it never do I/O in usual." > > > > If this blockio cgroup is introduced, people will see *unexpected* very > > terrible slow down and the user will see heartbeat warnings/failover by cluster > > management software. Please do I/O at the priority of memory reclaiming requester. > > dm-ioband gives high priority to I/O for swap-out by checking whether > PG_swapcache flag is set on the I/O page, regardless of the assigned > I/O bandwidth, and the bandwidth consumed for swap-out is charged to > the owner of the pages as a debt. > How about this approach? I don't think it's reasonable. Why I/O device, scheduler should know about such mm-related information ? I think layering is wrong. And your approatch cannot be a workaround. In follwing _typical_ case, - A process does small logging to /var/log/mylog, once in a sec. but it uses some amount of cold memory or shmem. This process's logging will be delayed _unexpectedly_ by some buggy process which does memory leak. Thanks, -Kame -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote: > On Thu, 23 Jul 2009 15:38:43 +0900 (JST) > Ryo Tsuruta <ryov@valinux.co.jp> wrote: > > > KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote: > > > On Wed, 22 Jul 2009 18:40:55 +0900 (JST) > > > Ryo Tsuruta <ryov@valinux.co.jp> wrote: > > > > > > > KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote: > > > > > On Tue, 21 Jul 2009 23:23:16 +0900 (JST) > > > > > Ryo Tsuruta <ryov@valinux.co.jp> wrote: > > > > > > > > > > > This patch contains several hooks that let the blkio-cgroup framework to know > > > > > > which blkio-cgroup is the owner of a page before starting I/O against the page. > > > > > > > > > > > @@ -464,6 +465,7 @@ int add_to_page_cache_locked(struct page > > > > > > gfp_mask & GFP_RECLAIM_MASK); > > > > > > if (error) > > > > > > goto out; > > > > > > + blkio_cgroup_set_owner(page, current->mm); > > > > > > > > > > > > > > > > This part is doubtful...Is this necessary ? > > > > > I recommend you that the caller should attach owner by itself. > > > > > > > > I think that it is reasonable to add the hook right here rather than > > > > to add many hooks to a variety of places. > > > > > > > Why ? at writing, it's will be overwriten soon, IIUC. Then, this information > > > is misleading. plz add a hook like this when it means something. In this case, > > > read/write callers. > > > IMO, you just increase patch's readbility but decrease easiness of maintaince. > > > > Even though the owner is overwritten soon at writing, I'm not sure why > > inserting the hook here causes the misleading. I think that it is easy > > to understand when and where the owner is set by blkio-cgroup, and it > > does not decrease maintainability, rather than put many hooks to each > > caller. > > > Are there _many_ callers ? I don't think so. > But okay, I don't say strong objections more if other ones say ok. > > BTW, a sad information for you. > you can't call lock_page_cgroup() under radix_tree->lock. > > http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=e767e0561d7fd2333df1921f1ab4176211f9036b > > plz update. Thank you for letting me know. I'll fix it. > > > > > > > Consider following situation. > > > > > > - A process "A" has big memory. When several threads requests memory, all > > > of them are caught by a blockio cgroup of "A". > > > - A process "B" has read big file caches. When several threads requests memory, > > > all of them are caught by a blockio cgroup of "B". > > > > > > If "A" and "B" 's threshold is small, you'll see big slow down. > > > But it's not _planned_ behavior in many cases. > > > > > > If you charges agaisnt memory owner, the admin has to set _big_ priority of I/O > > > controller to "A" and "B" if it uses much memory. I think the admin can't design > > > his system. It's nonsense to say "plz set I/O limit propotional to memory usage of > > > your apps even if it never do I/O in usual." > > > > > > If this blockio cgroup is introduced, people will see *unexpected* very > > > terrible slow down and the user will see heartbeat warnings/failover by cluster > > > management software. Please do I/O at the priority of memory reclaiming requester. > > > > dm-ioband gives high priority to I/O for swap-out by checking whether > > PG_swapcache flag is set on the I/O page, regardless of the assigned > > I/O bandwidth, and the bandwidth consumed for swap-out is charged to > > the owner of the pages as a debt. > > How about this approach? > > I don't think it's reasonable. Why I/O device, scheduler should know about > such mm-related information ? I think layering is wrong. I think that urgent I/O requests such as swap-out should be notified by setting a special flag in the struct bio, but there is no such mechanism at this time. That is why dm-ioband uses this approach. > And your approatch cannot be a workaround. > > In follwing _typical_ case, > > - A process does small logging to /var/log/mylog, once in a sec. > but it uses some amount of cold memory or shmem. > > This process's logging will be delayed _unexpectedly_ by some buggy process > which does memory leak. Do you mean that the delay in logging is caused since the small process is swapped out unexpectedly by the buggy processes? How about using memory cgroup to prevent the small process from swap-out? I would appreciate it if you could tell me more about this. Thanks, Ryo Tsuruta > > Thanks, > -Kame -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel
Ryo Tsuruta wrote: > KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote: >> > dm-ioband gives high priority to I/O for swap-out by checking whether >> > PG_swapcache flag is set on the I/O page, regardless of the assigned >> > I/O bandwidth, and the bandwidth consumed for swap-out is charged to >> > the owner of the pages as a debt. >> > How about this approach? >> >> I don't think it's reasonable. Why I/O device, scheduler should know >> about >> such mm-related information ? I think layering is wrong. > > I think that urgent I/O requests such as swap-out should be notified > by setting a special flag in the struct bio, but there is no such > mechanism at this time. That is why dm-ioband uses this approach. > >> And your approatch cannot be a workaround. >> >> In follwing _typical_ case, >> >> - A process does small logging to /var/log/mylog, once in a sec. >> but it uses some amount of cold memory or shmem. >> >> This process's logging will be delayed _unexpectedly_ by some buggy >> process >> which does memory leak. > > Do you mean that the delay in logging is caused since the small process > is swapped out unexpectedly by the buggy processes? I don't write "small process", "small logging". Buggy process does swap-out and cosumes someone else's bandwidth, then, loggind will be delayed. Important here is throttle bandwidth consumed by buggy prorcess, not other's. > How about using memory cgroup to prevent the small process from swap-out? It never be help if memcg is not configured. My point is "don't allow anyone to use bandwidth of others." Considering job isolation, a thread who requests swap-out should be charged against bandwidth. Thanks, -Kame -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel
"KAMEZAWA Hiroyuki" <kamezawa.hiroyu@jp.fujitsu.com> wrote: > Ryo Tsuruta wrote: > > KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote: > > >> > dm-ioband gives high priority to I/O for swap-out by checking whethe= > r > >> > PG_swapcache flag is set on the I/O page, regardless of the assigned > >> > I/O bandwidth, and the bandwidth consumed for swap-out is charged to > >> > the owner of the pages as a debt. > >> > How about this approach? > >> > >> I don't think it's reasonable. Why I/O device, scheduler should know > >> about > >> such mm-related information ? I think layering is wrong. > > > > I think that urgent I/O requests such as swap-out should be notified > > by setting a special flag in the struct bio, but there is no such > > mechanism at this time. That is why dm-ioband uses this approach. > > > >> And your approatch cannot be a workaround. > >> > >> In follwing _typical_ case, > >> > >> - A process does small logging to /var/log/mylog, once in a sec. > >> but it uses some amount of cold memory or shmem. > >> > >> This process's logging will be delayed _unexpectedly_ by some buggy > >> process > >> which does memory leak. > > > > Do you mean that the delay in logging is caused since the small process > > is swapped out unexpectedly by the buggy processes? > I don't write "small process", "small logging". > Buggy process does swap-out and cosumes someone else's bandwidth, then, > loggind will be delayed. Important here is throttle bandwidth consumed by > buggy prorcess, not other's. Thank you for explaining it. > > How about using memory cgroup to prevent the small process from swap-ou= > t? > It never be help if memcg is not configured. blkio-cgroup is recommended to use with memcg. I think that it can be a good solution to resolve such problem. > My point is "don't allow anyone to use bandwidth of others." > Considering job isolation, a thread who requests swap-out should be charg= > ed > against bandwidth. >From another perspective, the swap-out is caused since the buggy process uses a large amount of memory, so it can be considered as the bandwidth of logging process is used due to the buggy process. Please consider the following case. If a thread who requests swap-out is charged, the thread is charged other threads' I/O. (1) -------- (2) Process A | | Process B mmaps a large area in --> | memory | <-- tries to allocate a page. the memory and writes | | data to there. -------- (3) | To get a free page, | the data written by Proc.A | is written out to the disk. V The I/O is done by using --------- Proc.B's bandwidth. | disk | --------- Thus I think that page owners should be charged against bandwidth. Thanks, Ryo Tsuruta -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel
On Fri, 24 Jul 2009 14:44:16 +0900 (JST) Ryo Tsuruta <ryov@valinux.co.jp> wrote: good solution to resolve such problem. > > > My point is "don't allow anyone to use bandwidth of others." > > Considering job isolation, a thread who requests swap-out should be charg= > > ed > > against bandwidth. > > From another perspective, the swap-out is caused since the buggy > process uses a large amount of memory, so it can be considered as > the bandwidth of logging process is used due to the buggy process. > > Please consider the following case. If a thread who requests swap-out > is charged, the thread is charged other threads' I/O. > > (1) -------- (2) > Process A | | Process B > mmaps a large area in --> | memory | <-- tries to allocate a page. > the memory and writes | | > data to there. -------- (3) > | To get a free page, > | the data written by Proc.A > | is written out to the disk. > V The I/O is done by using > --------- Proc.B's bandwidth. > | disk | > --------- > > Thus I think that page owners should be charged against bandwidth. > Ok, no good way. yours is wrong, mine is wrong, too. plz find 3rd way, reasonable. Below is brief thinking. "Why process A should be charged to I/O when it just maps anon memory ?" I can't answer this. Even in yorr case, Process B requests memory and get penalty. It's very natural, I think. In usual case, - if process A maps ANON, there will be no I/O. - if process A maps FILE, it will be charged to process A. ok ? Under memory pressure, - if process A maps ANON, swap I/O should be charged to process B. - if process A maps FILE, I/O should be charged to process A. maybe. Anyway, there will be ineraction with dirty_ratio of memcg (not implemeted yet) and _Owner should be charged_ issue will be handled in this dirty_ratio layer. More consideration is necessary, I think. Bye, -Kame -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote: > On Fri, 24 Jul 2009 14:44:16 +0900 (JST) > Ryo Tsuruta <ryov@valinux.co.jp> wrote: > good solution to resolve such problem. > > > > > My point is "don't allow anyone to use bandwidth of others." > > > Considering job isolation, a thread who requests swap-out should be charg= > > > ed > > > against bandwidth. > > > > From another perspective, the swap-out is caused since the buggy > > process uses a large amount of memory, so it can be considered as > > the bandwidth of logging process is used due to the buggy process. > > > > Please consider the following case. If a thread who requests swap-out > > is charged, the thread is charged other threads' I/O. > > > > (1) -------- (2) > > Process A | | Process B > > mmaps a large area in --> | memory | <-- tries to allocate a page. > > the memory and writes | | > > data to there. -------- (3) > > | To get a free page, > > | the data written by Proc.A > > | is written out to the disk. > > V The I/O is done by using > > --------- Proc.B's bandwidth. > > | disk | > > --------- > > > > Thus I think that page owners should be charged against bandwidth. > > > Ok, no good way. yours is wrong, mine is wrong, too. > plz find 3rd way, reasonable. > > Below is brief thinking. > > "Why process A should be charged to I/O when it just maps anon memory ?" > I can't answer this. > > Even in yorr case, Process B requests memory and get penalty. It's > very natural, I think. > > In usual case, > - if process A maps ANON, there will be no I/O. > - if process A maps FILE, it will be charged to process A. > ok ? > > Under memory pressure, > - if process A maps ANON, swap I/O should be charged to process B. > - if process A maps FILE, I/O should be charged to process A. > maybe. I think that even process A maps ANON, it should be charged to process A because the memory pressure is caused by process A. It seems natual for me that a process which consumes more resources is more likely to get penalty. > Anyway, there will be ineraction with dirty_ratio of memcg (not implemeted yet) > and _Owner should be charged_ issue will be handled in this dirty_ratio layer. > More consideration is necessary, I think. I'll keep thinking how it should be done. Thanks, Ryo Tsuruta -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel
Index: linux-2.6.31-rc3/fs/buffer.c =================================================================== --- linux-2.6.31-rc3.orig/fs/buffer.c +++ linux-2.6.31-rc3/fs/buffer.c @@ -36,6 +36,7 @@ #include <linux/buffer_head.h> #include <linux/task_io_accounting_ops.h> #include <linux/bio.h> +#include <linux/biotrack.h> #include <linux/notifier.h> #include <linux/cpu.h> #include <linux/bitops.h> @@ -668,6 +669,7 @@ static void __set_page_dirty(struct page if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); account_page_dirtied(page, mapping); + blkio_cgroup_reset_owner_pagedirty(page, current->mm); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } Index: linux-2.6.31-rc3/fs/direct-io.c =================================================================== --- linux-2.6.31-rc3.orig/fs/direct-io.c +++ linux-2.6.31-rc3/fs/direct-io.c @@ -33,6 +33,7 @@ #include <linux/err.h> #include <linux/blkdev.h> #include <linux/buffer_head.h> +#include <linux/biotrack.h> #include <linux/rwsem.h> #include <linux/uio.h> #include <asm/atomic.h> @@ -797,6 +798,7 @@ static int do_direct_IO(struct dio *dio) ret = PTR_ERR(page); goto out; } + blkio_cgroup_reset_owner(page, current->mm); while (block_in_page < blocks_per_page) { unsigned offset_in_page = block_in_page << blkbits; Index: linux-2.6.31-rc3/mm/bounce.c =================================================================== --- linux-2.6.31-rc3.orig/mm/bounce.c +++ linux-2.6.31-rc3/mm/bounce.c @@ -13,6 +13,7 @@ #include <linux/init.h> #include <linux/hash.h> #include <linux/highmem.h> +#include <linux/biotrack.h> #include <asm/tlbflush.h> #include <trace/events/block.h> @@ -210,6 +211,7 @@ static void __blk_queue_bounce(struct re to->bv_len = from->bv_len; to->bv_offset = from->bv_offset; inc_zone_page_state(to->bv_page, NR_BOUNCE); + blkio_cgroup_copy_owner(to->bv_page, page); if (rw == WRITE) { char *vto, *vfrom; Index: linux-2.6.31-rc3/mm/filemap.c =================================================================== --- linux-2.6.31-rc3.orig/mm/filemap.c +++ linux-2.6.31-rc3/mm/filemap.c @@ -33,6 +33,7 @@ #include <linux/cpuset.h> #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ #include <linux/memcontrol.h> +#include <linux/biotrack.h> #include <linux/mm_inline.h> /* for page_is_file_cache() */ #include "internal.h" @@ -464,6 +465,7 @@ int add_to_page_cache_locked(struct page gfp_mask & GFP_RECLAIM_MASK); if (error) goto out; + blkio_cgroup_set_owner(page, current->mm); error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); if (error == 0) { Index: linux-2.6.31-rc3/mm/memory.c =================================================================== --- linux-2.6.31-rc3.orig/mm/memory.c +++ linux-2.6.31-rc3/mm/memory.c @@ -51,6 +51,7 @@ #include <linux/init.h> #include <linux/writeback.h> #include <linux/memcontrol.h> +#include <linux/biotrack.h> #include <linux/mmu_notifier.h> #include <linux/kallsyms.h> #include <linux/swapops.h> @@ -2115,6 +2116,7 @@ gotten: */ ptep_clear_flush_notify(vma, address, page_table); page_add_new_anon_rmap(new_page, vma, address); + blkio_cgroup_set_owner(new_page, mm); set_pte_at(mm, address, page_table, entry); update_mmu_cache(vma, address, entry); if (old_page) { @@ -2580,6 +2582,7 @@ static int do_swap_page(struct mm_struct flush_icache_page(vma, page); set_pte_at(mm, address, page_table, pte); page_add_anon_rmap(page, vma, address); + blkio_cgroup_reset_owner(page, mm); /* It's better to call commit-charge after rmap is established */ mem_cgroup_commit_charge_swapin(page, ptr); @@ -2644,6 +2647,7 @@ static int do_anonymous_page(struct mm_s goto release; inc_mm_counter(mm, anon_rss); page_add_new_anon_rmap(page, vma, address); + blkio_cgroup_set_owner(page, mm); set_pte_at(mm, address, page_table, entry); /* No need to invalidate - it was non-present before */ @@ -2791,6 +2795,7 @@ static int __do_fault(struct mm_struct * if (anon) { inc_mm_counter(mm, anon_rss); page_add_new_anon_rmap(page, vma, address); + blkio_cgroup_set_owner(page, mm); } else { inc_mm_counter(mm, file_rss); page_add_file_rmap(page); Index: linux-2.6.31-rc3/mm/page-writeback.c =================================================================== --- linux-2.6.31-rc3.orig/mm/page-writeback.c +++ linux-2.6.31-rc3/mm/page-writeback.c @@ -23,6 +23,7 @@ #include <linux/init.h> #include <linux/backing-dev.h> #include <linux/task_io_accounting_ops.h> +#include <linux/biotrack.h> #include <linux/blkdev.h> #include <linux/mpage.h> #include <linux/rmap.h> @@ -1247,6 +1248,7 @@ int __set_page_dirty_nobuffers(struct pa BUG_ON(mapping2 != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); account_page_dirtied(page, mapping); + blkio_cgroup_reset_owner_pagedirty(page, current->mm); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } Index: linux-2.6.31-rc3/mm/swap_state.c =================================================================== --- linux-2.6.31-rc3.orig/mm/swap_state.c +++ linux-2.6.31-rc3/mm/swap_state.c @@ -18,6 +18,7 @@ #include <linux/pagevec.h> #include <linux/migrate.h> #include <linux/page_cgroup.h> +#include <linux/biotrack.h> #include <asm/pgtable.h> @@ -307,6 +308,7 @@ struct page *read_swap_cache_async(swp_e */ __set_page_locked(new_page); SetPageSwapBacked(new_page); + blkio_cgroup_set_owner(new_page, current->mm); err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); if (likely(!err)) { /*