Message ID | 20200629234507.CA0FDE19@viggo.jf.intel.com (mailing list archive)
---|---
State | New, archived
Series | Migrate Pages in lieu of discard
Dave Hansen <dave.hansen@linux.intel.com> wrote:
> From: Keith Busch <kbusch@kernel.org>
>
> Migrating pages had been allocating the new page before it was actually
> needed. Subsequent operations may still fail, which would have to handle
> cleaning up the newly allocated page when it was never used.
>
> Defer allocating the page until we are actually ready to make use of
> it, after locking the original page. This simplifies error handling,
> but should not have any functional change in behavior. This is just
> refactoring page migration so the main part can more easily be reused
> by other code.

Is there any concern that the src page is now held PG_locked over the
dst page allocation, which might wander into
reclaim/cond_resched/oom_kill? I don't have a deadlock in mind. I'm
just wondering about the additional latency imposed on unrelated
threads who want access to the src page.

> #Signed-off-by: Keith Busch <keith.busch@intel.com>

Is the commented Signed-off-by intentional? The same applies to later
patches.

> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: Keith Busch <kbusch@kernel.org>
> Cc: Yang Shi <yang.shi@linux.alibaba.com>
> Cc: David Rientjes <rientjes@google.com>
> Cc: Huang Ying <ying.huang@intel.com>
> Cc: Dan Williams <dan.j.williams@intel.com>
> ---
>
>  b/mm/migrate.c |  148 ++++++++++++++++++++++++++++-----------------------------
>  1 file changed, 75 insertions(+), 73 deletions(-)
>
> diff -puN mm/migrate.c~0007-mm-migrate-Defer-allocating-new-page-until-needed mm/migrate.c
> --- a/mm/migrate.c~0007-mm-migrate-Defer-allocating-new-page-until-needed	2020-06-29 16:34:37.896312607 -0700
> +++ b/mm/migrate.c	2020-06-29 16:34:37.900312607 -0700
> @@ -1014,56 +1014,17 @@ out:
>  	return rc;
>  }
>
> -static int __unmap_and_move(struct page *page, struct page *newpage,
> -				int force, enum migrate_mode mode)
> +static int __unmap_and_move(new_page_t get_new_page,
> +			    free_page_t put_new_page,
> +			    unsigned long private, struct page *page,
> +			    enum migrate_mode mode,
> +			    enum migrate_reason reason)
>  {
>  	int rc = -EAGAIN;
>  	int page_was_mapped = 0;
>  	struct anon_vma *anon_vma = NULL;
>  	bool is_lru = !__PageMovable(page);
> -
> -	if (!trylock_page(page)) {
> -		if (!force || mode == MIGRATE_ASYNC)
> -			goto out;
> -
> -		/*
> -		 * It's not safe for direct compaction to call lock_page.
> -		 * For example, during page readahead pages are added locked
> -		 * to the LRU. Later, when the IO completes the pages are
> -		 * marked uptodate and unlocked. However, the queueing
> -		 * could be merging multiple pages for one bio (e.g.
> -		 * mpage_readpages). If an allocation happens for the
> -		 * second or third page, the process can end up locking
> -		 * the same page twice and deadlocking. Rather than
> -		 * trying to be clever about what pages can be locked,
> -		 * avoid the use of lock_page for direct compaction
> -		 * altogether.
> -		 */
> -		if (current->flags & PF_MEMALLOC)
> -			goto out;
> -
> -		lock_page(page);
> -	}
> -
> -	if (PageWriteback(page)) {
> -		/*
> -		 * Only in the case of a full synchronous migration is it
> -		 * necessary to wait for PageWriteback. In the async case,
> -		 * the retry loop is too short and in the sync-light case,
> -		 * the overhead of stalling is too much
> -		 */
> -		switch (mode) {
> -		case MIGRATE_SYNC:
> -		case MIGRATE_SYNC_NO_COPY:
> -			break;
> -		default:
> -			rc = -EBUSY;
> -			goto out_unlock;
> -		}
> -		if (!force)
> -			goto out_unlock;
> -		wait_on_page_writeback(page);
> -	}
> +	struct page *newpage;
>
>  	/*
>  	 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
> @@ -1082,6 +1043,12 @@ static int __unmap_and_move(struct page
>  	if (PageAnon(page) && !PageKsm(page))
>  		anon_vma = page_get_anon_vma(page);
>
> +	newpage = get_new_page(page, private);
> +	if (!newpage) {
> +		rc = -ENOMEM;
> +		goto out;
> +	}
> +
>  	/*
>  	 * Block others from accessing the new page when we get around to
>  	 * establishing additional references. We are usually the only one
> @@ -1091,11 +1058,11 @@ static int __unmap_and_move(struct page
>  	 * This is much like races on refcount of oldpage: just don't BUG().
>  	 */
>  	if (unlikely(!trylock_page(newpage)))
> -		goto out_unlock;
> +		goto out_put;
>
>  	if (unlikely(!is_lru)) {
>  		rc = move_to_new_page(newpage, page, mode);
> -		goto out_unlock_both;
> +		goto out_unlock;
>  	}
>
>  	/*
> @@ -1114,7 +1081,7 @@ static int __unmap_and_move(struct page
>  		VM_BUG_ON_PAGE(PageAnon(page), page);
>  		if (page_has_private(page)) {
>  			try_to_free_buffers(page);
> -			goto out_unlock_both;
> +			goto out_unlock;
>  		}
>  	} else if (page_mapped(page)) {
>  		/* Establish migration ptes */
> @@ -1131,15 +1098,9 @@ static int __unmap_and_move(struct page
>  	if (page_was_mapped)
>  		remove_migration_ptes(page,
>  			rc == MIGRATEPAGE_SUCCESS ? newpage : page, false);
> -
> -out_unlock_both:
> -	unlock_page(newpage);
>  out_unlock:
> -	/* Drop an anon_vma reference if we took one */
> -	if (anon_vma)
> -		put_anon_vma(anon_vma);
> -	unlock_page(page);
> -out:
> +	unlock_page(newpage);
> +out_put:
>  	/*
>  	 * If migration is successful, decrease refcount of the newpage
>  	 * which will not free the page because new page owner increased
> @@ -1150,12 +1111,20 @@ out:
>  	 * state.
>  	 */
>  	if (rc == MIGRATEPAGE_SUCCESS) {
> +		set_page_owner_migrate_reason(newpage, reason);
>  		if (unlikely(!is_lru))
>  			put_page(newpage);
>  		else
>  			putback_lru_page(newpage);
> +	} else if (put_new_page) {
> +		put_new_page(newpage, private);
> +	} else {
> +		put_page(newpage);
>  	}
> -
> +out:
> +	/* Drop an anon_vma reference if we took one */
> +	if (anon_vma)
> +		put_anon_vma(anon_vma);
>  	return rc;
>  }
>
> @@ -1203,8 +1172,7 @@ static ICE_noinline int unmap_and_move(n
>  				   int force, enum migrate_mode mode,
>  				   enum migrate_reason reason)
>  {
> -	int rc = MIGRATEPAGE_SUCCESS;
> -	struct page *newpage = NULL;
> +	int rc = -EAGAIN;
>
>  	if (!thp_migration_supported() && PageTransHuge(page))
>  		return -ENOMEM;
> @@ -1219,17 +1187,57 @@ static ICE_noinline int unmap_and_move(n
>  				__ClearPageIsolated(page);
>  			unlock_page(page);
>  		}
> +		rc = MIGRATEPAGE_SUCCESS;
>  		goto out;
>  	}
>
> -	newpage = get_new_page(page, private);
> -	if (!newpage)
> -		return -ENOMEM;
> +	if (!trylock_page(page)) {
> +		if (!force || mode == MIGRATE_ASYNC)
> +			return rc;
>
> -	rc = __unmap_and_move(page, newpage, force, mode);
> -	if (rc == MIGRATEPAGE_SUCCESS)
> -		set_page_owner_migrate_reason(newpage, reason);
> +		/*
> +		 * It's not safe for direct compaction to call lock_page.
> +		 * For example, during page readahead pages are added locked
> +		 * to the LRU. Later, when the IO completes the pages are
> +		 * marked uptodate and unlocked. However, the queueing
> +		 * could be merging multiple pages for one bio (e.g.
> +		 * mpage_readpages). If an allocation happens for the
> +		 * second or third page, the process can end up locking
> +		 * the same page twice and deadlocking. Rather than
> +		 * trying to be clever about what pages can be locked,
> +		 * avoid the use of lock_page for direct compaction
> +		 * altogether.
> +		 */
> +		if (current->flags & PF_MEMALLOC)
> +			return rc;
> +
> +		lock_page(page);
> +	}
> +
> +	if (PageWriteback(page)) {
> +		/*
> +		 * Only in the case of a full synchronous migration is it
> +		 * necessary to wait for PageWriteback. In the async case,
> +		 * the retry loop is too short and in the sync-light case,
> +		 * the overhead of stalling is too much
> +		 */
> +		switch (mode) {
> +		case MIGRATE_SYNC:
> +		case MIGRATE_SYNC_NO_COPY:
> +			break;
> +		default:
> +			rc = -EBUSY;
> +			goto out_unlock;
> +		}
> +		if (!force)
> +			goto out_unlock;
> +		wait_on_page_writeback(page);
> +	}
> +	rc = __unmap_and_move(get_new_page, put_new_page, private,
> +			      page, mode, reason);
>
> +out_unlock:
> +	unlock_page(page);
>  out:
>  	if (rc != -EAGAIN) {
>  		/*
> @@ -1269,9 +1277,8 @@ out:
>  	if (rc != -EAGAIN) {
>  		if (likely(!__PageMovable(page))) {
>  			putback_lru_page(page);
> -			goto put_new;
> +			goto done;
>  		}
> -
>  		lock_page(page);
>  		if (PageMovable(page))
>  			putback_movable_page(page);
> @@ -1280,13 +1287,8 @@ out:
>  		unlock_page(page);
>  		put_page(page);
>  	}
> -put_new:
> -	if (put_new_page)
> -		put_new_page(newpage, private);
> -	else
> -		put_page(newpage);
>  	}
> -
> +done:
>  	return rc;
>  }
>
> _
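For orientation, here is a condensed, non-literal sketch of the unwind order the patch establishes in __unmap_and_move(): the target page is allocated only once the source page is already locked, so each failure point only has to clean up what came before it. This is a simplified rendering of the diff above, not the exact kernel code:

```c
/*
 * Simplified sketch of the refactored __unmap_and_move() unwind order
 * (details elided; see the diff above for the real code).
 */
static int __unmap_and_move_sketch(new_page_t get_new_page,
				   free_page_t put_new_page,
				   unsigned long private, struct page *page,
				   enum migrate_mode mode,
				   enum migrate_reason reason)
{
	struct page *newpage;
	int rc = -EAGAIN;

	/* The source page is already locked by the caller at this point. */
	newpage = get_new_page(page, private);	/* deferred allocation */
	if (!newpage)
		return -ENOMEM;			/* nothing to clean up yet */

	if (!trylock_page(newpage))
		goto out_put;

	/* ... unmap source, move_to_new_page(), fix up migration ptes ... */

	unlock_page(newpage);
out_put:
	/* An unused target page goes back where it came from. */
	if (rc != MIGRATEPAGE_SUCCESS) {
		if (put_new_page)
			put_new_page(newpage, private);
		else
			put_page(newpage);
	}
	return rc;
}
```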
On 7/1/20 1:47 AM, Greg Thelen wrote:
> Dave Hansen <dave.hansen@linux.intel.com> wrote:
>> From: Keith Busch <kbusch@kernel.org>
>> Defer allocating the page until we are actually ready to make use of
>> it, after locking the original page. This simplifies error handling,
>> but should not have any functional change in behavior. This is just
>> refactoring page migration so the main part can more easily be reused
>> by other code.
>
> Is there any concern that the src page is now held PG_locked over the
> dst page allocation, which might wander into
> reclaim/cond_resched/oom_kill? I don't have a deadlock in mind. I'm
> just wondering about the additional latency imposed on unrelated
> threads who want access to the src page.

It's not great. *But*, the alternative is to toss the page contents out
and let users encounter a fault and an allocation. They would be
subject to all the latency associated with an allocation, just at a
slightly later time.

If it's a problem, it seems like it would be pretty easy to fix, at
least for non-cgroup reclaim. We know which node we're reclaiming from
and we know if it has a demotion path, so we could proactively allocate
a single migration target page before doing the source lock_page().
That creates some other problems, but I think it would be
straightforward.

>> #Signed-off-by: Keith Busch <keith.busch@intel.com>
>
> Is the commented Signed-off-by intentional? The same applies to later
> patches.

Yes, Keith is no longer at Intel, so that @intel.com mail would bounce.
I left the @intel.com SoB so it would be clear that the code originated
from Keith while at Intel, but commented it out to avoid it being
picked up by anyone's tooling.
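To make the "proactively allocate before lock_page()" idea concrete, a minimal sketch follows. The helper name, GFP flag choice, and call site are hypothetical, and next_demotion_node() is assumed from elsewhere in this series; none of this is part of the posted patch:

```c
/*
 * Hypothetical sketch: grab one migration target from the demotion
 * node before taking the source page lock, so the allocation latency
 * is no longer imposed on other threads waiting on PG_locked.
 */
static struct page *prealloc_demote_page(int src_nid)
{
	int demote_nid = next_demotion_node(src_nid);	/* assumed helper */

	if (demote_nid == NUMA_NO_NODE)
		return NULL;

	/* Stay on the demotion node and avoid recursing into reclaim. */
	return __alloc_pages_node(demote_nid,
				  GFP_NOWAIT | __GFP_THISNODE | __GFP_NOWARN,
				  0);
}

/*
 * Caller sketch:
 *
 *	newpage = prealloc_demote_page(page_to_nid(page));
 *	if (!newpage)
 *		return -ENOMEM;
 *	lock_page(page);	// src lock taken after the allocation
 */
```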
On 7/1/20 7:46 AM, Dave Hansen wrote:
> On 7/1/20 1:47 AM, Greg Thelen wrote:
>> Dave Hansen <dave.hansen@linux.intel.com> wrote:
>>> From: Keith Busch <kbusch@kernel.org>
>>> Defer allocating the page until we are actually ready to make use of
>>> it, after locking the original page. This simplifies error handling,
>>> but should not have any functional change in behavior. This is just
>>> refactoring page migration so the main part can more easily be reused
>>> by other code.
>> Is there any concern that the src page is now held PG_locked over the
>> dst page allocation, which might wander into
>> reclaim/cond_resched/oom_kill? I don't have a deadlock in mind. I'm
>> just wondering about the additional latency imposed on unrelated
>> threads who want access to the src page.
> It's not great. *But*, the alternative is to toss the page contents out
> and let users encounter a fault and an allocation. They would be
> subject to all the latency associated with an allocation, just at a
> slightly later time.
>
> If it's a problem, it seems like it would be pretty easy to fix, at
> least for non-cgroup reclaim. We know which node we're reclaiming from
> and we know if it has a demotion path, so we could proactively allocate
> a single migration target page before doing the source lock_page().
> That creates some other problems, but I think it would be
> straightforward.

If so, this patch looks pointless if I read it correctly. The patch
defers page allocation in __unmap_and_move() under the page lock so
that __unmap_and_move() can be called in the reclaim path, since the
src page is locked in the reclaim path before __unmap_and_move() is
called; otherwise it would deadlock itself.

Actually, with this implementation you always allocate the target page
with the src page locked, unless you move the target page allocation
before shrink_page_list(). But then the problem is that you don't know
how many pages you need to allocate.

The alternative may be to unlock the src page, then allocate the target
page, then lock the src page again. But if so, why not just call
migrate_pages() directly as I did in my series? It puts the src pages
on a separate list, then unlocks them, then migrates them in a batch
later.

>>> #Signed-off-by: Keith Busch <keith.busch@intel.com>
>> Is the commented Signed-off-by intentional? The same applies to later
>> patches.
> Yes, Keith is no longer at Intel, so that @intel.com mail would bounce.
> I left the @intel.com SoB so it would be clear that the code originated
> from Keith while at Intel, but commented it out to avoid it being
> picked up by anyone's tooling.
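A rough sketch of the batching scheme described above, under the assumption that reclaim has already isolated the candidate pages onto a private list and unlocked them; demote_alloc(), demote_page_list(), and MR_DEMOTION are illustrative names, not upstream APIs at the time of this thread:

```c
/* Hypothetical target allocator handed to migrate_pages(). */
static struct page *demote_alloc(struct page *page, unsigned long private)
{
	int target_nid = *(int *)private;

	return __alloc_pages_node(target_nid,
				  GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
				  0);
}

/*
 * Batch variant: the caller (e.g. shrink_page_list()) collects demotion
 * candidates on 'demote_pages' and drops their locks; migrate_pages()
 * then relocks and migrates them one by one, so no target allocation
 * ever happens under a src page lock held by reclaim.
 */
static void demote_page_list(struct list_head *demote_pages, int target_nid)
{
	if (list_empty(demote_pages))
		return;

	migrate_pages(demote_pages, demote_alloc, NULL,
		      (unsigned long)&target_nid,
		      MIGRATE_ASYNC, MR_DEMOTION);	/* MR_DEMOTION assumed */
}
```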