diff mbox series

[v3,5/6] mm/gup: migrate pinned pages out of movable zone

Message ID 20201211202140.396852-6-pasha.tatashin@soleen.com (mailing list archive)
State New, archived
Headers show
Series prohibit pinning pages in ZONE_MOVABLE | expand

Commit Message

Pasha Tatashin Dec. 11, 2020, 8:21 p.m. UTC
We should not pin pages in ZONE_MOVABLE. Currently, we do not pin only
movable CMA pages. Generalize the function that migrates CMA pages to
migrate all movable pages. Use is_pinnable_page() to check which
pages need to be migrated

Signed-off-by: Pavel Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
---
 include/linux/migrate.h        |  1 +
 include/linux/mmzone.h         | 11 ++++--
 include/trace/events/migrate.h |  3 +-
 mm/gup.c                       | 66 ++++++++++++++--------------------
 4 files changed, 38 insertions(+), 43 deletions(-)

Comments

Jason Gunthorpe Dec. 11, 2020, 8:23 p.m. UTC | #1
On Fri, Dec 11, 2020 at 03:21:39PM -0500, Pavel Tatashin wrote:
> @@ -1593,7 +1592,7 @@ static long check_and_migrate_cma_pages(struct mm_struct *mm,
>  				}
>  
>  				if (!isolate_lru_page(head)) {
> -					list_add_tail(&head->lru, &cma_page_list);
> +					list_add_tail(&head->lru, &movable_page_list);
>  					mod_node_page_state(page_pgdat(head),
>  							    NR_ISOLATED_ANON +
>  							    page_is_file_lru(head),
> @@ -1605,7 +1604,7 @@ static long check_and_migrate_cma_pages(struct mm_struct *mm,
>  		i += step;
>  	}
>  
> -	if (!list_empty(&cma_page_list)) {
> +	if (!list_empty(&movable_page_list)) {

You didn't answer my earlier question, is it OK that ZONE_MOVABLE
pages leak out here if ioslate_lru_page() fails but the
moval_page_list is empty? 

I think the answer is no, right?

Jason
Pasha Tatashin Dec. 11, 2020, 8:40 p.m. UTC | #2
On Fri, Dec 11, 2020 at 3:23 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Fri, Dec 11, 2020 at 03:21:39PM -0500, Pavel Tatashin wrote:
> > @@ -1593,7 +1592,7 @@ static long check_and_migrate_cma_pages(struct mm_struct *mm,
> >                               }
> >
> >                               if (!isolate_lru_page(head)) {
> > -                                     list_add_tail(&head->lru, &cma_page_list);
> > +                                     list_add_tail(&head->lru, &movable_page_list);
> >                                       mod_node_page_state(page_pgdat(head),
> >                                                           NR_ISOLATED_ANON +
> >                                                           page_is_file_lru(head),
> > @@ -1605,7 +1604,7 @@ static long check_and_migrate_cma_pages(struct mm_struct *mm,
> >               i += step;
> >       }
> >
> > -     if (!list_empty(&cma_page_list)) {
> > +     if (!list_empty(&movable_page_list)) {
>
> You didn't answer my earlier question, is it OK that ZONE_MOVABLE
> pages leak out here if ioslate_lru_page() fails but the
> moval_page_list is empty?
>
> I think the answer is no, right?
In my opinion it is OK. We are doing our best to not pin movable
pages, but if isolate_lru_page() fails because pages are currently
locked by someone else, we will end up long-term pinning them.
See comment in this patch:
+        * 1. Pinned pages: (long-term) pinning of movable pages is avoided
+        *    when pages are pinned and faulted, but it is still possible that
+        *    address space already has pages in ZONE_MOVABLE at the time when
+        *    pages are pinned (i.e. user has touches that memory before
+        *    pinning). In such case we try to migrate them to a different zone,
+        *    but if migration fails the pages can still end-up pinned in
+        *    ZONE_MOVABLE. In such case, memory offlining might retry a long
+        *    time and will only succeed once user application unpins pages.


>
> Jason
Jason Gunthorpe Dec. 11, 2020, 8:46 p.m. UTC | #3
On Fri, Dec 11, 2020 at 03:40:57PM -0500, Pavel Tatashin wrote:
> On Fri, Dec 11, 2020 at 3:23 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> >
> > On Fri, Dec 11, 2020 at 03:21:39PM -0500, Pavel Tatashin wrote:
> > > @@ -1593,7 +1592,7 @@ static long check_and_migrate_cma_pages(struct mm_struct *mm,
> > >                               }
> > >
> > >                               if (!isolate_lru_page(head)) {
> > > -                                     list_add_tail(&head->lru, &cma_page_list);
> > > +                                     list_add_tail(&head->lru, &movable_page_list);
> > >                                       mod_node_page_state(page_pgdat(head),
> > >                                                           NR_ISOLATED_ANON +
> > >                                                           page_is_file_lru(head),
> > > @@ -1605,7 +1604,7 @@ static long check_and_migrate_cma_pages(struct mm_struct *mm,
> > >               i += step;
> > >       }
> > >
> > > -     if (!list_empty(&cma_page_list)) {
> > > +     if (!list_empty(&movable_page_list)) {
> >
> > You didn't answer my earlier question, is it OK that ZONE_MOVABLE
> > pages leak out here if ioslate_lru_page() fails but the
> > moval_page_list is empty?
> >
> > I think the answer is no, right?
> In my opinion it is OK. We are doing our best to not pin movable
> pages, but if isolate_lru_page() fails because pages are currently
> locked by someone else, we will end up long-term pinning them.
> See comment in this patch:
> +        * 1. Pinned pages: (long-term) pinning of movable pages is avoided
> +        *    when pages are pinned and faulted, but it is still possible that
> +        *    address space already has pages in ZONE_MOVABLE at the time when
> +        *    pages are pinned (i.e. user has touches that memory before
> +        *    pinning). In such case we try to migrate them to a different zone,
> +        *    but if migration fails the pages can still end-up pinned in
> +        *    ZONE_MOVABLE. In such case, memory offlining might retry a long
> +        *    time and will only succeed once user application unpins pages.

It is not "retry a long time" it is "might never complete" because
userspace will hold the DMA pin indefinitely.

Confused what the point of all this is then ??

I thought to goal here is to make memory unplug reliable, if you leave
a hole like this then any hostile userspace can block it forever.

Jason
Pasha Tatashin Dec. 11, 2020, 9:09 p.m. UTC | #4
On Fri, Dec 11, 2020 at 3:46 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Fri, Dec 11, 2020 at 03:40:57PM -0500, Pavel Tatashin wrote:
> > On Fri, Dec 11, 2020 at 3:23 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> > >
> > > On Fri, Dec 11, 2020 at 03:21:39PM -0500, Pavel Tatashin wrote:
> > > > @@ -1593,7 +1592,7 @@ static long check_and_migrate_cma_pages(struct mm_struct *mm,
> > > >                               }
> > > >
> > > >                               if (!isolate_lru_page(head)) {
> > > > -                                     list_add_tail(&head->lru, &cma_page_list);
> > > > +                                     list_add_tail(&head->lru, &movable_page_list);
> > > >                                       mod_node_page_state(page_pgdat(head),
> > > >                                                           NR_ISOLATED_ANON +
> > > >                                                           page_is_file_lru(head),
> > > > @@ -1605,7 +1604,7 @@ static long check_and_migrate_cma_pages(struct mm_struct *mm,
> > > >               i += step;
> > > >       }
> > > >
> > > > -     if (!list_empty(&cma_page_list)) {
> > > > +     if (!list_empty(&movable_page_list)) {
> > >
> > > You didn't answer my earlier question, is it OK that ZONE_MOVABLE
> > > pages leak out here if ioslate_lru_page() fails but the
> > > moval_page_list is empty?
> > >
> > > I think the answer is no, right?
> > In my opinion it is OK. We are doing our best to not pin movable
> > pages, but if isolate_lru_page() fails because pages are currently
> > locked by someone else, we will end up long-term pinning them.
> > See comment in this patch:
> > +        * 1. Pinned pages: (long-term) pinning of movable pages is avoided
> > +        *    when pages are pinned and faulted, but it is still possible that
> > +        *    address space already has pages in ZONE_MOVABLE at the time when
> > +        *    pages are pinned (i.e. user has touches that memory before
> > +        *    pinning). In such case we try to migrate them to a different zone,
> > +        *    but if migration fails the pages can still end-up pinned in
> > +        *    ZONE_MOVABLE. In such case, memory offlining might retry a long
> > +        *    time and will only succeed once user application unpins pages.
>
> It is not "retry a long time" it is "might never complete" because
> userspace will hold the DMA pin indefinitely.
>
> Confused what the point of all this is then ??
>
> I thought to goal here is to make memory unplug reliable, if you leave
> a hole like this then any hostile userspace can block it forever.

You are right, I used a wording from the previous comment, and it
should be made clear that pin may be forever. Without these patches it
is guaranteed that hot-remove will fail if there are pinned pages as
ZONE_MOVABLE is actually the first to be searched. Now, it will fail
only due to exceptions listed in ZONE_MOVABLE comment:

1. pin + migration/isolation failure
2. memblock allocation due to limited amount of space for kernelcore
3. memory holes
4. hwpoison
5. Unmovable PG_offline pages (? need to study why this is a scenario).

Do you think we should unconditionally unpin pages, and return error
when isolation/migration fails?

Pasha

>
> Jason
David Hildenbrand Dec. 11, 2020, 9:29 p.m. UTC | #5
> Am 11.12.2020 um 22:09 schrieb Pavel Tatashin <pasha.tatashin@soleen.com>:
> 
> On Fri, Dec 11, 2020 at 3:46 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>> 
>>> On Fri, Dec 11, 2020 at 03:40:57PM -0500, Pavel Tatashin wrote:
>>> On Fri, Dec 11, 2020 at 3:23 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>>>> 
>>>> On Fri, Dec 11, 2020 at 03:21:39PM -0500, Pavel Tatashin wrote:
>>>>> @@ -1593,7 +1592,7 @@ static long check_and_migrate_cma_pages(struct mm_struct *mm,
>>>>>                              }
>>>>> 
>>>>>                              if (!isolate_lru_page(head)) {
>>>>> -                                     list_add_tail(&head->lru, &cma_page_list);
>>>>> +                                     list_add_tail(&head->lru, &movable_page_list);
>>>>>                                      mod_node_page_state(page_pgdat(head),
>>>>>                                                          NR_ISOLATED_ANON +
>>>>>                                                          page_is_file_lru(head),
>>>>> @@ -1605,7 +1604,7 @@ static long check_and_migrate_cma_pages(struct mm_struct *mm,
>>>>>              i += step;
>>>>>      }
>>>>> 
>>>>> -     if (!list_empty(&cma_page_list)) {
>>>>> +     if (!list_empty(&movable_page_list)) {
>>>> 
>>>> You didn't answer my earlier question, is it OK that ZONE_MOVABLE
>>>> pages leak out here if ioslate_lru_page() fails but the
>>>> moval_page_list is empty?
>>>> 
>>>> I think the answer is no, right?
>>> In my opinion it is OK. We are doing our best to not pin movable
>>> pages, but if isolate_lru_page() fails because pages are currently
>>> locked by someone else, we will end up long-term pinning them.
>>> See comment in this patch:
>>> +        * 1. Pinned pages: (long-term) pinning of movable pages is avoided
>>> +        *    when pages are pinned and faulted, but it is still possible that
>>> +        *    address space already has pages in ZONE_MOVABLE at the time when
>>> +        *    pages are pinned (i.e. user has touches that memory before
>>> +        *    pinning). In such case we try to migrate them to a different zone,
>>> +        *    but if migration fails the pages can still end-up pinned in
>>> +        *    ZONE_MOVABLE. In such case, memory offlining might retry a long
>>> +        *    time and will only succeed once user application unpins pages.
>> 
>> It is not "retry a long time" it is "might never complete" because
>> userspace will hold the DMA pin indefinitely.
>> 
>> Confused what the point of all this is then ??
>> 
>> I thought to goal here is to make memory unplug reliable, if you leave
>> a hole like this then any hostile userspace can block it forever.
> 
> You are right, I used a wording from the previous comment, and it
> should be made clear that pin may be forever. Without these patches it
> is guaranteed that hot-remove will fail if there are pinned pages as
> ZONE_MOVABLE is actually the first to be searched. Now, it will fail
> only due to exceptions listed in ZONE_MOVABLE comment:
> 
> 1. pin + migration/isolation failure

Not sure what that really means. We have short-term pinnings (although we might have a better term for „pinning“ here) for example, when a process dies (IIRC). There is a period where pages cannot get migrated and offlining code has to retry (which might take a while). This still applies after your change - are you referring to that?

> 2. memblock allocation due to limited amount of space for kernelcore
> 3. memory holes
> 4. hwpoison
> 5. Unmovable PG_offline pages (? need to study why this is a scenario).

Virtio-mem is the primary user in this context.

> Do you think we should unconditionally unpin pages, and return error
> when isolation/migration fails?

I‘m not sure what you mean here. Who’s supposed to unpin which pages?

> 
> Pasha
> 
>> 
>> Jason
>
Pasha Tatashin Dec. 11, 2020, 9:35 p.m. UTC | #6
On Fri, Dec 11, 2020 at 4:29 PM David Hildenbrand <david@redhat.com> wrote:
>
>
> > Am 11.12.2020 um 22:09 schrieb Pavel Tatashin <pasha.tatashin@soleen.com>:
> >
> > On Fri, Dec 11, 2020 at 3:46 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> >>
> >>> On Fri, Dec 11, 2020 at 03:40:57PM -0500, Pavel Tatashin wrote:
> >>> On Fri, Dec 11, 2020 at 3:23 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> >>>>
> >>>> On Fri, Dec 11, 2020 at 03:21:39PM -0500, Pavel Tatashin wrote:
> >>>>> @@ -1593,7 +1592,7 @@ static long check_and_migrate_cma_pages(struct mm_struct *mm,
> >>>>>                              }
> >>>>>
> >>>>>                              if (!isolate_lru_page(head)) {
> >>>>> -                                     list_add_tail(&head->lru, &cma_page_list);
> >>>>> +                                     list_add_tail(&head->lru, &movable_page_list);
> >>>>>                                      mod_node_page_state(page_pgdat(head),
> >>>>>                                                          NR_ISOLATED_ANON +
> >>>>>                                                          page_is_file_lru(head),
> >>>>> @@ -1605,7 +1604,7 @@ static long check_and_migrate_cma_pages(struct mm_struct *mm,
> >>>>>              i += step;
> >>>>>      }
> >>>>>
> >>>>> -     if (!list_empty(&cma_page_list)) {
> >>>>> +     if (!list_empty(&movable_page_list)) {
> >>>>
> >>>> You didn't answer my earlier question, is it OK that ZONE_MOVABLE
> >>>> pages leak out here if ioslate_lru_page() fails but the
> >>>> moval_page_list is empty?
> >>>>
> >>>> I think the answer is no, right?
> >>> In my opinion it is OK. We are doing our best to not pin movable
> >>> pages, but if isolate_lru_page() fails because pages are currently
> >>> locked by someone else, we will end up long-term pinning them.
> >>> See comment in this patch:
> >>> +        * 1. Pinned pages: (long-term) pinning of movable pages is avoided
> >>> +        *    when pages are pinned and faulted, but it is still possible that
> >>> +        *    address space already has pages in ZONE_MOVABLE at the time when
> >>> +        *    pages are pinned (i.e. user has touches that memory before
> >>> +        *    pinning). In such case we try to migrate them to a different zone,
> >>> +        *    but if migration fails the pages can still end-up pinned in
> >>> +        *    ZONE_MOVABLE. In such case, memory offlining might retry a long
> >>> +        *    time and will only succeed once user application unpins pages.
> >>
> >> It is not "retry a long time" it is "might never complete" because
> >> userspace will hold the DMA pin indefinitely.
> >>
> >> Confused what the point of all this is then ??
> >>
> >> I thought to goal here is to make memory unplug reliable, if you leave
> >> a hole like this then any hostile userspace can block it forever.
> >
> > You are right, I used a wording from the previous comment, and it
> > should be made clear that pin may be forever. Without these patches it
> > is guaranteed that hot-remove will fail if there are pinned pages as
> > ZONE_MOVABLE is actually the first to be searched. Now, it will fail
> > only due to exceptions listed in ZONE_MOVABLE comment:
> >
> > 1. pin + migration/isolation failure
>
> Not sure what that really means. We have short-term pinnings (although we might have a better term for „pinning“ here) for example, when a process dies (IIRC). There is a period where pages cannot get migrated and offlining code has to retry (which might take a while). This still applies after your change - are you referring to that?
>
> > 2. memblock allocation due to limited amount of space for kernelcore
> > 3. memory holes
> > 4. hwpoison
> > 5. Unmovable PG_offline pages (? need to study why this is a scenario).
>
> Virtio-mem is the primary user in this context.
>
> > Do you think we should unconditionally unpin pages, and return error
> > when isolation/migration fails?
>
> I‘m not sure what you mean here. Who’s supposed to unpin which pages?

Hi David,

When check_and_migrate_movable_pages() is called, the pages are
already pinned. If some of those pages are in movable zone, and we
fail to migrate or isolate them what should we do: proceed, and keep
it as exception of when movable zone can actually have pinned pages or
unpin all pages in the array, and return an error, or unpin only pages
in movable zone, and return an error?

Pasha

>
> >
> > Pasha
> >
> >>
> >> Jason
> >
>
David Hildenbrand Dec. 11, 2020, 9:53 p.m. UTC | #7
> Am 11.12.2020 um 22:36 schrieb Pavel Tatashin <pasha.tatashin@soleen.com>:
> 
> On Fri, Dec 11, 2020 at 4:29 PM David Hildenbrand <david@redhat.com> wrote:
>> 
>> 
>>>> Am 11.12.2020 um 22:09 schrieb Pavel Tatashin <pasha.tatashin@soleen.com>:
>>> 
>>> On Fri, Dec 11, 2020 at 3:46 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>>>> 
>>>>> On Fri, Dec 11, 2020 at 03:40:57PM -0500, Pavel Tatashin wrote:
>>>>> On Fri, Dec 11, 2020 at 3:23 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>>>>>> 
>>>>>> On Fri, Dec 11, 2020 at 03:21:39PM -0500, Pavel Tatashin wrote:
>>>>>>> @@ -1593,7 +1592,7 @@ static long check_and_migrate_cma_pages(struct mm_struct *mm,
>>>>>>>                             }
>>>>>>> 
>>>>>>>                             if (!isolate_lru_page(head)) {
>>>>>>> -                                     list_add_tail(&head->lru, &cma_page_list);
>>>>>>> +                                     list_add_tail(&head->lru, &movable_page_list);
>>>>>>>                                     mod_node_page_state(page_pgdat(head),
>>>>>>>                                                         NR_ISOLATED_ANON +
>>>>>>>                                                         page_is_file_lru(head),
>>>>>>> @@ -1605,7 +1604,7 @@ static long check_and_migrate_cma_pages(struct mm_struct *mm,
>>>>>>>             i += step;
>>>>>>>     }
>>>>>>> 
>>>>>>> -     if (!list_empty(&cma_page_list)) {
>>>>>>> +     if (!list_empty(&movable_page_list)) {
>>>>>> 
>>>>>> You didn't answer my earlier question, is it OK that ZONE_MOVABLE
>>>>>> pages leak out here if ioslate_lru_page() fails but the
>>>>>> moval_page_list is empty?
>>>>>> 
>>>>>> I think the answer is no, right?
>>>>> In my opinion it is OK. We are doing our best to not pin movable
>>>>> pages, but if isolate_lru_page() fails because pages are currently
>>>>> locked by someone else, we will end up long-term pinning them.
>>>>> See comment in this patch:
>>>>> +        * 1. Pinned pages: (long-term) pinning of movable pages is avoided
>>>>> +        *    when pages are pinned and faulted, but it is still possible that
>>>>> +        *    address space already has pages in ZONE_MOVABLE at the time when
>>>>> +        *    pages are pinned (i.e. user has touches that memory before
>>>>> +        *    pinning). In such case we try to migrate them to a different zone,
>>>>> +        *    but if migration fails the pages can still end-up pinned in
>>>>> +        *    ZONE_MOVABLE. In such case, memory offlining might retry a long
>>>>> +        *    time and will only succeed once user application unpins pages.
>>>> 
>>>> It is not "retry a long time" it is "might never complete" because
>>>> userspace will hold the DMA pin indefinitely.
>>>> 
>>>> Confused what the point of all this is then ??
>>>> 
>>>> I thought to goal here is to make memory unplug reliable, if you leave
>>>> a hole like this then any hostile userspace can block it forever.
>>> 
>>> You are right, I used a wording from the previous comment, and it
>>> should be made clear that pin may be forever. Without these patches it
>>> is guaranteed that hot-remove will fail if there are pinned pages as
>>> ZONE_MOVABLE is actually the first to be searched. Now, it will fail
>>> only due to exceptions listed in ZONE_MOVABLE comment:
>>> 
>>> 1. pin + migration/isolation failure
>> 
>> Not sure what that really means. We have short-term pinnings (although we might have a better term for „pinning“ here) for example, when a process dies (IIRC). There is a period where pages cannot get migrated and offlining code has to retry (which might take a while). This still applies after your change - are you referring to that?
>> 
>>> 2. memblock allocation due to limited amount of space for kernelcore
>>> 3. memory holes
>>> 4. hwpoison
>>> 5. Unmovable PG_offline pages (? need to study why this is a scenario).
>> 
>> Virtio-mem is the primary user in this context.
>> 
>>> Do you think we should unconditionally unpin pages, and return error
>>> when isolation/migration fails?
>> 
>> I‘m not sure what you mean here. Who’s supposed to unpin which pages?
> 
> Hi David,
> 
> When check_and_migrate_movable_pages() is called, the pages are
> already pinned. If some of those pages are in movable zone, and we
> fail to migrate or isolate them what should we do: proceed, and keep
> it as exception of when movable zone can actually have pinned pages or
> unpin all pages in the array, and return an error, or unpin only pages
> in movable zone, and return an error?
> 

I guess revert what we did (unpin) and return an error. The interesting question is what can make migration/isolation fail

a) out of memory: smells like a zone setup issue. Failures are acceptable I guess.

b) short term pinnings: process dying - not relevant I guess. Other cases? (Fork?)

c) ?

Once we clarified that, we actually know how likely it will be to return an error (and making vfio pinnings fail etc).

> Pasha
Pasha Tatashin Dec. 11, 2020, 11 p.m. UTC | #8
> I guess revert what we did (unpin) and return an error. The interesting question is what can make migration/isolation fail

OK. I will make the necessary changes. Let's handle errors properly.
Whatever the cause for the error, we will know it when it happens, and
when error is returned. I think I will add a 10-time retry instead of
the infinite retry that we currently have. The 10-times retry we
currently have during the hot-remove path.

>
> a) out of memory: smells like a zone setup issue. Failures are acceptable I guess.
>
> b) short term pinnings: process dying - not relevant I guess. Other cases? (Fork?)
>
> c) ?
>
> Once we clarified that, we actually know how likely it will be to return an error (and making vfio pinnings fail etc).
Jason Gunthorpe Dec. 11, 2020, 11:50 p.m. UTC | #9
On Fri, Dec 11, 2020 at 10:53:00PM +0100, David Hildenbrand wrote:

> > When check_and_migrate_movable_pages() is called, the pages are
> > already pinned. If some of those pages are in movable zone, and we
> > fail to migrate or isolate them what should we do: proceed, and
> > keep it as exception of when movable zone can actually have pinned
> > pages or unpin all pages in the array, and return an error, or
> > unpin only pages in movable zone, and return an error?
> 
> I guess revert what we did (unpin) and return an error. The
> interesting question is what can make migration/isolation fail
> 
> a) out of memory: smells like a zone setup issue. Failures are acceptable I guess.

Out of memory is reasonable..
 
> b) short term pinnings: process dying - not relevant I guess. Other cases? (Fork?)

Concurrent with non-longterm GUP users are less reasonable, fork is
not reasonable, etc..

Racing with another GUP in another thread is also not reasonable, so
failing to isolate can't be a failure

Jasnon
John Hubbard Dec. 12, 2020, 12:07 a.m. UTC | #10
On 12/11/20 3:00 PM, Pavel Tatashin wrote:
>> I guess revert what we did (unpin) and return an error. The interesting question is what can make migration/isolation fail
> 
> OK. I will make the necessary changes. Let's handle errors properly.
> Whatever the cause for the error, we will know it when it happens, and
> when error is returned. I think I will add a 10-time retry instead of
> the infinite retry that we currently have. The 10-times retry we
> currently have during the hot-remove path.

It occurs to me that maybe the pre-existing infinite loop shouldn't be
there at all? Would it be better to let the callers retry? Obviously that
would be a separate patch and I'm not sure it's safe to make that change,
but the current loop seems buried maybe too far down.

Thoughts, anyone?

thanks,
David Hildenbrand Dec. 12, 2020, 7:29 a.m. UTC | #11
> Am 12.12.2020 um 00:50 schrieb Jason Gunthorpe <jgg@ziepe.ca>:
> 
> On Fri, Dec 11, 2020 at 10:53:00PM +0100, David Hildenbrand wrote:
> 
>>> When check_and_migrate_movable_pages() is called, the pages are
>>> already pinned. If some of those pages are in movable zone, and we
>>> fail to migrate or isolate them what should we do: proceed, and
>>> keep it as exception of when movable zone can actually have pinned
>>> pages or unpin all pages in the array, and return an error, or
>>> unpin only pages in movable zone, and return an error?
>> 
>> I guess revert what we did (unpin) and return an error. The
>> interesting question is what can make migration/isolation fail
>> 
>> a) out of memory: smells like a zone setup issue. Failures are acceptable I guess.
> 
> Out of memory is reasonable..
> 
>> b) short term pinnings: process dying - not relevant I guess. Other cases? (Fork?)
> 
> Concurrent with non-longterm GUP users are less reasonable, fork is
> not reasonable, etc..

Concurrent alloc_contig_range(), memory offlining, compaction .. where we migrate pages? Any experts on racing page migration in these scenarios?

(Also wondering what would happen if we are just about to swap)

> 
> Racing with another GUP in another thread is also not reasonable, so
> failing to isolate can't be a failure

Having VMs with multiple vfio containers is certainly realistic, and optimizing in user space to do vfio mappings concurrently doesn‘t sound too crazy to me. But I haven‘t checked if vfio common code already handles such concurrency.

> 
> Jasnon
>
Jason Gunthorpe Dec. 14, 2020, 1:36 p.m. UTC | #12
On Sat, Dec 12, 2020 at 08:29:11AM +0100, David Hildenbrand wrote:

> > Racing with another GUP in another thread is also not reasonable, so
> > failing to isolate can't be a failure
> 
> Having VMs with multiple vfio containers is certainly realistic, and
> optimizing in user space to do vfio mappings concurrently doesn‘t
> sound too crazy to me. But I haven‘t checked if vfio common code
> already handles such concurrency.

There is a lot more out there than vfio.. RDMA already does concurrent
pin_user_pages in real apps

Jason
Michal Hocko Dec. 14, 2020, 2:19 p.m. UTC | #13
On Fri 11-12-20 15:21:39, Pavel Tatashin wrote:
[...]
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index b593316bff3d..25c0c13ba4b1 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -386,9 +386,14 @@ enum zone_type {
>  	 * likely to succeed, and to locally limit unmovable allocations - e.g.,
>  	 * to increase the number of THP/huge pages. Notable special cases are:
>  	 *
> -	 * 1. Pinned pages: (long-term) pinning of movable pages might
> -	 *    essentially turn such pages unmovable. Memory offlining might
> -	 *    retry a long time.
> +	 * 1. Pinned pages: (long-term) pinning of movable pages is avoided
> +	 *    when pages are pinned and faulted, but it is still possible that
> +	 *    address space already has pages in ZONE_MOVABLE at the time when
> +	 *    pages are pinned (i.e. user has touches that memory before
> +	 *    pinning). In such case we try to migrate them to a different zone,
> +	 *    but if migration fails the pages can still end-up pinned in
> +	 *    ZONE_MOVABLE. In such case, memory offlining might retry a long
> +	 *    time and will only succeed once user application unpins pages.

I do agree with others in the thread. This is not really helping out the
current situation much. You should simply fail the pin rather than
pretend all is just fine.
David Hildenbrand Dec. 14, 2020, 2:21 p.m. UTC | #14
On 14.12.20 14:36, Jason Gunthorpe wrote:
> On Sat, Dec 12, 2020 at 08:29:11AM +0100, David Hildenbrand wrote:
> 
>>> Racing with another GUP in another thread is also not reasonable, so
>>> failing to isolate can't be a failure
>>
>> Having VMs with multiple vfio containers is certainly realistic, and
>> optimizing in user space to do vfio mappings concurrently doesn‘t
>> sound too crazy to me. But I haven‘t checked if vfio common code
>> already handles such concurrency.
> 
> There is a lot more out there than vfio.. RDMA already does concurrent
> pin_user_pages in real apps

I actually misread your comment. I think we both agree that temporary
isolation failures must not lead to a failure.
Michal Hocko Dec. 14, 2020, 2:30 p.m. UTC | #15
On Mon 14-12-20 15:21:21, David Hildenbrand wrote:
> On 14.12.20 14:36, Jason Gunthorpe wrote:
> > On Sat, Dec 12, 2020 at 08:29:11AM +0100, David Hildenbrand wrote:
> > 
> >>> Racing with another GUP in another thread is also not reasonable, so
> >>> failing to isolate can't be a failure
> >>
> >> Having VMs with multiple vfio containers is certainly realistic, and
> >> optimizing in user space to do vfio mappings concurrently doesn‘t
> >> sound too crazy to me. But I haven‘t checked if vfio common code
> >> already handles such concurrency.
> > 
> > There is a lot more out there than vfio.. RDMA already does concurrent
> > pin_user_pages in real apps
> 
> I actually misread your comment. I think we both agree that temporary
> isolation failures must not lead to a failure.

Yes, isolation failures are ephemeral. I believe that the migration
should start distinguishing between these and hard failures.
diff mbox series

Patch

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 4594838a0f7c..aae5ef0b3ba1 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -27,6 +27,7 @@  enum migrate_reason {
 	MR_MEMPOLICY_MBIND,
 	MR_NUMA_MISPLACED,
 	MR_CONTIG_RANGE,
+	MR_LONGTERM_PIN,
 	MR_TYPES
 };
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b593316bff3d..25c0c13ba4b1 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -386,9 +386,14 @@  enum zone_type {
 	 * likely to succeed, and to locally limit unmovable allocations - e.g.,
 	 * to increase the number of THP/huge pages. Notable special cases are:
 	 *
-	 * 1. Pinned pages: (long-term) pinning of movable pages might
-	 *    essentially turn such pages unmovable. Memory offlining might
-	 *    retry a long time.
+	 * 1. Pinned pages: (long-term) pinning of movable pages is avoided
+	 *    when pages are pinned and faulted, but it is still possible that
+	 *    address space already has pages in ZONE_MOVABLE at the time when
+	 *    pages are pinned (i.e. user has touches that memory before
+	 *    pinning). In such case we try to migrate them to a different zone,
+	 *    but if migration fails the pages can still end-up pinned in
+	 *    ZONE_MOVABLE. In such case, memory offlining might retry a long
+	 *    time and will only succeed once user application unpins pages.
 	 * 2. memblock allocations: kernelcore/movablecore setups might create
 	 *    situations where ZONE_MOVABLE contains unmovable allocations
 	 *    after boot. Memory offlining and allocations fail early.
diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
index 4d434398d64d..363b54ce104c 100644
--- a/include/trace/events/migrate.h
+++ b/include/trace/events/migrate.h
@@ -20,7 +20,8 @@ 
 	EM( MR_SYSCALL,		"syscall_or_cpuset")		\
 	EM( MR_MEMPOLICY_MBIND,	"mempolicy_mbind")		\
 	EM( MR_NUMA_MISPLACED,	"numa_misplaced")		\
-	EMe(MR_CONTIG_RANGE,	"contig_range")
+	EM( MR_CONTIG_RANGE,	"contig_range")			\
+	EMe(MR_LONGTERM_PIN,	"longterm_pin")
 
 /*
  * First define the enums in the above macros to be exported to userspace
diff --git a/mm/gup.c b/mm/gup.c
index 007060e66a48..d5e9c459952e 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -89,11 +89,12 @@  static __maybe_unused struct page *try_grab_compound_head(struct page *page,
 		int orig_refs = refs;
 
 		/*
-		 * Can't do FOLL_LONGTERM + FOLL_PIN with CMA in the gup fast
-		 * path, so fail and let the caller fall back to the slow path.
+		 * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
+		 * right zone, so fail and let the caller fall back to the slow
+		 * path.
 		 */
-		if (unlikely(flags & FOLL_LONGTERM) &&
-				is_migrate_cma_page(page))
+		if (unlikely((flags & FOLL_LONGTERM) &&
+			     !is_pinnable_page(page)))
 			return NULL;
 
 		/*
@@ -1549,19 +1550,18 @@  struct page *get_dump_page(unsigned long addr)
 }
 #endif /* CONFIG_ELF_CORE */
 
-#ifdef CONFIG_CMA
-static long check_and_migrate_cma_pages(struct mm_struct *mm,
-					unsigned long start,
-					unsigned long nr_pages,
-					struct page **pages,
-					struct vm_area_struct **vmas,
-					unsigned int gup_flags)
+static long check_and_migrate_movable_pages(struct mm_struct *mm,
+					    unsigned long start,
+					    unsigned long nr_pages,
+					    struct page **pages,
+					    struct vm_area_struct **vmas,
+					    unsigned int gup_flags)
 {
 	unsigned long i;
 	unsigned long step;
 	bool drain_allow = true;
 	bool migrate_allow = true;
-	LIST_HEAD(cma_page_list);
+	LIST_HEAD(movable_page_list);
 	long ret = nr_pages;
 	struct migration_target_control mtc = {
 		.nid = NUMA_NO_NODE,
@@ -1579,13 +1579,12 @@  static long check_and_migrate_cma_pages(struct mm_struct *mm,
 		 */
 		step = compound_nr(head) - (pages[i] - head);
 		/*
-		 * If we get a page from the CMA zone, since we are going to
-		 * be pinning these entries, we might as well move them out
-		 * of the CMA zone if possible.
+		 * If we get a movable page, since we are going to be pinning
+		 * these entries, try to move them out if possible.
 		 */
-		if (is_migrate_cma_page(head)) {
+		if (!is_pinnable_page(head)) {
 			if (PageHuge(head))
-				isolate_huge_page(head, &cma_page_list);
+				isolate_huge_page(head, &movable_page_list);
 			else {
 				if (!PageLRU(head) && drain_allow) {
 					lru_add_drain_all();
@@ -1593,7 +1592,7 @@  static long check_and_migrate_cma_pages(struct mm_struct *mm,
 				}
 
 				if (!isolate_lru_page(head)) {
-					list_add_tail(&head->lru, &cma_page_list);
+					list_add_tail(&head->lru, &movable_page_list);
 					mod_node_page_state(page_pgdat(head),
 							    NR_ISOLATED_ANON +
 							    page_is_file_lru(head),
@@ -1605,7 +1604,7 @@  static long check_and_migrate_cma_pages(struct mm_struct *mm,
 		i += step;
 	}
 
-	if (!list_empty(&cma_page_list)) {
+	if (!list_empty(&movable_page_list)) {
 		/*
 		 * drop the above get_user_pages reference.
 		 */
@@ -1615,7 +1614,7 @@  static long check_and_migrate_cma_pages(struct mm_struct *mm,
 			for (i = 0; i < nr_pages; i++)
 				put_page(pages[i]);
 
-		if (migrate_pages(&cma_page_list, alloc_migration_target, NULL,
+		if (migrate_pages(&movable_page_list, alloc_migration_target, NULL,
 			(unsigned long)&mtc, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
 			/*
 			 * some of the pages failed migration. Do get_user_pages
@@ -1623,17 +1622,16 @@  static long check_and_migrate_cma_pages(struct mm_struct *mm,
 			 */
 			migrate_allow = false;
 
-			if (!list_empty(&cma_page_list))
-				putback_movable_pages(&cma_page_list);
+			if (!list_empty(&movable_page_list))
+				putback_movable_pages(&movable_page_list);
 		}
 		/*
 		 * We did migrate all the pages, Try to get the page references
-		 * again migrating any new CMA pages which we failed to isolate
-		 * earlier.
+		 * again migrating any pages which we failed to isolate earlier.
 		 */
 		ret = __get_user_pages_locked(mm, start, nr_pages,
-						   pages, vmas, NULL,
-						   gup_flags);
+					      pages, vmas, NULL,
+					      gup_flags);
 
 		if ((ret > 0) && migrate_allow) {
 			nr_pages = ret;
@@ -1644,17 +1642,6 @@  static long check_and_migrate_cma_pages(struct mm_struct *mm,
 
 	return ret;
 }
-#else
-static long check_and_migrate_cma_pages(struct mm_struct *mm,
-					unsigned long start,
-					unsigned long nr_pages,
-					struct page **pages,
-					struct vm_area_struct **vmas,
-					unsigned int gup_flags)
-{
-	return nr_pages;
-}
-#endif /* CONFIG_CMA */
 
 /*
  * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
@@ -1678,8 +1665,9 @@  static long __gup_longterm_locked(struct mm_struct *mm,
 
 	if (gup_flags & FOLL_LONGTERM) {
 		if (rc > 0)
-			rc = check_and_migrate_cma_pages(mm, start, rc, pages,
-							 vmas, gup_flags);
+			rc = check_and_migrate_movable_pages(mm, start, rc,
+							     pages, vmas,
+							     gup_flags);
 		memalloc_pin_restore(flags);
 	}
 	return rc;