diff mbox series

[v3,1/4] x86/mem_sharing: reorder when pages are unlocked and released

Message ID 20190426172138.14669-1-tamas@tklengyel.com (mailing list archive)
State Superseded
Headers show
Series [v3,1/4] x86/mem_sharing: reorder when pages are unlocked and released | expand

Commit Message

Tamas K Lengyel April 26, 2019, 5:21 p.m. UTC
Calling _put_page_type while also holding the page_lock
for that page can cause a deadlock.

Signed-off-by: Tamas K Lengyel <tamas@tklengyel.com>
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Cc: George Dunlap <george.dunlap@eu.citrix.com>
Cc: Wei Liu <wei.liu2@citrix.com>
Cc: Roger Pau Monne <roger.pau@citrix.com>
---
v3: simplified patch by keeping the additional references already in-place
---
 xen/arch/x86/mm/mem_sharing.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

Comments

George Dunlap April 29, 2019, 2:32 p.m. UTC | #1
On 4/26/19 6:21 PM, Tamas K Lengyel wrote:
> Calling _put_page_type while also holding the page_lock
> for that page can cause a deadlock.
> 
> Signed-off-by: Tamas K Lengyel <tamas@tklengyel.com>
> Cc: Jan Beulich <jbeulich@suse.com>
> Cc: Andrew Cooper <andrew.cooper3@citrix.com>
> Cc: George Dunlap <george.dunlap@eu.citrix.com>
> Cc: Wei Liu <wei.liu2@citrix.com>
> Cc: Roger Pau Monne <roger.pau@citrix.com>
> ---
> v3: simplified patch by keeping the additional references already in-place
> ---
>  xen/arch/x86/mm/mem_sharing.c | 19 ++++++++++---------
>  1 file changed, 10 insertions(+), 9 deletions(-)
> 
> diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
> index dfc279d371..e2f74ac770 100644
> --- a/xen/arch/x86/mm/mem_sharing.c
> +++ b/xen/arch/x86/mm/mem_sharing.c
> @@ -648,10 +648,6 @@ static int page_make_private(struct domain *d, struct page_info *page)
>          return -EBUSY;
>      }
>  
> -    /* We can only change the type if count is one */
> -    /* Because we are locking pages individually, we need to drop
> -     * the lock here, while the page is typed. We cannot risk the 
> -     * race of page_unlock and then put_page_type. */
>      expected_type = (PGT_shared_page | PGT_validated | PGT_locked | 2);
>      if ( page->u.inuse.type_info != expected_type )
>      {
> @@ -660,12 +656,12 @@ static int page_make_private(struct domain *d, struct page_info *page)
>          return -EEXIST;
>      }
>  
> -    /* Drop the final typecount */
> -    put_page_and_type(page);
> -
>      /* Now that we've dropped the type, we can unlock */
>      mem_sharing_page_unlock(page);
>  
> +    /* Drop the final typecount */
> +    put_page_and_type(page);
> +
>      /* Change the owner */
>      ASSERT(page_get_owner(page) == dom_cow);
>      page_set_owner(page, d);
> @@ -900,6 +896,7 @@ static int share_pages(struct domain *sd, gfn_t sgfn, shr_handle_t sh,
>      p2m_type_t smfn_type, cmfn_type;
>      struct two_gfns tg;
>      struct rmap_iterator ri;
> +    unsigned long put_count = 0;
>  
>      get_two_gfns(sd, sgfn, &smfn_type, NULL, &smfn,
>                   cd, cgfn, &cmfn_type, NULL, &cmfn, 0, &tg);
> @@ -984,7 +981,7 @@ static int share_pages(struct domain *sd, gfn_t sgfn, shr_handle_t sh,
>           * Don't change the type of rmap for the client page. */
>          rmap_del(gfn, cpage, 0);
>          rmap_add(gfn, spage);
> -        put_page_and_type(cpage);
> +        put_count++;
>          d = get_domain_by_id(gfn->domain);
>          BUG_ON(!d);
>          BUG_ON(set_shared_p2m_entry(d, gfn->gfn, smfn));
> @@ -999,6 +996,10 @@ static int share_pages(struct domain *sd, gfn_t sgfn, shr_handle_t sh,
>      mem_sharing_page_unlock(secondpg);
>      mem_sharing_page_unlock(firstpg);
>  
> +    BUG_ON(!put_count);
> +    while ( put_count-- )
> +        put_page_and_type(cpage);
> +
>      /* Free the client page */
>      if(test_and_clear_bit(_PGC_allocated, &cpage->count_info))
>          put_page(cpage);
> @@ -1167,8 +1168,8 @@ int __mem_sharing_unshare_page(struct domain *d,
>      {
>          if ( !last_gfn )
>              mem_sharing_gfn_destroy(page, d, gfn_info);
> -        put_page_and_type(page);
>          mem_sharing_page_unlock(page);
> +        put_page_and_type(page);
>          if ( last_gfn )
>          {
>              if ( !get_page(page, dom_cow) )

...Probably should have mentioned that this needs to be applied after
your other patch. :-)

 -George
George Dunlap April 29, 2019, 2:46 p.m. UTC | #2
On 4/29/19 3:32 PM, George Dunlap wrote:
> On 4/26/19 6:21 PM, Tamas K Lengyel wrote:
>> Calling _put_page_type while also holding the page_lock
>> for that page can cause a deadlock.
>>
>> Signed-off-by: Tamas K Lengyel <tamas@tklengyel.com>
>> Cc: Jan Beulich <jbeulich@suse.com>
>> Cc: Andrew Cooper <andrew.cooper3@citrix.com>
>> Cc: George Dunlap <george.dunlap@eu.citrix.com>
>> Cc: Wei Liu <wei.liu2@citrix.com>
>> Cc: Roger Pau Monne <roger.pau@citrix.com>
>> ---
>> v3: simplified patch by keeping the additional references already in-place
>> ---
>>  xen/arch/x86/mm/mem_sharing.c | 19 ++++++++++---------
>>  1 file changed, 10 insertions(+), 9 deletions(-)
>>
>> diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
>> index dfc279d371..e2f74ac770 100644
>> --- a/xen/arch/x86/mm/mem_sharing.c
>> +++ b/xen/arch/x86/mm/mem_sharing.c
>> @@ -648,10 +648,6 @@ static int page_make_private(struct domain *d, struct page_info *page)
>>          return -EBUSY;
>>      }
>>  
>> -    /* We can only change the type if count is one */
>> -    /* Because we are locking pages individually, we need to drop
>> -     * the lock here, while the page is typed. We cannot risk the 
>> -     * race of page_unlock and then put_page_type. */
>>      expected_type = (PGT_shared_page | PGT_validated | PGT_locked | 2);
>>      if ( page->u.inuse.type_info != expected_type )
>>      {
>> @@ -660,12 +656,12 @@ static int page_make_private(struct domain *d, struct page_info *page)
>>          return -EEXIST;
>>      }
>>  
>> -    /* Drop the final typecount */
>> -    put_page_and_type(page);
>> -
>>      /* Now that we've dropped the type, we can unlock */
>>      mem_sharing_page_unlock(page);
>>  
>> +    /* Drop the final typecount */
>> +    put_page_and_type(page);
>> +
>>      /* Change the owner */
>>      ASSERT(page_get_owner(page) == dom_cow);
>>      page_set_owner(page, d);
>> @@ -900,6 +896,7 @@ static int share_pages(struct domain *sd, gfn_t sgfn, shr_handle_t sh,
>>      p2m_type_t smfn_type, cmfn_type;
>>      struct two_gfns tg;
>>      struct rmap_iterator ri;
>> +    unsigned long put_count = 0;
>>  
>>      get_two_gfns(sd, sgfn, &smfn_type, NULL, &smfn,
>>                   cd, cgfn, &cmfn_type, NULL, &cmfn, 0, &tg);
>> @@ -984,7 +981,7 @@ static int share_pages(struct domain *sd, gfn_t sgfn, shr_handle_t sh,
>>           * Don't change the type of rmap for the client page. */
>>          rmap_del(gfn, cpage, 0);
>>          rmap_add(gfn, spage);
>> -        put_page_and_type(cpage);
>> +        put_count++;
>>          d = get_domain_by_id(gfn->domain);
>>          BUG_ON(!d);
>>          BUG_ON(set_shared_p2m_entry(d, gfn->gfn, smfn));
>> @@ -999,6 +996,10 @@ static int share_pages(struct domain *sd, gfn_t sgfn, shr_handle_t sh,
>>      mem_sharing_page_unlock(secondpg);
>>      mem_sharing_page_unlock(firstpg);
>>  
>> +    BUG_ON(!put_count);
>> +    while ( put_count-- )
>> +        put_page_and_type(cpage);
>> +
>>      /* Free the client page */
>>      if(test_and_clear_bit(_PGC_allocated, &cpage->count_info))
>>          put_page(cpage);
>> @@ -1167,8 +1168,8 @@ int __mem_sharing_unshare_page(struct domain *d,
>>      {
>>          if ( !last_gfn )
>>              mem_sharing_gfn_destroy(page, d, gfn_info);
>> -        put_page_and_type(page);
>>          mem_sharing_page_unlock(page);
>> +        put_page_and_type(page);
>>          if ( last_gfn )
>>          {
>>              if ( !get_page(page, dom_cow) )
> 
> ...Probably should have mentioned that this needs to be applied after
> your other patch. :-)

Hmm -- actually, the base appears to be a non-publicly-available tree
(Andy's private x86-next).

I think series should:
1. Always be posted against a publicly-available tree, and
2. If that tree is not xenbits/xen.git staging, the URL and branch
should be provided.

 -George
Andrew Cooper April 29, 2019, 2:49 p.m. UTC | #3
On 29/04/2019 15:46, George Dunlap wrote:
> On 4/29/19 3:32 PM, George Dunlap wrote:
>> On 4/26/19 6:21 PM, Tamas K Lengyel wrote:
>>> Calling _put_page_type while also holding the page_lock
>>> for that page can cause a deadlock.
>>>
>>> Signed-off-by: Tamas K Lengyel <tamas@tklengyel.com>
>>> Cc: Jan Beulich <jbeulich@suse.com>
>>> Cc: Andrew Cooper <andrew.cooper3@citrix.com>
>>> Cc: George Dunlap <george.dunlap@eu.citrix.com>
>>> Cc: Wei Liu <wei.liu2@citrix.com>
>>> Cc: Roger Pau Monne <roger.pau@citrix.com>
>>> ---
>>> v3: simplified patch by keeping the additional references already in-place
>>> ---
>>>  xen/arch/x86/mm/mem_sharing.c | 19 ++++++++++---------
>>>  1 file changed, 10 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
>>> index dfc279d371..e2f74ac770 100644
>>> --- a/xen/arch/x86/mm/mem_sharing.c
>>> +++ b/xen/arch/x86/mm/mem_sharing.c
>>> @@ -648,10 +648,6 @@ static int page_make_private(struct domain *d, struct page_info *page)
>>>          return -EBUSY;
>>>      }
>>>  
>>> -    /* We can only change the type if count is one */
>>> -    /* Because we are locking pages individually, we need to drop
>>> -     * the lock here, while the page is typed. We cannot risk the 
>>> -     * race of page_unlock and then put_page_type. */
>>>      expected_type = (PGT_shared_page | PGT_validated | PGT_locked | 2);
>>>      if ( page->u.inuse.type_info != expected_type )
>>>      {
>>> @@ -660,12 +656,12 @@ static int page_make_private(struct domain *d, struct page_info *page)
>>>          return -EEXIST;
>>>      }
>>>  
>>> -    /* Drop the final typecount */
>>> -    put_page_and_type(page);
>>> -
>>>      /* Now that we've dropped the type, we can unlock */
>>>      mem_sharing_page_unlock(page);
>>>  
>>> +    /* Drop the final typecount */
>>> +    put_page_and_type(page);
>>> +
>>>      /* Change the owner */
>>>      ASSERT(page_get_owner(page) == dom_cow);
>>>      page_set_owner(page, d);
>>> @@ -900,6 +896,7 @@ static int share_pages(struct domain *sd, gfn_t sgfn, shr_handle_t sh,
>>>      p2m_type_t smfn_type, cmfn_type;
>>>      struct two_gfns tg;
>>>      struct rmap_iterator ri;
>>> +    unsigned long put_count = 0;
>>>  
>>>      get_two_gfns(sd, sgfn, &smfn_type, NULL, &smfn,
>>>                   cd, cgfn, &cmfn_type, NULL, &cmfn, 0, &tg);
>>> @@ -984,7 +981,7 @@ static int share_pages(struct domain *sd, gfn_t sgfn, shr_handle_t sh,
>>>           * Don't change the type of rmap for the client page. */
>>>          rmap_del(gfn, cpage, 0);
>>>          rmap_add(gfn, spage);
>>> -        put_page_and_type(cpage);
>>> +        put_count++;
>>>          d = get_domain_by_id(gfn->domain);
>>>          BUG_ON(!d);
>>>          BUG_ON(set_shared_p2m_entry(d, gfn->gfn, smfn));
>>> @@ -999,6 +996,10 @@ static int share_pages(struct domain *sd, gfn_t sgfn, shr_handle_t sh,
>>>      mem_sharing_page_unlock(secondpg);
>>>      mem_sharing_page_unlock(firstpg);
>>>  
>>> +    BUG_ON(!put_count);
>>> +    while ( put_count-- )
>>> +        put_page_and_type(cpage);
>>> +
>>>      /* Free the client page */
>>>      if(test_and_clear_bit(_PGC_allocated, &cpage->count_info))
>>>          put_page(cpage);
>>> @@ -1167,8 +1168,8 @@ int __mem_sharing_unshare_page(struct domain *d,
>>>      {
>>>          if ( !last_gfn )
>>>              mem_sharing_gfn_destroy(page, d, gfn_info);
>>> -        put_page_and_type(page);
>>>          mem_sharing_page_unlock(page);
>>> +        put_page_and_type(page);
>>>          if ( last_gfn )
>>>          {
>>>              if ( !get_page(page, dom_cow) )
>> ...Probably should have mentioned that this needs to be applied after
>> your other patch. :-)
> Hmm -- actually, the base appears to be a non-publicly-available tree
> (Andy's private x86-next).

It's perfectly public

http://xenbits.xen.org/gitweb/?p=people/andrewcoop/xen.git;a=shortlog;h=refs/heads/x86-next

and has been in effect during commit freezes for the past several years
worth of releases.

It is fine to post a tree based on something other than xen.git/staging
so long as the cover letter identifies where the series is based.  A lot
of development already occurs in this way on xen-devel.

~Andrew
George Dunlap April 29, 2019, 2:54 p.m. UTC | #4
On 4/29/19 3:49 PM, Andrew Cooper wrote:
> On 29/04/2019 15:46, George Dunlap wrote:
>> On 4/29/19 3:32 PM, George Dunlap wrote:
>>> On 4/26/19 6:21 PM, Tamas K Lengyel wrote:
>>>> Calling _put_page_type while also holding the page_lock
>>>> for that page can cause a deadlock.
>>>>
>>>> Signed-off-by: Tamas K Lengyel <tamas@tklengyel.com>
>>>> Cc: Jan Beulich <jbeulich@suse.com>
>>>> Cc: Andrew Cooper <andrew.cooper3@citrix.com>
>>>> Cc: George Dunlap <george.dunlap@eu.citrix.com>
>>>> Cc: Wei Liu <wei.liu2@citrix.com>
>>>> Cc: Roger Pau Monne <roger.pau@citrix.com>
>>>> ---
>>>> v3: simplified patch by keeping the additional references already in-place
>>>> ---
>>>>  xen/arch/x86/mm/mem_sharing.c | 19 ++++++++++---------
>>>>  1 file changed, 10 insertions(+), 9 deletions(-)
>>>>
>>>> diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
>>>> index dfc279d371..e2f74ac770 100644
>>>> --- a/xen/arch/x86/mm/mem_sharing.c
>>>> +++ b/xen/arch/x86/mm/mem_sharing.c
>>>> @@ -648,10 +648,6 @@ static int page_make_private(struct domain *d, struct page_info *page)
>>>>          return -EBUSY;
>>>>      }
>>>>  
>>>> -    /* We can only change the type if count is one */
>>>> -    /* Because we are locking pages individually, we need to drop
>>>> -     * the lock here, while the page is typed. We cannot risk the 
>>>> -     * race of page_unlock and then put_page_type. */
>>>>      expected_type = (PGT_shared_page | PGT_validated | PGT_locked | 2);
>>>>      if ( page->u.inuse.type_info != expected_type )
>>>>      {
>>>> @@ -660,12 +656,12 @@ static int page_make_private(struct domain *d, struct page_info *page)
>>>>          return -EEXIST;
>>>>      }
>>>>  
>>>> -    /* Drop the final typecount */
>>>> -    put_page_and_type(page);
>>>> -
>>>>      /* Now that we've dropped the type, we can unlock */
>>>>      mem_sharing_page_unlock(page);
>>>>  
>>>> +    /* Drop the final typecount */
>>>> +    put_page_and_type(page);
>>>> +
>>>>      /* Change the owner */
>>>>      ASSERT(page_get_owner(page) == dom_cow);
>>>>      page_set_owner(page, d);
>>>> @@ -900,6 +896,7 @@ static int share_pages(struct domain *sd, gfn_t sgfn, shr_handle_t sh,
>>>>      p2m_type_t smfn_type, cmfn_type;
>>>>      struct two_gfns tg;
>>>>      struct rmap_iterator ri;
>>>> +    unsigned long put_count = 0;
>>>>  
>>>>      get_two_gfns(sd, sgfn, &smfn_type, NULL, &smfn,
>>>>                   cd, cgfn, &cmfn_type, NULL, &cmfn, 0, &tg);
>>>> @@ -984,7 +981,7 @@ static int share_pages(struct domain *sd, gfn_t sgfn, shr_handle_t sh,
>>>>           * Don't change the type of rmap for the client page. */
>>>>          rmap_del(gfn, cpage, 0);
>>>>          rmap_add(gfn, spage);
>>>> -        put_page_and_type(cpage);
>>>> +        put_count++;
>>>>          d = get_domain_by_id(gfn->domain);
>>>>          BUG_ON(!d);
>>>>          BUG_ON(set_shared_p2m_entry(d, gfn->gfn, smfn));
>>>> @@ -999,6 +996,10 @@ static int share_pages(struct domain *sd, gfn_t sgfn, shr_handle_t sh,
>>>>      mem_sharing_page_unlock(secondpg);
>>>>      mem_sharing_page_unlock(firstpg);
>>>>  
>>>> +    BUG_ON(!put_count);
>>>> +    while ( put_count-- )
>>>> +        put_page_and_type(cpage);
>>>> +
>>>>      /* Free the client page */
>>>>      if(test_and_clear_bit(_PGC_allocated, &cpage->count_info))
>>>>          put_page(cpage);
>>>> @@ -1167,8 +1168,8 @@ int __mem_sharing_unshare_page(struct domain *d,
>>>>      {
>>>>          if ( !last_gfn )
>>>>              mem_sharing_gfn_destroy(page, d, gfn_info);
>>>> -        put_page_and_type(page);
>>>>          mem_sharing_page_unlock(page);
>>>> +        put_page_and_type(page);
>>>>          if ( last_gfn )
>>>>          {
>>>>              if ( !get_page(page, dom_cow) )
>>> ...Probably should have mentioned that this needs to be applied after
>>> your other patch. :-)
>> Hmm -- actually, the base appears to be a non-publicly-available tree
>> (Andy's private x86-next).
> 
> Its perfectly public
> 
> http://xenbits.xen.org/gitweb/?p=people/andrewcoop/xen.git;a=shortlog;h=refs/heads/x86-next

OK, didn't realize this.

> 
> and has been in effect during commit freezes for the past several years
> worth of releases.
> 
> It is fine to post a tree based on something other than xen.git/staging
> so long as the cover letter identifies where the series is based.  A lot
> of development already occurs in this way on xen-devel.

I agree it's fine when a URL is provided.

 -George
Jan Beulich April 29, 2019, 3:01 p.m. UTC | #5
>>> On 26.04.19 at 19:21, <tamas@tklengyel.com> wrote:
> @@ -999,6 +996,10 @@ static int share_pages(struct domain *sd, gfn_t sgfn, shr_handle_t sh,
>      mem_sharing_page_unlock(secondpg);
>      mem_sharing_page_unlock(firstpg);
>  
> +    BUG_ON(!put_count);
> +    while ( put_count-- )
> +        put_page_and_type(cpage);
> +
>      /* Free the client page */
>      if(test_and_clear_bit(_PGC_allocated, &cpage->count_info))
>          put_page(cpage);

If this was to happen before all the put_page_and_type() calls,
wouldn't it render unnecessary the extra get_page()/put_page()
around this code region?

Jan
Tamas K Lengyel April 29, 2019, 3:37 p.m. UTC | #6
On Mon, Apr 29, 2019 at 8:54 AM George Dunlap <george.dunlap@citrix.com> wrote:
>
> On 4/29/19 3:49 PM, Andrew Cooper wrote:
> > On 29/04/2019 15:46, George Dunlap wrote:
> >> On 4/29/19 3:32 PM, George Dunlap wrote:
> >>> On 4/26/19 6:21 PM, Tamas K Lengyel wrote:
> >>>> Calling _put_page_type while also holding the page_lock
> >>>> for that page can cause a deadlock.
> >>>>
> >>>> Signed-off-by: Tamas K Lengyel <tamas@tklengyel.com>
> >>>> Cc: Jan Beulich <jbeulich@suse.com>
> >>>> Cc: Andrew Cooper <andrew.cooper3@citrix.com>
> >>>> Cc: George Dunlap <george.dunlap@eu.citrix.com>
> >>>> Cc: Wei Liu <wei.liu2@citrix.com>
> >>>> Cc: Roger Pau Monne <roger.pau@citrix.com>
> >>>> ---
> >>>> v3: simplified patch by keeping the additional references already in-place
> >>>> ---
> >>>>  xen/arch/x86/mm/mem_sharing.c | 19 ++++++++++---------
> >>>>  1 file changed, 10 insertions(+), 9 deletions(-)
> >>>>
> >>>> diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
> >>>> index dfc279d371..e2f74ac770 100644
> >>>> --- a/xen/arch/x86/mm/mem_sharing.c
> >>>> +++ b/xen/arch/x86/mm/mem_sharing.c
> >>>> @@ -648,10 +648,6 @@ static int page_make_private(struct domain *d, struct page_info *page)
> >>>>          return -EBUSY;
> >>>>      }
> >>>>
> >>>> -    /* We can only change the type if count is one */
> >>>> -    /* Because we are locking pages individually, we need to drop
> >>>> -     * the lock here, while the page is typed. We cannot risk the
> >>>> -     * race of page_unlock and then put_page_type. */
> >>>>      expected_type = (PGT_shared_page | PGT_validated | PGT_locked | 2);
> >>>>      if ( page->u.inuse.type_info != expected_type )
> >>>>      {
> >>>> @@ -660,12 +656,12 @@ static int page_make_private(struct domain *d, struct page_info *page)
> >>>>          return -EEXIST;
> >>>>      }
> >>>>
> >>>> -    /* Drop the final typecount */
> >>>> -    put_page_and_type(page);
> >>>> -
> >>>>      /* Now that we've dropped the type, we can unlock */
> >>>>      mem_sharing_page_unlock(page);
> >>>>
> >>>> +    /* Drop the final typecount */
> >>>> +    put_page_and_type(page);
> >>>> +
> >>>>      /* Change the owner */
> >>>>      ASSERT(page_get_owner(page) == dom_cow);
> >>>>      page_set_owner(page, d);
> >>>> @@ -900,6 +896,7 @@ static int share_pages(struct domain *sd, gfn_t sgfn, shr_handle_t sh,
> >>>>      p2m_type_t smfn_type, cmfn_type;
> >>>>      struct two_gfns tg;
> >>>>      struct rmap_iterator ri;
> >>>> +    unsigned long put_count = 0;
> >>>>
> >>>>      get_two_gfns(sd, sgfn, &smfn_type, NULL, &smfn,
> >>>>                   cd, cgfn, &cmfn_type, NULL, &cmfn, 0, &tg);
> >>>> @@ -984,7 +981,7 @@ static int share_pages(struct domain *sd, gfn_t sgfn, shr_handle_t sh,
> >>>>           * Don't change the type of rmap for the client page. */
> >>>>          rmap_del(gfn, cpage, 0);
> >>>>          rmap_add(gfn, spage);
> >>>> -        put_page_and_type(cpage);
> >>>> +        put_count++;
> >>>>          d = get_domain_by_id(gfn->domain);
> >>>>          BUG_ON(!d);
> >>>>          BUG_ON(set_shared_p2m_entry(d, gfn->gfn, smfn));
> >>>> @@ -999,6 +996,10 @@ static int share_pages(struct domain *sd, gfn_t sgfn, shr_handle_t sh,
> >>>>      mem_sharing_page_unlock(secondpg);
> >>>>      mem_sharing_page_unlock(firstpg);
> >>>>
> >>>> +    BUG_ON(!put_count);
> >>>> +    while ( put_count-- )
> >>>> +        put_page_and_type(cpage);
> >>>> +
> >>>>      /* Free the client page */
> >>>>      if(test_and_clear_bit(_PGC_allocated, &cpage->count_info))
> >>>>          put_page(cpage);
> >>>> @@ -1167,8 +1168,8 @@ int __mem_sharing_unshare_page(struct domain *d,
> >>>>      {
> >>>>          if ( !last_gfn )
> >>>>              mem_sharing_gfn_destroy(page, d, gfn_info);
> >>>> -        put_page_and_type(page);
> >>>>          mem_sharing_page_unlock(page);
> >>>> +        put_page_and_type(page);
> >>>>          if ( last_gfn )
> >>>>          {
> >>>>              if ( !get_page(page, dom_cow) )
> >>> ...Probably should have mentioned that this needs to be applied after
> >>> your other patch. :-)
> >> Hmm -- actually, the base appears to be a non-publicly-available tree
> >> (Andy's private x86-next).
> >
> > Its perfectly public
> >
> > http://xenbits.xen.org/gitweb/?p=people/andrewcoop/xen.git;a=shortlog;h=refs/heads/x86-next
>
> OK, didn't realize this.
>
> >
> > and has been in effect during commit freezes for the past several years
> > worth of releases.
> >
> > It is fine to post a tree based on something other than xen.git/staging
> > so long as the cover letter identifies where the series is based.  A lot
> > of development already occurs in this way on xen-devel.
>
> I agree it's fine when a URL is provided.
>

Yeap, sorry, should have pointed out I rebased on x86-next so I don't
keep sending patches that are already pulled in.

Tamas
Tamas K Lengyel April 29, 2019, 3:41 p.m. UTC | #7
On Mon, Apr 29, 2019 at 9:01 AM Jan Beulich <JBeulich@suse.com> wrote:
>
> >>> On 26.04.19 at 19:21, <tamas@tklengyel.com> wrote:
> > @@ -999,6 +996,10 @@ static int share_pages(struct domain *sd, gfn_t sgfn, shr_handle_t sh,
> >      mem_sharing_page_unlock(secondpg);
> >      mem_sharing_page_unlock(firstpg);
> >
> > +    BUG_ON(!put_count);
> > +    while ( put_count-- )
> > +        put_page_and_type(cpage);
> > +
> >      /* Free the client page */
> >      if(test_and_clear_bit(_PGC_allocated, &cpage->count_info))
> >          put_page(cpage);
>
> If this was to happen before all the put_page_and_type() calls,
> wouldn't it render unnecessary the extra get_page()/put_page()
> around this code region?

It would - I already sent a version of the patch in that form but
there was unease expressed by George going that route because of the
complexity of the code and in his view it's the safe bet to keep the
extra references. I think the overhead of grabbing the extra
references is negligible enough that I'm not going to argue over it.

Tamas
George Dunlap April 29, 2019, 3:44 p.m. UTC | #8
On 4/26/19 6:21 PM, Tamas K Lengyel wrote:
> Calling _put_page_type while also holding the page_lock
> for that page can cause a deadlock.

I can't immediately see what the mem_sharing_page_[un]lock() functions
are meant to be protecting against.  Is there a comment anywhere
describing what they're used for or how they work?

Because...

> Signed-off-by: Tamas K Lengyel <tamas@tklengyel.com>
> Cc: Jan Beulich <jbeulich@suse.com>
> Cc: Andrew Cooper <andrew.cooper3@citrix.com>
> Cc: George Dunlap <george.dunlap@eu.citrix.com>
> Cc: Wei Liu <wei.liu2@citrix.com>
> Cc: Roger Pau Monne <roger.pau@citrix.com>
> ---
> v3: simplified patch by keeping the additional references already in-place
> ---
>  xen/arch/x86/mm/mem_sharing.c | 19 ++++++++++---------
>  1 file changed, 10 insertions(+), 9 deletions(-)
> 
> diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
> index dfc279d371..e2f74ac770 100644
> --- a/xen/arch/x86/mm/mem_sharing.c
> +++ b/xen/arch/x86/mm/mem_sharing.c
> @@ -648,10 +648,6 @@ static int page_make_private(struct domain *d, struct page_info *page)
>          return -EBUSY;
>      }
>  
> -    /* We can only change the type if count is one */
> -    /* Because we are locking pages individually, we need to drop
> -     * the lock here, while the page is typed. We cannot risk the 
> -     * race of page_unlock and then put_page_type. */

...the comment you're removing explicitly says that what you're doing is
"risk"-y; but you don't explain at all why the comment is wrong.

Ultimately you're the maintainer, and this is non-security-supported, so
if you insist it's safe, I won't oppose it; but it seems like having
some clarity about what is or is not risky and why would be helpful.

 -George
George Dunlap April 29, 2019, 3:52 p.m. UTC | #9
On 4/29/19 4:41 PM, Tamas K Lengyel wrote:
> On Mon, Apr 29, 2019 at 9:01 AM Jan Beulich <JBeulich@suse.com> wrote:
>>
>>>>> On 26.04.19 at 19:21, <tamas@tklengyel.com> wrote:
>>> @@ -999,6 +996,10 @@ static int share_pages(struct domain *sd, gfn_t sgfn, shr_handle_t sh,
>>>      mem_sharing_page_unlock(secondpg);
>>>      mem_sharing_page_unlock(firstpg);
>>>
>>> +    BUG_ON(!put_count);
>>> +    while ( put_count-- )
>>> +        put_page_and_type(cpage);
>>> +
>>>      /* Free the client page */
>>>      if(test_and_clear_bit(_PGC_allocated, &cpage->count_info))
>>>          put_page(cpage);
>>
>> If this was to happen before all the put_page_and_type() calls,
>> wouldn't it render unnecessary the extra get_page()/put_page()
>> around this code region?
> 
> It would - I already sent a version of the patch in that form but
> there was unease expressed by George going that route because of the
> complexity of the code and in his view it's the safe bet to keep the
> extra references. I think the overhead of grabbing the extra
> references is negligible enough that I'm not going to argue over it.

Er, that's not what I said. :-)

I gave four possible options, *one* of which was to change the ASSERT()
to a BUG_ON(), and *one* of which was to keep the (probably redundant)
get_page() and put_page() calls.  You appear to have done both. :-)

I haven't re-grokked the code here, but assuming I was correct 2 weeks
ago, if you have the BUG_ON() there, you can get rid of the extra
references.

 -George
Tamas K Lengyel April 29, 2019, 3:58 p.m. UTC | #10
On Mon, Apr 29, 2019 at 9:44 AM George Dunlap <george.dunlap@citrix.com> wrote:
>
> On 4/26/19 6:21 PM, Tamas K Lengyel wrote:
> > Calling _put_page_type while also holding the page_lock
> > for that page can cause a deadlock.
>
> I can't immediately see what the mem_sharing_page_[un]lock() functions
> are meant to be protecting against.  Is there a comment anywhere
> describing what they're used for or how they work?

There are none. The whole subsystem and its use of page_lock/unlock is
undocumented. The lock is used to protect the page_sharing_info
structure from simultaneous updates (when the page is shared/unshared)
and also to make freeing that structure safe.

>
> Because...
>
> > Signed-off-by: Tamas K Lengyel <tamas@tklengyel.com>
> > Cc: Jan Beulich <jbeulich@suse.com>
> > Cc: Andrew Cooper <andrew.cooper3@citrix.com>
> > Cc: George Dunlap <george.dunlap@eu.citrix.com>
> > Cc: Wei Liu <wei.liu2@citrix.com>
> > Cc: Roger Pau Monne <roger.pau@citrix.com>
> > ---
> > v3: simplified patch by keeping the additional references already in-place
> > ---
> >  xen/arch/x86/mm/mem_sharing.c | 19 ++++++++++---------
> >  1 file changed, 10 insertions(+), 9 deletions(-)
> >
> > diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
> > index dfc279d371..e2f74ac770 100644
> > --- a/xen/arch/x86/mm/mem_sharing.c
> > +++ b/xen/arch/x86/mm/mem_sharing.c
> > @@ -648,10 +648,6 @@ static int page_make_private(struct domain *d, struct page_info *page)
> >          return -EBUSY;
> >      }
> >
> > -    /* We can only change the type if count is one */
> > -    /* Because we are locking pages individually, we need to drop
> > -     * the lock here, while the page is typed. We cannot risk the
> > -     * race of page_unlock and then put_page_type. */
>
> ...the comment you're removing explicitly says that what you're doing is
> "risk"-y; but you don't explain at all why the comment is wrong.

AFAICT that comment was correct when it was added but since then it is
out-of-date. There is now an explicit switch case added to
_put_page_type that seem to handle this situation (case PGT_locked |
1) that was not there when this comment was introduced. That's what I
was able to decipher using git archeology. IMHO the whole
page_lock/put_type and reference counting is undecipherable (at least
I'm no closer to understanding it after staring at this for 2 weeks
now) and I wish there was a way to just getting rid of the whole thing
but unfortunately there doesn't seem to be one due to not being able
to easily grow page_info.

>
> Ultimately you're the maintainer, and this is non-security-supported, so
> if you insist it's safe, I won't oppose it; but it seems like having
> some clarity about what is or is not risky and why would be helpful.

I'm just trying to revive the subsystem as the way things are right
now it's broken. Doing this reorder makes the deadlock go away.

Tamas
Tamas K Lengyel April 29, 2019, 4:05 p.m. UTC | #11
On Mon, Apr 29, 2019 at 9:52 AM George Dunlap <george.dunlap@citrix.com> wrote:
>
> On 4/29/19 4:41 PM, Tamas K Lengyel wrote:
> > On Mon, Apr 29, 2019 at 9:01 AM Jan Beulich <JBeulich@suse.com> wrote:
> >>
> >>>>> On 26.04.19 at 19:21, <tamas@tklengyel.com> wrote:
> >>> @@ -999,6 +996,10 @@ static int share_pages(struct domain *sd, gfn_t sgfn, shr_handle_t sh,
> >>>      mem_sharing_page_unlock(secondpg);
> >>>      mem_sharing_page_unlock(firstpg);
> >>>
> >>> +    BUG_ON(!put_count);
> >>> +    while ( put_count-- )
> >>> +        put_page_and_type(cpage);
> >>> +
> >>>      /* Free the client page */
> >>>      if(test_and_clear_bit(_PGC_allocated, &cpage->count_info))
> >>>          put_page(cpage);
> >>
> >> If this was to happen before all the put_page_and_type() calls,
> >> wouldn't it render unnecessary the extra get_page()/put_page()
> >> around this code region?
> >
> > It would - I already sent a version of the patch in that form but
> > there was unease expressed by George going that route because of the
> > complexity of the code and in his view it's the safe bet to keep the
> > extra references. I think the overhead of grabbing the extra
> > references is negligible enough that I'm not going to argue over it.
>
> Er, that's not what I said. :-)
>
> I gave four possible options, *one* of which was to change the ASSERT()
> to a BUG_ON(), and *one* of which was to keep the (probably redundant)
> get_page() and put_page() calls.  You appear to have done both. :-)

Ah indeed - I've combined those recommendations :)

>
> I haven't re-grokked the code here, but assuming I was correct 2 weeks
> ago, if you have the BUG_ON() there, you can get rid of the extra
> references.
>

Sure, but again, the overhead of having them in-place is negligible so
might as well just keep it.

Tamas
Jan Beulich April 29, 2019, 4:14 p.m. UTC | #12
>>> On 29.04.19 at 18:05, <tamas@tklengyel.com> wrote:
> On Mon, Apr 29, 2019 at 9:52 AM George Dunlap <george.dunlap@citrix.com> wrote:
>> I haven't re-grokked the code here, but assuming I was correct 2 weeks
>> ago, if you have the BUG_ON() there, you can get rid of the extra
>> references.
> 
> Sure, but again, the overhead of having them in-place is negligible so
> might as well just keep it.

The overhead is only one aspect here. People looking at the code
may also be mislead into trying to figure out why the heck this
extra reference gets obtained. Plus sub-optimal code tends to get
cloned ...

Jan
George Dunlap April 29, 2019, 4:22 p.m. UTC | #13
On 4/29/19 5:14 PM, Jan Beulich wrote:
>>>> On 29.04.19 at 18:05, <tamas@tklengyel.com> wrote:
>> On Mon, Apr 29, 2019 at 9:52 AM George Dunlap <george.dunlap@citrix.com> wrote:
>>> I haven't re-grokked the code here, but assuming I was correct 2 weeks
>>> ago, if you have the BUG_ON() there, you can get rid of the extra
>>> references.
>>
>> Sure, but again, the overhead of having them in-place is negligible so
>> might as well just keep it.
> 
> The overhead is only one aspect here. People looking at the code
> may also be mislead into trying to figure out why the heck this
> extra reference gets obtained. Plus sub-optimal code tends to get
> cloned ...

I think it's better not to have duplicate measures as well, but I don't
want to argue too much over it.

At very least, there should be a comment saying that the get_page()
shouldn't be necessary.

 -George
Tamas K Lengyel April 29, 2019, 4:26 p.m. UTC | #14
On Mon, Apr 29, 2019 at 10:14 AM Jan Beulich <JBeulich@suse.com> wrote:
>
> >>> On 29.04.19 at 18:05, <tamas@tklengyel.com> wrote:
> > On Mon, Apr 29, 2019 at 9:52 AM George Dunlap <george.dunlap@citrix.com> wrote:
> >> I haven't re-grokked the code here, but assuming I was correct 2 weeks
> >> ago, if you have the BUG_ON() there, you can get rid of the extra
> >> references.
> >
> > Sure, but again, the overhead of having them in-place is negligible so
> > might as well just keep it.
>
> The overhead is only one aspect here. People looking at the code
> may also be mislead into trying to figure out why the heck this
> extra reference gets obtained. Plus sub-optimal code tends to get
> cloned ...

Yea, I'm with you.. Alright, in that case Andrew pulled in that
previous patch into x86-next for no good reason as that whole thing is
going to get dropped now. Andrew - if you can just drop that patch
from x86-next I'll rebase on staging and resend without that patch.

Thanks,
Tamas
George Dunlap April 29, 2019, 4:29 p.m. UTC | #15
On 4/29/19 5:26 PM, Tamas K Lengyel wrote:
> On Mon, Apr 29, 2019 at 10:14 AM Jan Beulich <JBeulich@suse.com> wrote:
>>
>>>>> On 29.04.19 at 18:05, <tamas@tklengyel.com> wrote:
>>> On Mon, Apr 29, 2019 at 9:52 AM George Dunlap <george.dunlap@citrix.com> wrote:
>>>> I haven't re-grokked the code here, but assuming I was correct 2 weeks
>>>> ago, if you have the BUG_ON() there, you can get rid of the extra
>>>> references.
>>>
>>> Sure, but again, the overhead of having them in-place is negligible so
>>> might as well just keep it.
>>
>> The overhead is only one aspect here. People looking at the code
>> may also be misled into trying to figure out why the heck this
>> extra reference gets obtained. Plus sub-optimal code tends to get
>> cloned ...
> 
> Yea, I'm with you.. Alright, in that case Andrew pulled in that
> previous patch into x86-next for no good reason as that whole thing is
> going to get dropped now. Andrew - if you can just drop that patch
> from x86-next I'll rebase on staging and resend without that patch.

I assume he wants that branch to be fast-forwarding; if so, he can't
really pull it out.

 -George
Tamas K Lengyel April 29, 2019, 4:36 p.m. UTC | #16
On Mon, Apr 29, 2019 at 10:29 AM George Dunlap <george.dunlap@citrix.com> wrote:
>
> On 4/29/19 5:26 PM, Tamas K Lengyel wrote:
> > On Mon, Apr 29, 2019 at 10:14 AM Jan Beulich <JBeulich@suse.com> wrote:
> >>
> >>>>> On 29.04.19 at 18:05, <tamas@tklengyel.com> wrote:
> >>> On Mon, Apr 29, 2019 at 9:52 AM George Dunlap <george.dunlap@citrix.com> wrote:
> >>>> I haven't re-grokked the code here, but assuming I was correct 2 weeks
> >>>> ago, if you have the BUG_ON() there, you can get rid of the extra
> >>>> references.
> >>>
> >>> Sure, but again, the overhead of having them in-place is negligible so
> >>> might as well just keep it.
> >>
> >> The overhead is only one aspect here. People looking at the code
> >> may also be misled into trying to figure out why the heck this
> >> extra reference gets obtained. Plus sub-optimal code tends to get
> >> cloned ...
> >
> > Yea, I'm with you.. Alright, in that case Andrew pulled in that
> > previous patch into x86-next for no good reason as that whole thing is
> > going to get dropped now. Andrew - if you can just drop that patch
> > from x86-next I'll rebase on staging and resend without that patch.
>
> I assume he wants that branch to be fast-forwarding; if so, he can't
> really pull it out.
>

Yea, figured.. Oh well, not really a big deal.

Tamas
diff mbox series

Patch

diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
index dfc279d371..e2f74ac770 100644
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -648,10 +648,6 @@  static int page_make_private(struct domain *d, struct page_info *page)
         return -EBUSY;
     }
 
-    /* We can only change the type if count is one */
-    /* Because we are locking pages individually, we need to drop
-     * the lock here, while the page is typed. We cannot risk the 
-     * race of page_unlock and then put_page_type. */
     expected_type = (PGT_shared_page | PGT_validated | PGT_locked | 2);
     if ( page->u.inuse.type_info != expected_type )
     {
@@ -660,12 +656,12 @@  static int page_make_private(struct domain *d, struct page_info *page)
         return -EEXIST;
     }
 
-    /* Drop the final typecount */
-    put_page_and_type(page);
-
     /* Now that we've dropped the type, we can unlock */
     mem_sharing_page_unlock(page);
 
+    /* Drop the final typecount */
+    put_page_and_type(page);
+
     /* Change the owner */
     ASSERT(page_get_owner(page) == dom_cow);
     page_set_owner(page, d);
@@ -900,6 +896,7 @@  static int share_pages(struct domain *sd, gfn_t sgfn, shr_handle_t sh,
     p2m_type_t smfn_type, cmfn_type;
     struct two_gfns tg;
     struct rmap_iterator ri;
+    unsigned long put_count = 0;
 
     get_two_gfns(sd, sgfn, &smfn_type, NULL, &smfn,
                  cd, cgfn, &cmfn_type, NULL, &cmfn, 0, &tg);
@@ -984,7 +981,7 @@  static int share_pages(struct domain *sd, gfn_t sgfn, shr_handle_t sh,
          * Don't change the type of rmap for the client page. */
         rmap_del(gfn, cpage, 0);
         rmap_add(gfn, spage);
-        put_page_and_type(cpage);
+        put_count++;
         d = get_domain_by_id(gfn->domain);
         BUG_ON(!d);
         BUG_ON(set_shared_p2m_entry(d, gfn->gfn, smfn));
@@ -999,6 +996,10 @@  static int share_pages(struct domain *sd, gfn_t sgfn, shr_handle_t sh,
     mem_sharing_page_unlock(secondpg);
     mem_sharing_page_unlock(firstpg);
 
+    BUG_ON(!put_count);
+    while ( put_count-- )
+        put_page_and_type(cpage);
+
     /* Free the client page */
     if(test_and_clear_bit(_PGC_allocated, &cpage->count_info))
         put_page(cpage);
@@ -1167,8 +1168,8 @@  int __mem_sharing_unshare_page(struct domain *d,
     {
         if ( !last_gfn )
             mem_sharing_gfn_destroy(page, d, gfn_info);
-        put_page_and_type(page);
         mem_sharing_page_unlock(page);
+        put_page_and_type(page);
         if ( last_gfn )
         {
             if ( !get_page(page, dom_cow) )