
[v8,03/15] x86/mm: rewrite virt_to_xen_l*e

Message ID e7963f6d8cab8e4d5d4249b12a8175405d888bba.1595857947.git.hongyxia@amazon.com (mailing list archive)
State New, archived
Series switch to domheap for Xen page tables

Commit Message

Hongyan Xia July 27, 2020, 2:21 p.m. UTC
From: Wei Liu <wei.liu2@citrix.com>

Rewrite those functions to use the new APIs. Modify their callers to
unmap the returned pointers. Since alloc_xen_pagetable_new() is almost
never useful unless accompanied by page clearing and a mapping, introduce
a helper alloc_map_clear_xen_pt() for this sequence.

Note that the change to virt_to_xen_l1e() also requires vmap_to_mfn() to
unmap the page, which requires the domain_page.h header in vmap.

Signed-off-by: Wei Liu <wei.liu2@citrix.com>
Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>

---
Changed in v8:
- s/virtual address/linear address/.
- BUG_ON() on NULL return in vmap_to_mfn().

Changed in v7:
- remove a comment.
- use l1e_get_mfn() instead of converting things back and forth.
- add alloc_map_clear_xen_pt().
- unmap before the next mapping to reduce mapcache pressure.
- use normal unmap calls instead of the macro in error paths because
  unmap can handle NULL now.
---
 xen/arch/x86/domain_page.c | 11 ++++--
 xen/arch/x86/mm.c          | 96 +++++++++++++++++++++++++++++++++-------------
 xen/common/vmap.c          |  1 +
 xen/include/asm-x86/mm.h   |  1 +
 xen/include/asm-x86/page.h | 10 ++++-
 5 files changed, 88 insertions(+), 31 deletions(-)
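
As a rough illustration of the intended calling pattern for the new
helper (a minimal sketch only, mirroring the virt_to_xen_l*e() hunks in
the patch below rather than adding anything new):

    mfn_t l2mfn;
    l2_pgentry_t *l2t = alloc_map_clear_xen_pt(&l2mfn); /* alloc + map + clear */

    if ( !l2t )
        return NULL;                 /* allocation failed */
    /* ... write entries through the temporary mapping ... */
    UNMAP_DOMAIN_PAGE(l2t);          /* drop the mapping; l2mfn still names the page */
    /* then either install l2mfn into the parent table, or call
       free_xen_pagetable_new(l2mfn) on the error / lost-race path */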

Comments

Jan Beulich Aug. 7, 2020, 2:05 p.m. UTC | #1
On 27.07.2020 16:21, Hongyan Xia wrote:
> From: Wei Liu <wei.liu2@citrix.com>
> 
> Rewrite those functions to use the new APIs. Modify its callers to unmap
> the pointer returned. Since alloc_xen_pagetable_new() is almost never
> useful unless accompanied by page clearing and a mapping, introduce a
> helper alloc_map_clear_xen_pt() for this sequence.
> 
> Note that the change of virt_to_xen_l1e() also requires vmap_to_mfn() to
> unmap the page, which requires domain_page.h header in vmap.
> 
> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
> Reviewed-by: Jan Beulich <jbeulich@suse.com>
> 
> ---
> Changed in v8:
> - s/virtual address/linear address/.
> - BUG_ON() on NULL return in vmap_to_mfn().

The justification for this should be recorded in the description. In
reply to v7 I did even suggest how to easily address the issue you
did notice with large pages, as well as alternative behavior for
vmap_to_mfn().

> --- a/xen/include/asm-x86/page.h
> +++ b/xen/include/asm-x86/page.h
> @@ -291,7 +291,15 @@ void copy_page_sse2(void *, const void *);
>  #define pfn_to_paddr(pfn)   __pfn_to_paddr(pfn)
>  #define paddr_to_pfn(pa)    __paddr_to_pfn(pa)
>  #define paddr_to_pdx(pa)    pfn_to_pdx(paddr_to_pfn(pa))
> -#define vmap_to_mfn(va)     _mfn(l1e_get_pfn(*virt_to_xen_l1e((unsigned long)(va))))
> +
> +#define vmap_to_mfn(va) ({                                                  \
> +        const l1_pgentry_t *pl1e_ = virt_to_xen_l1e((unsigned long)(va));   \
> +        mfn_t mfn_;                                                         \
> +        BUG_ON(!pl1e_);                                                     \
> +        mfn_ = l1e_get_mfn(*pl1e_);                                         \
> +        unmap_domain_page(pl1e_);                                           \
> +        mfn_; })

Additionally - no idea why I only notice this now, this wants some
further formatting adjustment: Either

#define vmap_to_mfn(va) ({                                                \
        const l1_pgentry_t *pl1e_ = virt_to_xen_l1e((unsigned long)(va)); \
        mfn_t mfn_;                                                       \
        BUG_ON(!pl1e_);                                                   \
        mfn_ = l1e_get_mfn(*pl1e_);                                       \
        unmap_domain_page(pl1e_);                                         \
        mfn_;                                                             \
    })

or (preferably imo)

#define vmap_to_mfn(va) ({                                            \
    const l1_pgentry_t *pl1e_ = virt_to_xen_l1e((unsigned long)(va)); \
    mfn_t mfn_;                                                       \
    BUG_ON(!pl1e_);                                                   \
    mfn_ = l1e_get_mfn(*pl1e_);                                       \
    unmap_domain_page(pl1e_);                                         \
    mfn_;                                                             \
})

Jan
Hongyan Xia Aug. 13, 2020, 4:08 p.m. UTC | #2
On Fri, 2020-08-07 at 16:05 +0200, Jan Beulich wrote:
> On 27.07.2020 16:21, Hongyan Xia wrote:
> > From: Wei Liu <wei.liu2@citrix.com>
> > 
> > Rewrite those functions to use the new APIs. Modify its callers to
> > unmap
> > the pointer returned. Since alloc_xen_pagetable_new() is almost
> > never
> > useful unless accompanied by page clearing and a mapping, introduce
> > a
> > helper alloc_map_clear_xen_pt() for this sequence.
> > 
> > Note that the change of virt_to_xen_l1e() also requires
> > vmap_to_mfn() to
> > unmap the page, which requires domain_page.h header in vmap.
> > 
> > Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> > Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
> > Reviewed-by: Jan Beulich <jbeulich@suse.com>
> > 
> > ---
> > Changed in v8:
> > - s/virtual address/linear address/.
> > - BUG_ON() on NULL return in vmap_to_mfn().
> 
> The justification for this should be recorded in the description. In

Will do.

> reply to v7 I did even suggest how to easily address the issue you
> did notice with large pages, as well as alternative behavior for
> vmap_to_mfn().

One thing about adding SMALL_PAGES is that vmap is common code and I am
not sure if the Arm side is happy with it.

> 
> > --- a/xen/include/asm-x86/page.h
> > +++ b/xen/include/asm-x86/page.h
> > @@ -291,7 +291,15 @@ void copy_page_sse2(void *, const void *);
> >  #define pfn_to_paddr(pfn)   __pfn_to_paddr(pfn)
> >  #define paddr_to_pfn(pa)    __paddr_to_pfn(pa)
> >  #define paddr_to_pdx(pa)    pfn_to_pdx(paddr_to_pfn(pa))
> > -#define vmap_to_mfn(va)     _mfn(l1e_get_pfn(*virt_to_xen_l1e((unsigned long)(va))))
> > +
> > +#define vmap_to_mfn(va) ({                                                  \
> > +        const l1_pgentry_t *pl1e_ = virt_to_xen_l1e((unsigned long)(va));   \
> > +        mfn_t mfn_;                                                         \
> > +        BUG_ON(!pl1e_);                                                     \
> > +        mfn_ = l1e_get_mfn(*pl1e_);                                         \
> > +        unmap_domain_page(pl1e_);                                           \
> > +        mfn_; })
> 
> Additionally - no idea why I only notice this now, this wants some
> further formatting adjustment: Either
> 
> #define vmap_to_mfn(va) ({                                                \
>         const l1_pgentry_t *pl1e_ = virt_to_xen_l1e((unsigned long)(va)); \
>         mfn_t mfn_;                                                       \
>         BUG_ON(!pl1e_);                                                   \
>         mfn_ = l1e_get_mfn(*pl1e_);                                       \
>         unmap_domain_page(pl1e_);                                         \
>         mfn_;                                                             \
>     })
> 
> or (preferably imo)
> 
> #define vmap_to_mfn(va) ({                                            \
>     const l1_pgentry_t *pl1e_ = virt_to_xen_l1e((unsigned long)(va)); \
>     mfn_t mfn_;                                                       \
>     BUG_ON(!pl1e_);                                                   \
>     mfn_ = l1e_get_mfn(*pl1e_);                                       \
>     unmap_domain_page(pl1e_);                                         \
>     mfn_;                                                             \
> })

Will do so in the next rev.

Hongyan
Julien Grall Aug. 13, 2020, 5:22 p.m. UTC | #3
Hi,

On 13/08/2020 17:08, Hongyan Xia wrote:
> On Fri, 2020-08-07 at 16:05 +0200, Jan Beulich wrote:
>> On 27.07.2020 16:21, Hongyan Xia wrote:
>>> From: Wei Liu <wei.liu2@citrix.com>
>>>
>>> Rewrite those functions to use the new APIs. Modify its callers to
>>> unmap
>>> the pointer returned. Since alloc_xen_pagetable_new() is almost
>>> never
>>> useful unless accompanied by page clearing and a mapping, introduce
>>> a
>>> helper alloc_map_clear_xen_pt() for this sequence.
>>>
>>> Note that the change of virt_to_xen_l1e() also requires
>>> vmap_to_mfn() to
>>> unmap the page, which requires domain_page.h header in vmap.
>>>
>>> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
>>> Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
>>> Reviewed-by: Jan Beulich <jbeulich@suse.com>
>>>
>>> ---
>>> Changed in v8:
>>> - s/virtual address/linear address/.
>>> - BUG_ON() on NULL return in vmap_to_mfn().
>>
>> The justification for this should be recorded in the description. In
> 
> Will do.
> 
>> reply to v7 I did even suggest how to easily address the issue you
>> did notice with large pages, as well as alternative behavior for
>> vmap_to_mfn().
> 
> One thing about adding SMALL_PAGES is that vmap is common code and I am
> not sure if the Arm side is happy with it.

At the moment, Arm is only using small mapping but I plan to change that 
soon because we have regions that can be fairly big.

Regardless of that, the issue with vmap_to_mfn() is rather x86-specific. So
I don't particularly like the idea of exposing such a trick in common code.

Even on x86, I think this is not the right approach. Such a band-aid will
impact performance because, assuming superpages are used, it will take
longer to map and add pressure on the TLBs.

I am aware that superpages will be useful for LiveUpdate, but is there 
any use cases in upstream? If not, could we just use the BUG_ON() and 
implement correctly vmap_to_mfn() in a follow-up?

Cheers,
Jan Beulich Aug. 18, 2020, 8:49 a.m. UTC | #4
On 13.08.2020 19:22, Julien Grall wrote:
> Hi,
> 
> On 13/08/2020 17:08, Hongyan Xia wrote:
>> On Fri, 2020-08-07 at 16:05 +0200, Jan Beulich wrote:
>>> On 27.07.2020 16:21, Hongyan Xia wrote:
>>>> From: Wei Liu <wei.liu2@citrix.com>
>>>>
>>>> Rewrite those functions to use the new APIs. Modify its callers to
>>>> unmap
>>>> the pointer returned. Since alloc_xen_pagetable_new() is almost
>>>> never
>>>> useful unless accompanied by page clearing and a mapping, introduce
>>>> a
>>>> helper alloc_map_clear_xen_pt() for this sequence.
>>>>
>>>> Note that the change of virt_to_xen_l1e() also requires
>>>> vmap_to_mfn() to
>>>> unmap the page, which requires domain_page.h header in vmap.
>>>>
>>>> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
>>>> Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
>>>> Reviewed-by: Jan Beulich <jbeulich@suse.com>
>>>>
>>>> ---
>>>> Changed in v8:
>>>> - s/virtual address/linear address/.
>>>> - BUG_ON() on NULL return in vmap_to_mfn().
>>>
>>> The justification for this should be recorded in the description. In
>>
>> Will do.
>>
>>> reply to v7 I did even suggest how to easily address the issue you
>>> did notice with large pages, as well as alternative behavior for
>>> vmap_to_mfn().
>>
>> One thing about adding SMALL_PAGES is that vmap is common code and I am
>> not sure if the Arm side is happy with it.
> 
> At the moment, Arm is only using small mapping but I plan to change that soon because we have regions that can be fairly big.
> 
> Regardless that, the issue with vmap_to_mfn() is rather x86 specific. So I don't particularly like the idea to expose such trick in common code.
> 
> Even on x86, I think this is not the right approach. Such band-aid will impact the performance as, assuming superpages are used, it will take longer to map and add pressure on the TLBs.
> 
> I am aware that superpages will be useful for LiveUpdate, but is there any use cases in upstream?

Superpage use by vmalloc() is purely occasional: You'd have to vmalloc()
2Mb or more _and_ the page-wise allocation ought to return 512
consecutive pages in the right order. Getting 512 consecutive pages is
possible in practice, but with the page allocator allocating top-down it
is very unlikely for them to be returned in increasing-sorted order.

> If not, could we just use the BUG_ON() and implement correctly vmap_to_mfn() in a follow-up?

My main concern with the BUG_ON() is that it detects a problem long after
it was introduced (when the mapping was established). I'd rather see a
BUG_ON() added there if use of MAP_SMALL_PAGES is deemed unsuitable.

Jan
Julien Grall Aug. 18, 2020, 10:13 a.m. UTC | #5
Hi Jan,

On 18/08/2020 09:49, Jan Beulich wrote:
> On 13.08.2020 19:22, Julien Grall wrote:
>> Hi,
>>
>> On 13/08/2020 17:08, Hongyan Xia wrote:
>>> On Fri, 2020-08-07 at 16:05 +0200, Jan Beulich wrote:
>>>> On 27.07.2020 16:21, Hongyan Xia wrote:
>>>>> From: Wei Liu <wei.liu2@citrix.com>
>>>>>
>>>>> Rewrite those functions to use the new APIs. Modify its callers to
>>>>> unmap
>>>>> the pointer returned. Since alloc_xen_pagetable_new() is almost
>>>>> never
>>>>> useful unless accompanied by page clearing and a mapping, introduce
>>>>> a
>>>>> helper alloc_map_clear_xen_pt() for this sequence.
>>>>>
>>>>> Note that the change of virt_to_xen_l1e() also requires
>>>>> vmap_to_mfn() to
>>>>> unmap the page, which requires domain_page.h header in vmap.
>>>>>
>>>>> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
>>>>> Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
>>>>> Reviewed-by: Jan Beulich <jbeulich@suse.com>
>>>>>
>>>>> ---
>>>>> Changed in v8:
>>>>> - s/virtual address/linear address/.
>>>>> - BUG_ON() on NULL return in vmap_to_mfn().
>>>>
>>>> The justification for this should be recorded in the description. In
>>>
>>> Will do.
>>>
>>>> reply to v7 I did even suggest how to easily address the issue you
>>>> did notice with large pages, as well as alternative behavior for
>>>> vmap_to_mfn().
>>>
>>> One thing about adding SMALL_PAGES is that vmap is common code and I am
>>> not sure if the Arm side is happy with it.
>>
>> At the moment, Arm is only using small mapping but I plan to change that soon because we have regions that can be fairly big.
>>
>> Regardless that, the issue with vmap_to_mfn() is rather x86 specific. So I don't particularly like the idea to expose such trick in common code.
>>
>> Even on x86, I think this is not the right approach. Such band-aid will impact the performance as, assuming superpages are used, it will take longer to map and add pressure on the TLBs.
>>
>> I am aware that superpages will be useful for LiveUpdate, but is there any use cases in upstream?
> 
> Superpage use by vmalloc() is purely occasional: You'd have to vmalloc()
> 2Mb or more _and_ the page-wise allocation ought to return 512
> consecutive pages in the right order. Getting 512 consecutive pages is
> possible in practice, but with the page allocator allocating top-down it
> is very unlikely for them to be returned in increasing-sorted order.
So your assumption here is that vmap_to_mfn() can only be called on a
vmalloc-ed area. While this may be the case in Xen today, the name
clearly suggests it can be called on any vmap-ed region.

> 
>> If not, could we just use the BUG_ON() and implement correctly vmap_to_mfn() in a follow-up?
> 
> My main concern with the BUG_ON() is that it detects a problem long after
> it was introduced (when the mapping was established). I'd rather see a
> BUG_ON() added there if use of MAP_SMALL_PAGES is deemed unsuitable.

From what you wrote, I would agree that vmalloc() is unlikely to use
superpages. But this is not going to solve the underlying problem with
the rest of the vmap area.

So are you suggesting to use MAP_SMALL_PAGES for *all* the vmap()?

Cheers,
Jan Beulich Aug. 18, 2020, 11:30 a.m. UTC | #6
On 18.08.2020 12:13, Julien Grall wrote:
> Hi Jan,
> 
> On 18/08/2020 09:49, Jan Beulich wrote:
>> On 13.08.2020 19:22, Julien Grall wrote:
>>> Hi,
>>>
>>> On 13/08/2020 17:08, Hongyan Xia wrote:
>>>> On Fri, 2020-08-07 at 16:05 +0200, Jan Beulich wrote:
>>>>> On 27.07.2020 16:21, Hongyan Xia wrote:
>>>>>> From: Wei Liu <wei.liu2@citrix.com>
>>>>>>
>>>>>> Rewrite those functions to use the new APIs. Modify its callers to
>>>>>> unmap
>>>>>> the pointer returned. Since alloc_xen_pagetable_new() is almost
>>>>>> never
>>>>>> useful unless accompanied by page clearing and a mapping, introduce
>>>>>> a
>>>>>> helper alloc_map_clear_xen_pt() for this sequence.
>>>>>>
>>>>>> Note that the change of virt_to_xen_l1e() also requires
>>>>>> vmap_to_mfn() to
>>>>>> unmap the page, which requires domain_page.h header in vmap.
>>>>>>
>>>>>> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
>>>>>> Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
>>>>>> Reviewed-by: Jan Beulich <jbeulich@suse.com>
>>>>>>
>>>>>> ---
>>>>>> Changed in v8:
>>>>>> - s/virtual address/linear address/.
>>>>>> - BUG_ON() on NULL return in vmap_to_mfn().
>>>>>
>>>>> The justification for this should be recorded in the description. In
>>>>
>>>> Will do.
>>>>
>>>>> reply to v7 I did even suggest how to easily address the issue you
>>>>> did notice with large pages, as well as alternative behavior for
>>>>> vmap_to_mfn().
>>>>
>>>> One thing about adding SMALL_PAGES is that vmap is common code and I am
>>>> not sure if the Arm side is happy with it.
>>>
>>> At the moment, Arm is only using small mapping but I plan to change that soon because we have regions that can be fairly big.
>>>
>>> Regardless that, the issue with vmap_to_mfn() is rather x86 specific. So I don't particularly like the idea to expose such trick in common code.
>>>
>>> Even on x86, I think this is not the right approach. Such band-aid will impact the performance as, assuming superpages are used, it will take longer to map and add pressure on the TLBs.
>>>
>>> I am aware that superpages will be useful for LiveUpdate, but is there any use cases in upstream?
>>
>> Superpage use by vmalloc() is purely occasional: You'd have to vmalloc()
>> 2Mb or more _and_ the page-wise allocation ought to return 512
>> consecutive pages in the right order. Getting 512 consecutive pages is
>> possible in practice, but with the page allocator allocating top-down it
>> is very unlikely for them to be returned in increasing-sorted order.
> So your assumption here is vmap_to_mfn() can only be called on vmalloc-ed() area. While this may be the case in Xen today, the name clearly suggest it can be called on all vmap-ed region.

No, I don't make this assumption, and I did spell this out in an earlier
reply to Hongyan: Parties using vmap() on a sufficiently large address
range with consecutive MFNs simply have to be aware that they may not
call vmap_to_mfn(). And why would they? They had the MFNs in their hands
at the time of mapping, so no need to (re-)obtain them by looking up the
translation.

>>> If not, could we just use the BUG_ON() and implement correctly vmap_to_mfn() in a follow-up?
>>
>> My main concern with the BUG_ON() is that it detects a problem long after
>> it was introduced (when the mapping was established). I'd rather see a
>> BUG_ON() added there if use of MAP_SMALL_PAGES is deemed unsuitable.
> 
> From what you wrote, I would agree that vmalloc() is unlikely going to use superpages. But this is not going to solve the underlying problem with the rest of the vmap area.
> 
> So are you suggesting to use MAP_SMALL_PAGES for *all* the vmap()?

As per above - no.

Jan
Julien Grall Aug. 18, 2020, 1:08 p.m. UTC | #7
Hi Jan,

On 18/08/2020 12:30, Jan Beulich wrote:
> On 18.08.2020 12:13, Julien Grall wrote:
>> Hi Jan,
>>
>> On 18/08/2020 09:49, Jan Beulich wrote:
>>> On 13.08.2020 19:22, Julien Grall wrote:
>>>> Hi,
>>>>
>>>> On 13/08/2020 17:08, Hongyan Xia wrote:
>>>>> On Fri, 2020-08-07 at 16:05 +0200, Jan Beulich wrote:
>>>>>> On 27.07.2020 16:21, Hongyan Xia wrote:
>>>>>>> From: Wei Liu <wei.liu2@citrix.com>
>>>>>>>
>>>>>>> Rewrite those functions to use the new APIs. Modify its callers to
>>>>>>> unmap
>>>>>>> the pointer returned. Since alloc_xen_pagetable_new() is almost
>>>>>>> never
>>>>>>> useful unless accompanied by page clearing and a mapping, introduce
>>>>>>> a
>>>>>>> helper alloc_map_clear_xen_pt() for this sequence.
>>>>>>>
>>>>>>> Note that the change of virt_to_xen_l1e() also requires
>>>>>>> vmap_to_mfn() to
>>>>>>> unmap the page, which requires domain_page.h header in vmap.
>>>>>>>
>>>>>>> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
>>>>>>> Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
>>>>>>> Reviewed-by: Jan Beulich <jbeulich@suse.com>
>>>>>>>
>>>>>>> ---
>>>>>>> Changed in v8:
>>>>>>> - s/virtual address/linear address/.
>>>>>>> - BUG_ON() on NULL return in vmap_to_mfn().
>>>>>>
>>>>>> The justification for this should be recorded in the description. In
>>>>>
>>>>> Will do.
>>>>>
>>>>>> reply to v7 I did even suggest how to easily address the issue you
>>>>>> did notice with large pages, as well as alternative behavior for
>>>>>> vmap_to_mfn().
>>>>>
>>>>> One thing about adding SMALL_PAGES is that vmap is common code and I am
>>>>> not sure if the Arm side is happy with it.
>>>>
>>>> At the moment, Arm is only using small mapping but I plan to change that soon because we have regions that can be fairly big.
>>>>
>>>> Regardless that, the issue with vmap_to_mfn() is rather x86 specific. So I don't particularly like the idea to expose such trick in common code.
>>>>
>>>> Even on x86, I think this is not the right approach. Such band-aid will impact the performance as, assuming superpages are used, it will take longer to map and add pressure on the TLBs.
>>>>
>>>> I am aware that superpages will be useful for LiveUpdate, but is there any use cases in upstream?
>>>
>>> Superpage use by vmalloc() is purely occasional: You'd have to vmalloc()
>>> 2Mb or more _and_ the page-wise allocation ought to return 512
>>> consecutive pages in the right order. Getting 512 consecutive pages is
>>> possible in practice, but with the page allocator allocating top-down it
>>> is very unlikely for them to be returned in increasing-sorted order.
>> So your assumption here is vmap_to_mfn() can only be called on vmalloc-ed() area. While this may be the case in Xen today, the name clearly suggest it can be called on all vmap-ed region.
> 
> No, I don't make this assumption, and I did spell this out in an earlier
> reply to Hongyan: Parties using vmap() on a sufficiently large address
> range with consecutive MFNs simply have to be aware that they may not
> call vmap_to_mfn().

You make it sound easy to be aware of, however there are two
implementations of vmap_to_mfn() (one per arch). Even looking at the x86
version, it is not obvious there is a restriction.

So I am a bit concerned about "misuse" of the function. This could
possibly be documented, although I am not entirely happy to restrict the
use because of x86.

> And why would they? They had the MFNs in their hands
> at the time of mapping, so no need to (re-)obtain them by looking up the
> translation.

It may not always be convenient to keep the MFN in hand for the duration 
of the mapping.

An example was discussed in [1]. Roughly, the code would look like:

acpi_os_alloc_memory(...)
{
      mfn = alloc_boot_pages(...);
      vmap(mfn, ...);
}


acpi_os_free_memory(...)
{
     mfn = vmap_to_mfn(...);
     vunmap(...);

     init_boot_pages(mfn, ...);
}

>>>> If not, could we just use the BUG_ON() and implement correctly vmap_to_mfn() in a follow-up?
>>>
>>> My main concern with the BUG_ON() is that it detects a problem long after
>>> it was introduced (when the mapping was established). I'd rather see a
>>> BUG_ON() added there if use of MAP_SMALL_PAGES is deemed unsuitable.
>>
>>  From what you wrote, I would agree that vmalloc() is unlikely going to use superpages. But this is not going to solve the underlying problem with the rest of the vmap area.
>>
>> So are you suggesting to use MAP_SMALL_PAGES for *all* the vmap()?
> 
> As per above - no.
> 
> Jan
> 

[1] 
<a71d1903267b84afdb0e54fa2ac55540ab2f9357.1588278317.git.hongyxia@amazon.com>
Jan Beulich Aug. 18, 2020, 4:16 p.m. UTC | #8
On 18.08.2020 15:08, Julien Grall wrote:
> Hi Jan,
> 
> On 18/08/2020 12:30, Jan Beulich wrote:
>> On 18.08.2020 12:13, Julien Grall wrote:
>>> Hi Jan,
>>>
>>> On 18/08/2020 09:49, Jan Beulich wrote:
>>>> On 13.08.2020 19:22, Julien Grall wrote:
>>>>> Hi,
>>>>>
>>>>> On 13/08/2020 17:08, Hongyan Xia wrote:
>>>>>> On Fri, 2020-08-07 at 16:05 +0200, Jan Beulich wrote:
>>>>>>> On 27.07.2020 16:21, Hongyan Xia wrote:
>>>>>>>> From: Wei Liu <wei.liu2@citrix.com>
>>>>>>>>
>>>>>>>> Rewrite those functions to use the new APIs. Modify its callers to
>>>>>>>> unmap
>>>>>>>> the pointer returned. Since alloc_xen_pagetable_new() is almost
>>>>>>>> never
>>>>>>>> useful unless accompanied by page clearing and a mapping, introduce
>>>>>>>> a
>>>>>>>> helper alloc_map_clear_xen_pt() for this sequence.
>>>>>>>>
>>>>>>>> Note that the change of virt_to_xen_l1e() also requires
>>>>>>>> vmap_to_mfn() to
>>>>>>>> unmap the page, which requires domain_page.h header in vmap.
>>>>>>>>
>>>>>>>> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
>>>>>>>> Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
>>>>>>>> Reviewed-by: Jan Beulich <jbeulich@suse.com>
>>>>>>>>
>>>>>>>> ---
>>>>>>>> Changed in v8:
>>>>>>>> - s/virtual address/linear address/.
>>>>>>>> - BUG_ON() on NULL return in vmap_to_mfn().
>>>>>>>
>>>>>>> The justification for this should be recorded in the description. In
>>>>>>
>>>>>> Will do.
>>>>>>
>>>>>>> reply to v7 I did even suggest how to easily address the issue you
>>>>>>> did notice with large pages, as well as alternative behavior for
>>>>>>> vmap_to_mfn().
>>>>>>
>>>>>> One thing about adding SMALL_PAGES is that vmap is common code and I am
>>>>>> not sure if the Arm side is happy with it.
>>>>>
>>>>> At the moment, Arm is only using small mapping but I plan to change that soon because we have regions that can be fairly big.
>>>>>
>>>>> Regardless that, the issue with vmap_to_mfn() is rather x86 specific. So I don't particularly like the idea to expose such trick in common code.
>>>>>
>>>>> Even on x86, I think this is not the right approach. Such band-aid will impact the performance as, assuming superpages are used, it will take longer to map and add pressure on the TLBs.
>>>>>
>>>>> I am aware that superpages will be useful for LiveUpdate, but is there any use cases in upstream?
>>>>
>>>> Superpage use by vmalloc() is purely occasional: You'd have to vmalloc()
>>>> 2Mb or more _and_ the page-wise allocation ought to return 512
>>>> consecutive pages in the right order. Getting 512 consecutive pages is
>>>> possible in practice, but with the page allocator allocating top-down it
>>>> is very unlikely for them to be returned in increasing-sorted order.
>>> So your assumption here is vmap_to_mfn() can only be called on vmalloc-ed() area. While this may be the case in Xen today, the name clearly suggest it can be called on all vmap-ed region.
>>
>> No, I don't make this assumption, and I did spell this out in an earlier
>> reply to Hongyan: Parties using vmap() on a sufficiently large address
>> range with consecutive MFNs simply have to be aware that they may not
>> call vmap_to_mfn().
> 
> You make it sounds easy to be aware, however there are two implementations of vmap_to_mfn() (one per arch). Even looking at the x86 version, it is not obvious there is a restriction.

I didn't mean to make it sound like this - I agree it's not an obvious
restriction.

> So I am a bit concerned of the "misuse" of the function. This could possibly be documented. Although, I am not entirely happy to restrict the use because of x86.

Unless the underlying issue gets fixed, we need _some_ form of bodge.
I'm not really happy with the BUG_ON() as proposed by Hongyan. You're
not really happy with my first proposed alternative, and you didn't
comment on the 2nd one (still kept in context below). Not sure what
to do: Throw dice?

>> And why would they? They had the MFNs in their hands
>> at the time of mapping, so no need to (re-)obtain them by looking up the
>> translation.
> 
> It may not always be convenient to keep the MFN in hand for the duration of the mapping.
> 
> An example was discussed in [1]. Roughly, the code would look like:
> 
> acpi_os_alloc_memory(...)
> {
>      mfn = alloc_boot_pages(...);
>      vmap(mfn, ...);
> }
> 
> 
> acpi_os_free_memory(...)
> {
>     mfn = vmap_to_mfn(...);
>     vunmap(...);
> 
>     init_boot_pages(mfn, ...);
> }

To a certain degree I'd consider this an abuse of the interface, but
yes, I see your point (and I was aware that there are possible cases
where keeping the MFN(s) in hand may be undesirable). Then again
there's very little use of vmap_to_mfn() in the first place, so I
didn't think it was very likely that a problematic case would appear
before the issue is fixed properly.

Jan

>>>>> If not, could we just use the BUG_ON() and implement correctly vmap_to_mfn() in a follow-up?
>>>>
>>>> My main concern with the BUG_ON() is that it detects a problem long after
>>>> it was introduced (when the mapping was established). I'd rather see a
>>>> BUG_ON() added there if use of MAP_SMALL_PAGES is deemed unsuitable.
>>>
>>>  From what you wrote, I would agree that vmalloc() is unlikely going to use superpages. But this is not going to solve the underlying problem with the rest of the vmap area.
>>>
>>> So are you suggesting to use MAP_SMALL_PAGES for *all* the vmap()?
>>
>> As per above - no.
>>
>> Jan
>>
> 
> [1] <a71d1903267b84afdb0e54fa2ac55540ab2f9357.1588278317.git.hongyxia@amazon.com>
>
Hongyan Xia Nov. 30, 2020, 12:13 p.m. UTC | #9
Sorry for the late reply. Been busy with something else.

On Tue, 2020-08-18 at 18:16 +0200, Jan Beulich wrote:
> On 18.08.2020 15:08, Julien Grall wrote:
> > Hi Jan,
> > 
> > On 18/08/2020 12:30, Jan Beulich wrote:
> > > On 18.08.2020 12:13, Julien Grall wrote:
> > > > Hi Jan,
> > > > 
> > > > On 18/08/2020 09:49, Jan Beulich wrote:
> > > > > On 13.08.2020 19:22, Julien Grall wrote:
> > > > > > Hi,
> > > > > > 
> > > > > > On 13/08/2020 17:08, Hongyan Xia wrote:
> > > > > > > On Fri, 2020-08-07 at 16:05 +0200, Jan Beulich wrote:
> > > > > > > > On 27.07.2020 16:21, Hongyan Xia wrote:
> > > > > > > > > From: Wei Liu <wei.liu2@citrix.com>
> > > > > > > > > 
> > > > > > > > > Rewrite those functions to use the new APIs. Modify
> > > > > > > > > its callers to
> > > > > > > > > unmap
> > > > > > > > > the pointer returned. Since alloc_xen_pagetable_new()
> > > > > > > > > is almost
> > > > > > > > > never
> > > > > > > > > useful unless accompanied by page clearing and a
> > > > > > > > > mapping, introduce
> > > > > > > > > a
> > > > > > > > > helper alloc_map_clear_xen_pt() for this sequence.
> > > > > > > > > 
> > > > > > > > > Note that the change of virt_to_xen_l1e() also
> > > > > > > > > requires
> > > > > > > > > vmap_to_mfn() to
> > > > > > > > > unmap the page, which requires domain_page.h header
> > > > > > > > > in vmap.
> > > > > > > > > 
> > > > > > > > > Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> > > > > > > > > Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
> > > > > > > > > Reviewed-by: Jan Beulich <jbeulich@suse.com>
> > > > > > > > > 
> > > > > > > > > ---
> > > > > > > > > Changed in v8:
> > > > > > > > > - s/virtual address/linear address/.
> > > > > > > > > - BUG_ON() on NULL return in vmap_to_mfn().
> > > > > > > > 
> > > > > > > > The justification for this should be recorded in the
> > > > > > > > description. In
> > > > > > > 
> > > > > > > Will do.
> > > > > > > 
> > > > > > > > reply to v7 I did even suggest how to easily address
> > > > > > > > the issue you
> > > > > > > > did notice with large pages, as well as alternative
> > > > > > > > behavior for
> > > > > > > > vmap_to_mfn().
> > > > > > > 
> > > > > > > One thing about adding SMALL_PAGES is that vmap is common
> > > > > > > code and I am
> > > > > > > not sure if the Arm side is happy with it.
> > > > > > 
> > > > > > At the moment, Arm is only using small mapping but I plan
> > > > > > to change that soon because we have regions that can be
> > > > > > fairly big.
> > > > > > 
> > > > > > Regardless that, the issue with vmap_to_mfn() is rather x86
> > > > > > specific. So I don't particularly like the idea to expose
> > > > > > such trick in common code.
> > > > > > 
> > > > > > Even on x86, I think this is not the right approach. Such
> > > > > > band-aid will impact the performance as, assuming
> > > > > > superpages are used, it will take longer to map and add
> > > > > > pressure on the TLBs.
> > > > > > 
> > > > > > I am aware that superpages will be useful for LiveUpdate,
> > > > > > but is there any use cases in upstream?
> > > > > 
> > > > > Superpage use by vmalloc() is purely occasional: You'd have
> > > > > to vmalloc()
> > > > > 2Mb or more _and_ the page-wise allocation ought to return
> > > > > 512
> > > > > consecutive pages in the right order. Getting 512 consecutive
> > > > > pages is
> > > > > possible in practice, but with the page allocator allocating
> > > > > top-down it
> > > > > is very unlikely for them to be returned in increasing-sorted 
> > > > > order.
> > > > 
> > > > So your assumption here is vmap_to_mfn() can only be called on
> > > > vmalloc-ed() area. While this may be the case in Xen today, the
> > > > name clearly suggest it can be called on all vmap-ed region.
> > > 
> > > No, I don't make this assumption, and I did spell this out in an
> > > earlier
> > > reply to Hongyan: Parties using vmap() on a sufficiently large
> > > address
> > > range with consecutive MFNs simply have to be aware that they may
> > > not
> > > call vmap_to_mfn().
> > 
> > You make it sounds easy to be aware, however there are two
> > implementations of vmap_to_mfn() (one per arch). Even looking at
> > the x86 version, it is not obvious there is a restriction.
> 
> I didn't mean to make it sound like this - I agree it's not an
> obvious
> restriction.
> 
> > So I am a bit concerned of the "misuse" of the function. This could
> > possibly be documented. Although, I am not entirely happy to
> > restrict the use because of x86.
> 
> Unless the underlying issue gets fixed, we need _some_ form of bodge.
> I'm not really happy with the BUG_ON() as proposed by Hongyan. You're
> not really happy with my first proposed alternative, and you didn't
> comment on the 2nd one (still kept in context below). Not sure what
> to do: Throw dice?

Actually I did not propose the BUG_ON() fix. I was just in favor of it
when Jan presented it as a choice in v7.

The reason I am in favor of it is that even without it, the existing
x86 code already BUG_ON() when vmap has a superpage anyway, so it's not
like other alternatives behave any differently for superpages. I am
also not sure about returning INVALID_MFN because if virt_to_xen_l1e()
really returns NULL, then we are calling vmap_to_mfn() on a non-vmap
address (not even populated) which frankly deserves at least ASSERT().
So, I don't think BUG_ON() is a bad idea for now before vmap_to_mfn()
supports superpages.

Of course, we could use MAP_SMALL_PAGES to avoid the whole problem, but
Arm may not be happy. After a quick chat with Julien, how about having
ARCH_VMAP_FLAGS and only small pages for x86?

Hongyan
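
As a purely illustrative sketch of that idea (ARCH_VMAP_FLAGS does not
exist in the tree; the name and placement are assumptions), the per-arch
override could look like:

/* hypothetical, in an x86 header such as asm-x86/page.h */
#define ARCH_VMAP_FLAGS  MAP_SMALL_PAGES   /* keep x86 vmap mappings 4k */

/* hypothetical fallback for other architectures, in common code */
#ifndef ARCH_VMAP_FLAGS
#define ARCH_VMAP_FLAGS  0
#endif

common/vmap.c would then OR ARCH_VMAP_FLAGS into the flags it hands to
map_pages_to_xen(), so only x86 would be forced to small pages.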
Jan Beulich Nov. 30, 2020, 12:50 p.m. UTC | #10
On 30.11.2020 13:13, Hongyan Xia wrote:
> On Tue, 2020-08-18 at 18:16 +0200, Jan Beulich wrote:
>> On 18.08.2020 15:08, Julien Grall wrote:
>>> On 18/08/2020 12:30, Jan Beulich wrote:
>>>> On 18.08.2020 12:13, Julien Grall wrote:
>>>>> On 18/08/2020 09:49, Jan Beulich wrote:
>>>>>> On 13.08.2020 19:22, Julien Grall wrote:
>>>>>>> On 13/08/2020 17:08, Hongyan Xia wrote:
>>>>>>>> On Fri, 2020-08-07 at 16:05 +0200, Jan Beulich wrote:
>>>>>>>>> On 27.07.2020 16:21, Hongyan Xia wrote:
>>>>>>>>>> From: Wei Liu <wei.liu2@citrix.com>
>>>>>>>>>>
>>>>>>>>>> Rewrite those functions to use the new APIs. Modify
>>>>>>>>>> its callers to
>>>>>>>>>> unmap
>>>>>>>>>> the pointer returned. Since alloc_xen_pagetable_new()
>>>>>>>>>> is almost
>>>>>>>>>> never
>>>>>>>>>> useful unless accompanied by page clearing and a
>>>>>>>>>> mapping, introduce
>>>>>>>>>> a
>>>>>>>>>> helper alloc_map_clear_xen_pt() for this sequence.
>>>>>>>>>>
>>>>>>>>>> Note that the change of virt_to_xen_l1e() also
>>>>>>>>>> requires
>>>>>>>>>> vmap_to_mfn() to
>>>>>>>>>> unmap the page, which requires domain_page.h header
>>>>>>>>>> in vmap.
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
>>>>>>>>>> Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
>>>>>>>>>> Reviewed-by: Jan Beulich <jbeulich@suse.com>
>>>>>>>>>>
>>>>>>>>>> ---
>>>>>>>>>> Changed in v8:
>>>>>>>>>> - s/virtual address/linear address/.
>>>>>>>>>> - BUG_ON() on NULL return in vmap_to_mfn().
>>>>>>>>>
>>>>>>>>> The justification for this should be recorded in the
>>>>>>>>> description. In
>>>>>>>>
>>>>>>>> Will do.
>>>>>>>>
>>>>>>>>> reply to v7 I did even suggest how to easily address
>>>>>>>>> the issue you
>>>>>>>>> did notice with large pages, as well as alternative
>>>>>>>>> behavior for
>>>>>>>>> vmap_to_mfn().
>>>>>>>>
>>>>>>>> One thing about adding SMALL_PAGES is that vmap is common
>>>>>>>> code and I am
>>>>>>>> not sure if the Arm side is happy with it.
>>>>>>>
>>>>>>> At the moment, Arm is only using small mapping but I plan
>>>>>>> to change that soon because we have regions that can be
>>>>>>> fairly big.
>>>>>>>
>>>>>>> Regardless that, the issue with vmap_to_mfn() is rather x86
>>>>>>> specific. So I don't particularly like the idea to expose
>>>>>>> such trick in common code.
>>>>>>>
>>>>>>> Even on x86, I think this is not the right approach. Such
>>>>>>> band-aid will impact the performance as, assuming
>>>>>>> superpages are used, it will take longer to map and add
>>>>>>> pressure on the TLBs.
>>>>>>>
>>>>>>> I am aware that superpages will be useful for LiveUpdate,
>>>>>>> but is there any use cases in upstream?
>>>>>>
>>>>>> Superpage use by vmalloc() is purely occasional: You'd have
>>>>>> to vmalloc()
>>>>>> 2Mb or more _and_ the page-wise allocation ought to return
>>>>>> 512
>>>>>> consecutive pages in the right order. Getting 512 consecutive
>>>>>> pages is
>>>>>> possible in practice, but with the page allocator allocating
>>>>>> top-down it
>>>>>> is very unlikely for them to be returned in increasing-sorted 
>>>>>> order.
>>>>>
>>>>> So your assumption here is vmap_to_mfn() can only be called on
>>>>> vmalloc-ed() area. While this may be the case in Xen today, the
>>>>> name clearly suggest it can be called on all vmap-ed region.
>>>>
>>>> No, I don't make this assumption, and I did spell this out in an
>>>> earlier
>>>> reply to Hongyan: Parties using vmap() on a sufficiently large
>>>> address
>>>> range with consecutive MFNs simply have to be aware that they may
>>>> not
>>>> call vmap_to_mfn().
>>>
>>> You make it sounds easy to be aware, however there are two
>>> implementations of vmap_to_mfn() (one per arch). Even looking at
>>> the x86 version, it is not obvious there is a restriction.
>>
>> I didn't mean to make it sound like this - I agree it's not an
>> obvious
>> restriction.
>>
>>> So I am a bit concerned of the "misuse" of the function. This could
>>> possibly be documented. Although, I am not entirely happy to
>>> restrict the use because of x86.
>>
>> Unless the underlying issue gets fixed, we need _some_ form of bodge.
>> I'm not really happy with the BUG_ON() as proposed by Hongyan. You're
>> not really happy with my first proposed alternative, and you didn't
>> comment on the 2nd one (still kept in context below). Not sure what
>> to do: Throw dice?
> 
> Actually I did not propose the BUG_ON() fix. I was just in favor of it
> when Jan presented it as a choice in v7.
> 
> The reason I am in favor of it is that even without it, the existing
> x86 code already BUG_ON() when vmap has a superpage anyway, so it's not
> like other alternatives behave any differently for superpages. I am
> also not sure about returning INVALID_MFN because if virt_to_xen_l1e()
> really returns NULL, then we are calling vmap_to_mfn() on a non-vmap
> address (not even populated) which frankly deserves at least ASSERT().
> So, I don't think BUG_ON() is a bad idea for now before vmap_to_mfn()
> supports superpages.
> 
> Of course, we could use MAP_SMALL_PAGES to avoid the whole problem, but
> Arm may not be happy. After a quick chat with Julien, how about having
> ARCH_VMAP_FLAGS and only small pages for x86?

Possibly, albeit this will then make it look less like a bodge and
more like we would want to keep it this way. How difficult would it
be to actually make the thing work with superpages? Is it more than
simply
(pseudocode, potentially needed locking omitted):

vmap_to_mfn(va) {
    pl1e = virt_to_xen_l1e(va);
    if ( pl1e )
        return l1e_get_mfn(*pl1e);
    pl2e = virt_to_xen_l2e(va);
    if ( pl2e )
        return l2e_get_mfn(*pl2e) + suitable_bits(va);
    return l3e_get_mfn(*virt_to_xen_l3e(va)) + suitable_bits(va);
}

(assuming virt_to_xen_l<N>e() would be returning NULL in such a case)?
Not very efficient, but not needed anywhere anyway - the sole user of
the construct is domain_page_map_to_mfn(), which maps only individual
pages. (An even better option would be to avoid the recurring walk of
the higher levels by using only virt_to_xen_l3e() and then doing the
remaining steps "by hand". This would at once allow to avoid the here
unwanted / unneeded checking for whether page tables need allocating.)

Jan
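
For reference, a superpage-aware lookup along the lines of the "even
better option" might look roughly like the sketch below (untested;
written as a function rather than a macro for readability, and with the
present-flag checks and locking omitted just as in the pseudocode
above). Only existing accessors are used; the masks are the usual 2M/1G
page-offset calculations:

static mfn_t vmap_to_mfn_sketch(const void *va)
{
    unsigned long v = (unsigned long)va;
    l3_pgentry_t *pl3e = virt_to_xen_l3e(v), l3e;
    l2_pgentry_t *pl2e, l2e;
    l1_pgentry_t *pl1e, l1e;

    BUG_ON(!pl3e);
    l3e = *pl3e;
    unmap_domain_page(pl3e);

    if ( l3e_get_flags(l3e) & _PAGE_PSE )      /* 1G mapping */
        return mfn_add(l3e_get_mfn(l3e),
                       (v >> PAGE_SHIFT) & ((1UL << (2 * PAGETABLE_ORDER)) - 1));

    pl2e = map_l2t_from_l3e(l3e) + l2_table_offset(v);
    l2e = *pl2e;
    unmap_domain_page(pl2e);

    if ( l2e_get_flags(l2e) & _PAGE_PSE )      /* 2M mapping */
        return mfn_add(l2e_get_mfn(l2e),
                       (v >> PAGE_SHIFT) & ((1UL << PAGETABLE_ORDER) - 1));

    pl1e = map_l1t_from_l2e(l2e) + l1_table_offset(v);
    l1e = *pl1e;
    unmap_domain_page(pl1e);

    return l1e_get_mfn(l1e);                   /* 4k mapping */
}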
Hongyan Xia Nov. 30, 2020, 2:13 p.m. UTC | #11
On Mon, 2020-11-30 at 13:50 +0100, Jan Beulich wrote:
> On 30.11.2020 13:13, Hongyan Xia wrote:
> > On Tue, 2020-08-18 at 18:16 +0200, Jan Beulich wrote:
> > [...]
> > 
> > Actually I did not propose the BUG_ON() fix. I was just in favor of
> > it
> > when Jan presented it as a choice in v7.
> > 
> > The reason I am in favor of it is that even without it, the
> > existing
> > x86 code already BUG_ON() when vmap has a superpage anyway, so it's
> > not
> > like other alternatives behave any differently for superpages. I am
> > also not sure about returning INVALID_MFN because if
> > virt_to_xen_l1e()
> > really returns NULL, then we are calling vmap_to_mfn() on a non-
> > vmap
> > address (not even populated) which frankly deserves at least
> > ASSERT().
> > So, I don't think BUG_ON() is a bad idea for now before
> > vmap_to_mfn()
> > supports superpages.
> > 
> > Of course, we could use MAP_SMALL_PAGES to avoid the whole problem,
> > but
> > Arm may not be happy. After a quick chat with Julien, how about
> > having
> > ARCH_VMAP_FLAGS and only small pages for x86?
> 
> Possibly, albeit this will then make it look less like a bodge and
> more like we would want to keep it this way. How difficult would it
> be to actually make the thing work with superpages? Is it more than
> simply
> (pseudocode, potentially needed locking omitted):
> 
> vmap_to_mfn(va) {
>     pl1e = virt_to_xen_l1e(va);
>     if ( pl1e )
>         return l1e_get_mfn(*pl1e);
>     pl2e = virt_to_xen_l2e(va);
>     if ( pl2e )
>         return l2e_get_mfn(*pl2e) + suitable_bits(va);
>     return l3e_get_mfn(*virt_to_xen_l3e(va)) + suitable_bits(va);
> }
> 
> (assuming virt_to_xen_l<N>e() would be returning NULL in such a
> case)?

The sad part is that instead of returning NULL, such functions BUG_ON()
when there is a superpage, which is why this solution was not considered.
Changing the logic from BUG_ON to returning NULL might not be
straightforward, since so far the callers assume NULL means -ENOMEM and
nothing else.

> Not very efficient, but not needed anywhere anyway - the sole user of
> the construct is domain_page_map_to_mfn(), which maps only individual
> pages. (An even better option would be to avoid the recurring walk of
> the higher levels by using only virt_to_xen_l3e() and then doing the
> remaining steps "by hand". This would at once allow to avoid the here
> unwanted / unneeded checking for whether page tables need
> allocating.)

The "even better option" looks more promising to me, and is what I want
to go forward with. At any rate, this fix grows larger than intended
and I want to send this as an individual patch. Any objections?

Hongyan
Jan Beulich Nov. 30, 2020, 2:47 p.m. UTC | #12
On 30.11.2020 15:13, Hongyan Xia wrote:
> On Mon, 2020-11-30 at 13:50 +0100, Jan Beulich wrote:
>> Not very efficient, but not needed anywhere anyway - the sole user of
>> the construct is domain_page_map_to_mfn(), which maps only individual
>> pages. (An even better option would be to avoid the recurring walk of
>> the higher levels by using only virt_to_xen_l3e() and then doing the
>> remaining steps "by hand". This would at once allow to avoid the here
>> unwanted / unneeded checking for whether page tables need
>> allocating.)
> 
> The "even better option" looks more promising to me, and is what I want
> to go forward with. At any rate, this fix grows larger than intended
> and I want to send this as an individual patch. Any objections?

Definitely not - separate changes are almost always easier to look
at and faster to get in.

Jan
Hongyan Xia Dec. 7, 2020, 3:28 p.m. UTC | #13
On Mon, 2020-07-27 at 15:21 +0100, Hongyan Xia wrote:
> From: Wei Liu <wei.liu2@citrix.com>
> 
> Rewrite those functions to use the new APIs. Modify its callers to
> unmap
> the pointer returned. Since alloc_xen_pagetable_new() is almost never
> useful unless accompanied by page clearing and a mapping, introduce a
> helper alloc_map_clear_xen_pt() for this sequence.
> 
> Note that the change of virt_to_xen_l1e() also requires vmap_to_mfn()
> to
> unmap the page, which requires domain_page.h header in vmap.
> 
> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
> Reviewed-by: Jan Beulich <jbeulich@suse.com>

I believe the vmap part can be removed since x86 now handles
superpages.

> @@ -5085,8 +5117,8 @@ int map_pages_to_xen(
>      unsigned int flags)
>  {
>      bool locking = system_state > SYS_STATE_boot;
> -    l3_pgentry_t *pl3e, ol3e;
> -    l2_pgentry_t *pl2e, ol2e;
> +    l3_pgentry_t *pl3e = NULL, ol3e;
> +    l2_pgentry_t *pl2e = NULL, ol2e;
>      l1_pgentry_t *pl1e, ol1e;
>      unsigned int  i;
>      int rc = -ENOMEM;
> @@ -5107,6 +5139,10 @@ int map_pages_to_xen(
>  
>      while ( nr_mfns != 0 )
>      {
> +        /* Clean up mappings mapped in the previous iteration. */
> +        UNMAP_DOMAIN_PAGE(pl3e);
> +        UNMAP_DOMAIN_PAGE(pl2e);
> +
>          pl3e = virt_to_xen_l3e(virt);
>  
>          if ( !pl3e )

While rebasing, I found another issue. XSA-345 now locks the L3 table
by L3T_LOCK(virt_to_page(pl3e)) but for this series we cannot call
virt_to_page() here.

We could call domain_page_map_to_mfn() on pl3e to get back the mfn,
which should be fine since this function is rarely used outside boot so
the overhead should be low. We could probably pass an *mfn in as an
additional argument, but do we want to change this also for
virt_to_xen_l[21]e() to be consistent (although they don't need the
mfn)? I might also need to remove the R-b due to this non-trivial
change.

Thoughts?

Hongyan
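
One possible shape for the "pass an *mfn in" variant mentioned above
(purely illustrative; the extra pmfn parameter and its use with
L3T_LOCK() are assumptions, not part of this series):

static l3_pgentry_t *virt_to_xen_l3e(unsigned long v, mfn_t *pmfn /* hypothetical */)
{
    ...                                        /* allocation path as before */
    if ( pmfn )
        *pmfn = l4e_get_mfn(*pl4e);            /* MFN of the L3 table itself */
    return map_l3t_from_l4e(*pl4e) + l3_table_offset(v);
}

A caller such as map_pages_to_xen() could then take the XSA-345 lock
without virt_to_page():

    mfn_t l3mfn;
    l3_pgentry_t *pl3e = virt_to_xen_l3e(virt, &l3mfn);

    if ( !pl3e )
        goto out;
    L3T_LOCK(mfn_to_page(l3mfn));              /* instead of virt_to_page(pl3e) */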

Patch

diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
index b03728e18e..dc8627c1b5 100644
--- a/xen/arch/x86/domain_page.c
+++ b/xen/arch/x86/domain_page.c
@@ -333,21 +333,24 @@  void unmap_domain_page_global(const void *ptr)
 mfn_t domain_page_map_to_mfn(const void *ptr)
 {
     unsigned long va = (unsigned long)ptr;
-    const l1_pgentry_t *pl1e;
+    l1_pgentry_t l1e;
 
     if ( va >= DIRECTMAP_VIRT_START )
         return _mfn(virt_to_mfn(ptr));
 
     if ( va >= VMAP_VIRT_START && va < VMAP_VIRT_END )
     {
-        pl1e = virt_to_xen_l1e(va);
+        const l1_pgentry_t *pl1e = virt_to_xen_l1e(va);
+
         BUG_ON(!pl1e);
+        l1e = *pl1e;
+        unmap_domain_page(pl1e);
     }
     else
     {
         ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END);
-        pl1e = &__linear_l1_table[l1_linear_offset(va)];
+        l1e = __linear_l1_table[l1_linear_offset(va)];
     }
 
-    return l1e_get_mfn(*pl1e);
+    return l1e_get_mfn(l1e);
 }
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 7a11d022c9..fd416c0282 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -4965,8 +4965,28 @@  void free_xen_pagetable_new(mfn_t mfn)
         free_xenheap_page(mfn_to_virt(mfn_x(mfn)));
 }
 
+void *alloc_map_clear_xen_pt(mfn_t *pmfn)
+{
+    mfn_t mfn = alloc_xen_pagetable_new();
+    void *ret;
+
+    if ( mfn_eq(mfn, INVALID_MFN) )
+        return NULL;
+
+    if ( pmfn )
+        *pmfn = mfn;
+    ret = map_domain_page(mfn);
+    clear_page(ret);
+
+    return ret;
+}
+
 static DEFINE_SPINLOCK(map_pgdir_lock);
 
+/*
+ * For virt_to_xen_lXe() functions, they take a linear address and return a
+ * pointer to Xen's LX entry. Caller needs to unmap the pointer.
+ */
 static l3_pgentry_t *virt_to_xen_l3e(unsigned long v)
 {
     l4_pgentry_t *pl4e;
@@ -4975,33 +4995,33 @@  static l3_pgentry_t *virt_to_xen_l3e(unsigned long v)
     if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
     {
         bool locking = system_state > SYS_STATE_boot;
-        l3_pgentry_t *l3t = alloc_xen_pagetable();
+        mfn_t l3mfn;
+        l3_pgentry_t *l3t = alloc_map_clear_xen_pt(&l3mfn);
 
         if ( !l3t )
             return NULL;
-        clear_page(l3t);
+        UNMAP_DOMAIN_PAGE(l3t);
         if ( locking )
             spin_lock(&map_pgdir_lock);
         if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
         {
-            l4_pgentry_t l4e = l4e_from_paddr(__pa(l3t), __PAGE_HYPERVISOR);
+            l4_pgentry_t l4e = l4e_from_mfn(l3mfn, __PAGE_HYPERVISOR);
 
             l4e_write(pl4e, l4e);
             efi_update_l4_pgtable(l4_table_offset(v), l4e);
-            l3t = NULL;
+            l3mfn = INVALID_MFN;
         }
         if ( locking )
             spin_unlock(&map_pgdir_lock);
-        if ( l3t )
-            free_xen_pagetable(l3t);
+        free_xen_pagetable_new(l3mfn);
     }
 
-    return l4e_to_l3e(*pl4e) + l3_table_offset(v);
+    return map_l3t_from_l4e(*pl4e) + l3_table_offset(v);
 }
 
 static l2_pgentry_t *virt_to_xen_l2e(unsigned long v)
 {
-    l3_pgentry_t *pl3e;
+    l3_pgentry_t *pl3e, l3e;
 
     pl3e = virt_to_xen_l3e(v);
     if ( !pl3e )
@@ -5010,31 +5030,37 @@  static l2_pgentry_t *virt_to_xen_l2e(unsigned long v)
     if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
     {
         bool locking = system_state > SYS_STATE_boot;
-        l2_pgentry_t *l2t = alloc_xen_pagetable();
+        mfn_t l2mfn;
+        l2_pgentry_t *l2t = alloc_map_clear_xen_pt(&l2mfn);
 
         if ( !l2t )
+        {
+            unmap_domain_page(pl3e);
             return NULL;
-        clear_page(l2t);
+        }
+        UNMAP_DOMAIN_PAGE(l2t);
         if ( locking )
             spin_lock(&map_pgdir_lock);
         if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
         {
-            l3e_write(pl3e, l3e_from_paddr(__pa(l2t), __PAGE_HYPERVISOR));
-            l2t = NULL;
+            l3e_write(pl3e, l3e_from_mfn(l2mfn, __PAGE_HYPERVISOR));
+            l2mfn = INVALID_MFN;
         }
         if ( locking )
             spin_unlock(&map_pgdir_lock);
-        if ( l2t )
-            free_xen_pagetable(l2t);
+        free_xen_pagetable_new(l2mfn);
     }
 
     BUG_ON(l3e_get_flags(*pl3e) & _PAGE_PSE);
-    return l3e_to_l2e(*pl3e) + l2_table_offset(v);
+    l3e = *pl3e;
+    unmap_domain_page(pl3e);
+
+    return map_l2t_from_l3e(l3e) + l2_table_offset(v);
 }
 
 l1_pgentry_t *virt_to_xen_l1e(unsigned long v)
 {
-    l2_pgentry_t *pl2e;
+    l2_pgentry_t *pl2e, l2e;
 
     pl2e = virt_to_xen_l2e(v);
     if ( !pl2e )
@@ -5043,26 +5069,32 @@  l1_pgentry_t *virt_to_xen_l1e(unsigned long v)
     if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
     {
         bool locking = system_state > SYS_STATE_boot;
-        l1_pgentry_t *l1t = alloc_xen_pagetable();
+        mfn_t l1mfn;
+        l1_pgentry_t *l1t = alloc_map_clear_xen_pt(&l1mfn);
 
         if ( !l1t )
+        {
+            unmap_domain_page(pl2e);
             return NULL;
-        clear_page(l1t);
+        }
+        UNMAP_DOMAIN_PAGE(l1t);
         if ( locking )
             spin_lock(&map_pgdir_lock);
         if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
         {
-            l2e_write(pl2e, l2e_from_paddr(__pa(l1t), __PAGE_HYPERVISOR));
-            l1t = NULL;
+            l2e_write(pl2e, l2e_from_mfn(l1mfn, __PAGE_HYPERVISOR));
+            l1mfn = INVALID_MFN;
         }
         if ( locking )
             spin_unlock(&map_pgdir_lock);
-        if ( l1t )
-            free_xen_pagetable(l1t);
+        free_xen_pagetable_new(l1mfn);
     }
 
     BUG_ON(l2e_get_flags(*pl2e) & _PAGE_PSE);
-    return l2e_to_l1e(*pl2e) + l1_table_offset(v);
+    l2e = *pl2e;
+    unmap_domain_page(pl2e);
+
+    return map_l1t_from_l2e(l2e) + l1_table_offset(v);
 }
 
 /* Convert to from superpage-mapping flags for map_pages_to_xen(). */
@@ -5085,8 +5117,8 @@  int map_pages_to_xen(
     unsigned int flags)
 {
     bool locking = system_state > SYS_STATE_boot;
-    l3_pgentry_t *pl3e, ol3e;
-    l2_pgentry_t *pl2e, ol2e;
+    l3_pgentry_t *pl3e = NULL, ol3e;
+    l2_pgentry_t *pl2e = NULL, ol2e;
     l1_pgentry_t *pl1e, ol1e;
     unsigned int  i;
     int rc = -ENOMEM;
@@ -5107,6 +5139,10 @@  int map_pages_to_xen(
 
     while ( nr_mfns != 0 )
     {
+        /* Clean up mappings mapped in the previous iteration. */
+        UNMAP_DOMAIN_PAGE(pl3e);
+        UNMAP_DOMAIN_PAGE(pl2e);
+
         pl3e = virt_to_xen_l3e(virt);
 
         if ( !pl3e )
@@ -5275,6 +5311,8 @@  int map_pages_to_xen(
                 pl1e = virt_to_xen_l1e(virt);
                 if ( pl1e == NULL )
                     goto out;
+
+                UNMAP_DOMAIN_PAGE(pl1e);
             }
             else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
             {
@@ -5451,6 +5489,8 @@  int map_pages_to_xen(
     rc = 0;
 
  out:
+    unmap_domain_page(pl2e);
+    unmap_domain_page(pl3e);
     return rc;
 }
 
@@ -5474,7 +5514,7 @@  int populate_pt_range(unsigned long virt, unsigned long nr_mfns)
 int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
 {
     bool locking = system_state > SYS_STATE_boot;
-    l3_pgentry_t *pl3e;
+    l3_pgentry_t *pl3e = NULL;
     l2_pgentry_t *pl2e;
     l1_pgentry_t *pl1e;
     unsigned int  i;
@@ -5490,6 +5530,9 @@  int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
 
     while ( v < e )
     {
+        /* Clean up mappings mapped in the previous iteration. */
+        UNMAP_DOMAIN_PAGE(pl3e);
+
         pl3e = virt_to_xen_l3e(v);
 
         if ( !pl3e || !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
@@ -5718,6 +5761,7 @@  int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
     rc = 0;
 
  out:
+    unmap_domain_page(pl3e);
     return rc;
 }
 
diff --git a/xen/common/vmap.c b/xen/common/vmap.c
index faebc1ddf1..9964ab2096 100644
--- a/xen/common/vmap.c
+++ b/xen/common/vmap.c
@@ -1,6 +1,7 @@ 
 #ifdef VMAP_VIRT_START
 #include <xen/bitmap.h>
 #include <xen/cache.h>
+#include <xen/domain_page.h>
 #include <xen/init.h>
 #include <xen/mm.h>
 #include <xen/pfn.h>
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 7e74996053..5b76308948 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -586,6 +586,7 @@  void *alloc_xen_pagetable(void);
 void free_xen_pagetable(void *v);
 mfn_t alloc_xen_pagetable_new(void);
 void free_xen_pagetable_new(mfn_t mfn);
+void *alloc_map_clear_xen_pt(mfn_t *pmfn);
 
 l1_pgentry_t *virt_to_xen_l1e(unsigned long v);
 
diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h
index f632affaef..608a048c28 100644
--- a/xen/include/asm-x86/page.h
+++ b/xen/include/asm-x86/page.h
@@ -291,7 +291,15 @@  void copy_page_sse2(void *, const void *);
 #define pfn_to_paddr(pfn)   __pfn_to_paddr(pfn)
 #define paddr_to_pfn(pa)    __paddr_to_pfn(pa)
 #define paddr_to_pdx(pa)    pfn_to_pdx(paddr_to_pfn(pa))
-#define vmap_to_mfn(va)     _mfn(l1e_get_pfn(*virt_to_xen_l1e((unsigned long)(va))))
+
+#define vmap_to_mfn(va) ({                                                  \
+        const l1_pgentry_t *pl1e_ = virt_to_xen_l1e((unsigned long)(va));   \
+        mfn_t mfn_;                                                         \
+        BUG_ON(!pl1e_);                                                     \
+        mfn_ = l1e_get_mfn(*pl1e_);                                         \
+        unmap_domain_page(pl1e_);                                           \
+        mfn_; })
+
 #define vmap_to_page(va)    mfn_to_page(vmap_to_mfn(va))
 
 #endif /* !defined(__ASSEMBLY__) */