diff mbox

[v7,02/12] x86/mm: add HYPERVISOR_memory_op to acquire guest resources

Message ID 20170918153126.3058-3-paul.durrant@citrix.com (mailing list archive)
State New, archived
Headers show

Commit Message

Paul Durrant Sept. 18, 2017, 3:31 p.m. UTC
Certain memory resources associated with a guest are not necessarily
present in the guest P2M and so are not necessarily available to be
foreign-mapped by a tools domain unless they are inserted, which risks
shattering a super-page mapping.

This patch adds a new memory op to allow such a resource to be priv-mapped
directly, by either a PV or HVM tools domain: grant table frames.

NOTE: Whilst the new op is not intrinsicly specific to the x86 architecture,
      I have no means to test it on an ARM platform and so cannot verify
      that it functions correctly. Hence it is currently only implemented
      for x86.

Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
Acked-by: George Dunlap <george.dunlap@eu.citrix.com>
Reviewed-by: Wei Liu <wei.liu2@citrix.com>
---
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>

v5:
 - Switched __copy_to/from_guest_offset() to copy_to/from_guest_offset().
---
 xen/arch/x86/mm.c             | 111 ++++++++++++++++++++++++++++++++++++++++++
 xen/arch/x86/mm/p2m.c         |   3 +-
 xen/common/grant_table.c      |  56 ++++++++++++++-------
 xen/include/asm-x86/p2m.h     |   3 ++
 xen/include/public/memory.h   |  38 ++++++++++++++-
 xen/include/xen/grant_table.h |   1 +
 6 files changed, 191 insertions(+), 21 deletions(-)

Comments

Jan Beulich Sept. 25, 2017, 1:49 p.m. UTC | #1
>>> On 18.09.17 at 17:31, <paul.durrant@citrix.com> wrote:
> Certain memory resources associated with a guest are not necessarily
> present in the guest P2M and so are not necessarily available to be
> foreign-mapped by a tools domain unless they are inserted, which risks
> shattering a super-page mapping.

For grant tables I can see this as the primary issue, but isn't the
goal of not exposing IOREQ server pages an even more important
aspect, and hence more relevant to mention here?

> NOTE: Whilst the new op is not intrinsicly specific to the x86 architecture,
>       I have no means to test it on an ARM platform and so cannot verify
>       that it functions correctly. Hence it is currently only implemented
>       for x86.

Which will require things to be moved around later on. May I
instead suggest to put it in common code and simply have a
small #ifdef somewhere causing the operation to fail early for
ARM?

> --- a/xen/arch/x86/mm.c
> +++ b/xen/arch/x86/mm.c
> @@ -4768,6 +4768,107 @@ int xenmem_add_to_physmap_one(
>      return rc;
>  }
>  
> +static int xenmem_acquire_grant_table(struct domain *d,

I don't think static functions need a xenmem_ prefix.

> +static int xenmem_acquire_resource(xen_mem_acquire_resource_t *xmar)

const?

> +{
> +    struct domain *d, *currd = current->domain;
> +    unsigned long *mfn_list;
> +    int rc;
> +
> +    if ( xmar->nr_frames == 0 )
> +        return -EINVAL;
> +
> +    d = rcu_lock_domain_by_any_id(xmar->domid);
> +    if ( d == NULL )
> +        return -ESRCH;
> +
> +    rc = xsm_domain_memory_map(XSM_TARGET, d);
> +    if ( rc )
> +        goto out;
> +
> +    mfn_list = xmalloc_array(unsigned long, xmar->nr_frames);

Despite XSA-77 there should not be any new disaggregation related
security risks (here: memory exhaustion and long lasting loops).
Large requests need to be refused or broken up.

> --- a/xen/common/grant_table.c
> +++ b/xen/common/grant_table.c
> @@ -3607,38 +3607,58 @@ int mem_sharing_gref_to_gfn(struct grant_table *gt, grant_ref_t ref,
>  }
>  #endif
>  
> -int gnttab_map_frame(struct domain *d, unsigned long idx, gfn_t gfn,
> -                     mfn_t *mfn)
> -{
> -    int rc = 0;
>  
> -    grant_write_lock(d->grant_table);
> +static mfn_t gnttab_get_frame_locked(struct domain *d, unsigned long idx)
> +{
> +    struct grant_table *gt = d->grant_table;
> +    mfn_t mfn = INVALID_MFN;
>  
> -    if ( d->grant_table->gt_version == 0 )
> -        d->grant_table->gt_version = 1;
> +    if ( gt->gt_version == 0 )
> +        gt->gt_version = 1;
>  
> -    if ( d->grant_table->gt_version == 2 &&
> +    if ( gt->gt_version == 2 &&
>           (idx & XENMAPIDX_grant_table_status) )
>      {
>          idx &= ~XENMAPIDX_grant_table_status;

I don't see the use of this bit mentioned anywhere in the public
header addition.

> +mfn_t gnttab_get_frame(struct domain *d, unsigned long idx)
> +{
> +    mfn_t mfn;
> +
> +    grant_write_lock(d->grant_table);
> +    mfn = gnttab_get_frame_locked(d, idx);
> +    grant_write_unlock(d->grant_table);

Hmm, a read lock is insufficient here only because of the possible
bumping of the version from 0 to 1 afaict, but I don't really see
why what now becomes gnttab_get_frame_locked() does that in
the first place.

> --- a/xen/include/public/memory.h
> +++ b/xen/include/public/memory.h
> @@ -650,7 +650,43 @@ struct xen_vnuma_topology_info {
>  typedef struct xen_vnuma_topology_info xen_vnuma_topology_info_t;
>  DEFINE_XEN_GUEST_HANDLE(xen_vnuma_topology_info_t);
>  
> -/* Next available subop number is 28 */
> +#if defined(__XEN__) || defined(__XEN_TOOLS__)
> +
> +/*
> + * Get the pages for a particular guest resource, so that they can be
> + * mapped directly by a tools domain.
> + */
> +#define XENMEM_acquire_resource 28
> +struct xen_mem_acquire_resource {
> +    /* IN - the domain whose resource is to be mapped */
> +    domid_t domid;
> +    /* IN - the type of resource (defined below) */
> +    uint16_t type;
> +
> +#define XENMEM_resource_grant_table 0
> +
> +    /*
> +     * IN - a type-specific resource identifier, which must be zero
> +     *      unless stated otherwise.
> +     */
> +    uint32_t id;
> +    /* IN - number of (4K) frames of the resource to be mapped */
> +    uint32_t nr_frames;
> +    /* IN - the index of the initial frame to be mapped */
> +    uint64_aligned_t frame;

There are 32 bits of unnamed padding ahead of this field - please
name it and check it's set to zero.

> +    /* IN/OUT - If the tools domain is PV then, upon return, gmfn_list
> +     *          will be populated with the MFNs of the resource.
> +     *          If the tools domain is HVM then it is expected that, on

s/tools domain/calling domain/ (twice)?

> +     *          entry, gmfn_list will be populated with a list of GFNs

s/will be/is/ to further emphasize the input vs output difference?

> +     *          that will be mapped to the MFNs of the resource.
> +     */
> +    XEN_GUEST_HANDLE(xen_pfn_t) gmfn_list;

What about a 32-bit x86 tool stack domain as caller here? I
don't think you want to limit it to just the low 16Tb? Nor are
you adding a compat translation layer.

> +};
> +typedef struct xen_mem_acquire_resource xen_mem_acquire_resource_t;
> +
> +#endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */

Also please group this with the other similar section, without
introducing a second identical #if. After all we have ...

> +/* Next available subop number is 29 */

... this to eliminate the need to have things numerically sorted in
this file.

Jan
Jan Beulich Sept. 25, 2017, 2:23 p.m. UTC | #2
>>> On 18.09.17 at 17:31, <paul.durrant@citrix.com> wrote:
> Certain memory resources associated with a guest are not necessarily
> present in the guest P2M and so are not necessarily available to be
> foreign-mapped by a tools domain unless they are inserted, which risks
> shattering a super-page mapping.

Btw., I'm additionally having trouble seeing this shattering of a
superpage: For one, xc_core_arch_get_scratch_gpfn() could be
a little less simplistic. And then even with the currently chosen
value (outside of the range of valid GFNs at that point in time)
there shouldn't be a larger page to be shattered, as there should
be no mapping at all at that index. But perhaps I'm just blind and
don't see the obvious ...

Jan
Paul Durrant Sept. 25, 2017, 2:53 p.m. UTC | #3
> -----Original Message-----

> From: Xen-devel [mailto:xen-devel-bounces@lists.xen.org] On Behalf Of Jan

> Beulich

> Sent: 25 September 2017 14:50

> To: Paul Durrant <Paul.Durrant@citrix.com>

> Cc: Andrew Cooper <Andrew.Cooper3@citrix.com>; xen-

> devel@lists.xenproject.org

> Subject: Re: [Xen-devel] [PATCH v7 02/12] x86/mm: add

> HYPERVISOR_memory_op to acquire guest resources

> 

> >>> On 18.09.17 at 17:31, <paul.durrant@citrix.com> wrote:

> > Certain memory resources associated with a guest are not necessarily

> > present in the guest P2M and so are not necessarily available to be

> > foreign-mapped by a tools domain unless they are inserted, which risks

> > shattering a super-page mapping.

> 

> For grant tables I can see this as the primary issue, but isn't the

> goal of not exposing IOREQ server pages an even more important

> aspect, and hence more relevant to mention here?

> 

> > NOTE: Whilst the new op is not intrinsicly specific to the x86 architecture,

> >       I have no means to test it on an ARM platform and so cannot verify

> >       that it functions correctly. Hence it is currently only implemented

> >       for x86.

> 

> Which will require things to be moved around later on. May I

> instead suggest to put it in common code and simply have a

> small #ifdef somewhere causing the operation to fail early for

> ARM?


Ok. If you prefer that approach then I'll do that.

> 

> > --- a/xen/arch/x86/mm.c

> > +++ b/xen/arch/x86/mm.c

> > @@ -4768,6 +4768,107 @@ int xenmem_add_to_physmap_one(

> >      return rc;

> >  }

> >

> > +static int xenmem_acquire_grant_table(struct domain *d,

> 

> I don't think static functions need a xenmem_ prefix.

> 


Ok.

> > +static int xenmem_acquire_resource(xen_mem_acquire_resource_t

> *xmar)

> 

> const?


Yes.

> 

> > +{

> > +    struct domain *d, *currd = current->domain;

> > +    unsigned long *mfn_list;

> > +    int rc;

> > +

> > +    if ( xmar->nr_frames == 0 )

> > +        return -EINVAL;

> > +

> > +    d = rcu_lock_domain_by_any_id(xmar->domid);

> > +    if ( d == NULL )

> > +        return -ESRCH;

> > +

> > +    rc = xsm_domain_memory_map(XSM_TARGET, d);

> > +    if ( rc )

> > +        goto out;

> > +

> > +    mfn_list = xmalloc_array(unsigned long, xmar->nr_frames);

> 

> Despite XSA-77 there should not be any new disaggregation related

> security risks (here: memory exhaustion and long lasting loops).

> Large requests need to be refused or broken up.


Ok, I'll put an upper limit on frames.

> 

> > --- a/xen/common/grant_table.c

> > +++ b/xen/common/grant_table.c

> > @@ -3607,38 +3607,58 @@ int mem_sharing_gref_to_gfn(struct

> grant_table *gt, grant_ref_t ref,

> >  }

> >  #endif

> >

> > -int gnttab_map_frame(struct domain *d, unsigned long idx, gfn_t gfn,

> > -                     mfn_t *mfn)

> > -{

> > -    int rc = 0;

> >

> > -    grant_write_lock(d->grant_table);

> > +static mfn_t gnttab_get_frame_locked(struct domain *d, unsigned long

> idx)

> > +{

> > +    struct grant_table *gt = d->grant_table;

> > +    mfn_t mfn = INVALID_MFN;

> >

> > -    if ( d->grant_table->gt_version == 0 )

> > -        d->grant_table->gt_version = 1;

> > +    if ( gt->gt_version == 0 )

> > +        gt->gt_version = 1;

> >

> > -    if ( d->grant_table->gt_version == 2 &&

> > +    if ( gt->gt_version == 2 &&

> >           (idx & XENMAPIDX_grant_table_status) )

> >      {

> >          idx &= ~XENMAPIDX_grant_table_status;

> 

> I don't see the use of this bit mentioned anywhere in the public

> header addition.


Good point, I do need to mention v2 now that Juergen has revived it.

> 

> > +mfn_t gnttab_get_frame(struct domain *d, unsigned long idx)

> > +{

> > +    mfn_t mfn;

> > +

> > +    grant_write_lock(d->grant_table);

> > +    mfn = gnttab_get_frame_locked(d, idx);

> > +    grant_write_unlock(d->grant_table);

> 

> Hmm, a read lock is insufficient here only because of the possible

> bumping of the version from 0 to 1 afaict, but I don't really see

> why what now becomes gnttab_get_frame_locked() does that in

> the first place.


It may be the case that, after Juergen's recent re-arrangements, that the version will always be set by this point. I'll check and drop to and ASSERT on the version and read-lock if I can.

> 

> > --- a/xen/include/public/memory.h

> > +++ b/xen/include/public/memory.h

> > @@ -650,7 +650,43 @@ struct xen_vnuma_topology_info {

> >  typedef struct xen_vnuma_topology_info xen_vnuma_topology_info_t;

> >  DEFINE_XEN_GUEST_HANDLE(xen_vnuma_topology_info_t);

> >

> > -/* Next available subop number is 28 */

> > +#if defined(__XEN__) || defined(__XEN_TOOLS__)

> > +

> > +/*

> > + * Get the pages for a particular guest resource, so that they can be

> > + * mapped directly by a tools domain.

> > + */

> > +#define XENMEM_acquire_resource 28

> > +struct xen_mem_acquire_resource {

> > +    /* IN - the domain whose resource is to be mapped */

> > +    domid_t domid;

> > +    /* IN - the type of resource (defined below) */

> > +    uint16_t type;

> > +

> > +#define XENMEM_resource_grant_table 0

> > +

> > +    /*

> > +     * IN - a type-specific resource identifier, which must be zero

> > +     *      unless stated otherwise.

> > +     */

> > +    uint32_t id;

> > +    /* IN - number of (4K) frames of the resource to be mapped */

> > +    uint32_t nr_frames;

> > +    /* IN - the index of the initial frame to be mapped */

> > +    uint64_aligned_t frame;

> 

> There are 32 bits of unnamed padding ahead of this field - please

> name it and check it's set to zero.


Ah yes, for some reason the #define above had made me think things were aligned at this point.

> 

> > +    /* IN/OUT - If the tools domain is PV then, upon return, gmfn_list

> > +     *          will be populated with the MFNs of the resource.

> > +     *          If the tools domain is HVM then it is expected that, on

> 

> s/tools domain/calling domain/ (twice)?


Ok.

> 

> > +     *          entry, gmfn_list will be populated with a list of GFNs

> 

> s/will be/is/ to further emphasize the input vs output difference?


Ok.

> 

> > +     *          that will be mapped to the MFNs of the resource.

> > +     */

> > +    XEN_GUEST_HANDLE(xen_pfn_t) gmfn_list;

> 

> What about a 32-bit x86 tool stack domain as caller here? I

> don't think you want to limit it to just the low 16Tb? Nor are

> you adding a compat translation layer.


Ok. Good point, even though somewhat unlikely.

> 

> > +};

> > +typedef struct xen_mem_acquire_resource

> xen_mem_acquire_resource_t;

> > +

> > +#endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */

> 

> Also please group this with the other similar section, without

> introducing a second identical #if. After all we have ...

> 

> > +/* Next available subop number is 29 */

> 

> ... this to eliminate the need to have things numerically sorted in

> this file.


Ok.

  Paul

> 

> Jan

> 

> _______________________________________________

> Xen-devel mailing list

> Xen-devel@lists.xen.org

> https://lists.xen.org/xen-devel
Paul Durrant Sept. 25, 2017, 3 p.m. UTC | #4
> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: 25 September 2017 15:23
> To: Paul Durrant <Paul.Durrant@citrix.com>
> Cc: Andrew Cooper <Andrew.Cooper3@citrix.com>; xen-
> devel@lists.xenproject.org
> Subject: Re: [PATCH v7 02/12] x86/mm: add HYPERVISOR_memory_op to
> acquire guest resources
> 
> >>> On 18.09.17 at 17:31, <paul.durrant@citrix.com> wrote:
> > Certain memory resources associated with a guest are not necessarily
> > present in the guest P2M and so are not necessarily available to be
> > foreign-mapped by a tools domain unless they are inserted, which risks
> > shattering a super-page mapping.
> 
> Btw., I'm additionally having trouble seeing this shattering of a
> superpage: For one, xc_core_arch_get_scratch_gpfn() could be
> a little less simplistic. And then even with the currently chosen
> value (outside of the range of valid GFNs at that point in time)
> there shouldn't be a larger page to be shattered, as there should
> be no mapping at all at that index. But perhaps I'm just blind and
> don't see the obvious ...

The shattering was Andrew's observation. Andrew, can you comment?

Even if it's not the case, it's suboptimal to have to write-lock and update the guest's P2M twice just to map a page of grants, which I will mention in the commit comment too.

  Paul

> 
> Jan
Paul Durrant Sept. 26, 2017, 12:20 p.m. UTC | #5
> -----Original Message-----

> From: Xen-devel [mailto:xen-devel-bounces@lists.xen.org] On Behalf Of

> Paul Durrant

> Sent: 25 September 2017 16:00

> To: 'Jan Beulich' <JBeulich@suse.com>

> Cc: Andrew Cooper <Andrew.Cooper3@citrix.com>; xen-

> devel@lists.xenproject.org

> Subject: Re: [Xen-devel] [PATCH v7 02/12] x86/mm: add

> HYPERVISOR_memory_op to acquire guest resources

> 

> > -----Original Message-----

> > From: Jan Beulich [mailto:JBeulich@suse.com]

> > Sent: 25 September 2017 15:23

> > To: Paul Durrant <Paul.Durrant@citrix.com>

> > Cc: Andrew Cooper <Andrew.Cooper3@citrix.com>; xen-

> > devel@lists.xenproject.org

> > Subject: Re: [PATCH v7 02/12] x86/mm: add HYPERVISOR_memory_op to

> > acquire guest resources

> >

> > >>> On 18.09.17 at 17:31, <paul.durrant@citrix.com> wrote:

> > > Certain memory resources associated with a guest are not necessarily

> > > present in the guest P2M and so are not necessarily available to be

> > > foreign-mapped by a tools domain unless they are inserted, which risks

> > > shattering a super-page mapping.

> >

> > Btw., I'm additionally having trouble seeing this shattering of a

> > superpage: For one, xc_core_arch_get_scratch_gpfn() could be

> > a little less simplistic. And then even with the currently chosen

> > value (outside of the range of valid GFNs at that point in time)

> > there shouldn't be a larger page to be shattered, as there should

> > be no mapping at all at that index. But perhaps I'm just blind and

> > don't see the obvious ...

> 

> The shattering was Andrew's observation. Andrew, can you comment?

> 


Andrew commented verbally on this. It's not actually a shattering as such... The issue, apparently, is that adding the 4k grant table frame into the guest p2m will potentially cause creation of all layers of page table but removing it again will only remove the L1 entry. Thus it is no longer possible to use a superpage for that mapping at any point subsequently.

  Paul
Jan Beulich Sept. 26, 2017, 12:35 p.m. UTC | #6
>>> On 26.09.17 at 14:20, <Paul.Durrant@citrix.com> wrote:
>>  -----Original Message-----
>> From: Xen-devel [mailto:xen-devel-bounces@lists.xen.org] On Behalf Of
>> Paul Durrant
>> Sent: 25 September 2017 16:00
>> To: 'Jan Beulich' <JBeulich@suse.com>
>> Cc: Andrew Cooper <Andrew.Cooper3@citrix.com>; xen-
>> devel@lists.xenproject.org 
>> Subject: Re: [Xen-devel] [PATCH v7 02/12] x86/mm: add
>> HYPERVISOR_memory_op to acquire guest resources
>> 
>> > -----Original Message-----
>> > From: Jan Beulich [mailto:JBeulich@suse.com]
>> > Sent: 25 September 2017 15:23
>> > To: Paul Durrant <Paul.Durrant@citrix.com>
>> > Cc: Andrew Cooper <Andrew.Cooper3@citrix.com>; xen-
>> > devel@lists.xenproject.org 
>> > Subject: Re: [PATCH v7 02/12] x86/mm: add HYPERVISOR_memory_op to
>> > acquire guest resources
>> >
>> > >>> On 18.09.17 at 17:31, <paul.durrant@citrix.com> wrote:
>> > > Certain memory resources associated with a guest are not necessarily
>> > > present in the guest P2M and so are not necessarily available to be
>> > > foreign-mapped by a tools domain unless they are inserted, which risks
>> > > shattering a super-page mapping.
>> >
>> > Btw., I'm additionally having trouble seeing this shattering of a
>> > superpage: For one, xc_core_arch_get_scratch_gpfn() could be
>> > a little less simplistic. And then even with the currently chosen
>> > value (outside of the range of valid GFNs at that point in time)
>> > there shouldn't be a larger page to be shattered, as there should
>> > be no mapping at all at that index. But perhaps I'm just blind and
>> > don't see the obvious ...
>> 
>> The shattering was Andrew's observation. Andrew, can you comment?
>> 
> 
> Andrew commented verbally on this. It's not actually a shattering as such... 
> The issue, apparently, is that adding the 4k grant table frame into the guest 
> p2m will potentially cause creation of all layers of page table but removing 
> it again will only remove the L1 entry. Thus it is no longer possible to use 
> a superpage for that mapping at any point subsequently.

First of all - what would cause a mapping to appear at that slot (or in
a range covering that slot). And then, while re-combining contiguous
mappings indeed doesn't exist right now, replacing a non-leaf entry
(page table) with a large page is very well supported (see e.g.
ept_set_entry(), which even has a comment to that effect). Hence
I continue to be confused why we need a new mechanism for
seeding the grant tables.

Jan
Paul Durrant Sept. 26, 2017, 12:49 p.m. UTC | #7
> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: 26 September 2017 13:35
> To: Andrew Cooper <Andrew.Cooper3@citrix.com>; Paul Durrant
> <Paul.Durrant@citrix.com>
> Cc: xen-devel@lists.xenproject.org
> Subject: RE: [PATCH v7 02/12] x86/mm: add HYPERVISOR_memory_op to
> acquire guest resources
> 
> >>> On 26.09.17 at 14:20, <Paul.Durrant@citrix.com> wrote:
> >>  -----Original Message-----
> >> From: Xen-devel [mailto:xen-devel-bounces@lists.xen.org] On Behalf Of
> >> Paul Durrant
> >> Sent: 25 September 2017 16:00
> >> To: 'Jan Beulich' <JBeulich@suse.com>
> >> Cc: Andrew Cooper <Andrew.Cooper3@citrix.com>; xen-
> >> devel@lists.xenproject.org
> >> Subject: Re: [Xen-devel] [PATCH v7 02/12] x86/mm: add
> >> HYPERVISOR_memory_op to acquire guest resources
> >>
> >> > -----Original Message-----
> >> > From: Jan Beulich [mailto:JBeulich@suse.com]
> >> > Sent: 25 September 2017 15:23
> >> > To: Paul Durrant <Paul.Durrant@citrix.com>
> >> > Cc: Andrew Cooper <Andrew.Cooper3@citrix.com>; xen-
> >> > devel@lists.xenproject.org
> >> > Subject: Re: [PATCH v7 02/12] x86/mm: add HYPERVISOR_memory_op
> to
> >> > acquire guest resources
> >> >
> >> > >>> On 18.09.17 at 17:31, <paul.durrant@citrix.com> wrote:
> >> > > Certain memory resources associated with a guest are not necessarily
> >> > > present in the guest P2M and so are not necessarily available to be
> >> > > foreign-mapped by a tools domain unless they are inserted, which
> risks
> >> > > shattering a super-page mapping.
> >> >
> >> > Btw., I'm additionally having trouble seeing this shattering of a
> >> > superpage: For one, xc_core_arch_get_scratch_gpfn() could be
> >> > a little less simplistic. And then even with the currently chosen
> >> > value (outside of the range of valid GFNs at that point in time)
> >> > there shouldn't be a larger page to be shattered, as there should
> >> > be no mapping at all at that index. But perhaps I'm just blind and
> >> > don't see the obvious ...
> >>
> >> The shattering was Andrew's observation. Andrew, can you comment?
> >>
> >
> > Andrew commented verbally on this. It's not actually a shattering as such...
> > The issue, apparently, is that adding the 4k grant table frame into the guest
> > p2m will potentially cause creation of all layers of page table but removing
> > it again will only remove the L1 entry. Thus it is no longer possible to use
> > a superpage for that mapping at any point subsequently.
> 
> First of all - what would cause a mapping to appear at that slot (or in
> a range covering that slot). And then, while re-combining contiguous
> mappings indeed doesn't exist right now, replacing a non-leaf entry
> (page table) with a large page is very well supported (see e.g.
> ept_set_entry(), which even has a comment to that effect). Hence
> I continue to be confused why we need a new mechanism for
> seeding the grant tables.

I'll have to defer to Andrew to answer at this point.

  Paul

> 
> Jan
Andrew Cooper Sept. 27, 2017, 11:34 a.m. UTC | #8
On 26/09/17 13:49, Paul Durrant wrote:
>> -----Original Message-----
>> From: Jan Beulich [mailto:JBeulich@suse.com]
>> Sent: 26 September 2017 13:35
>> To: Andrew Cooper <Andrew.Cooper3@citrix.com>; Paul Durrant
>> <Paul.Durrant@citrix.com>
>> Cc: xen-devel@lists.xenproject.org
>> Subject: RE: [PATCH v7 02/12] x86/mm: add HYPERVISOR_memory_op to
>> acquire guest resources
>>
>>>>> On 26.09.17 at 14:20, <Paul.Durrant@citrix.com> wrote:
>>>>  -----Original Message-----
>>>> From: Xen-devel [mailto:xen-devel-bounces@lists.xen.org] On Behalf Of
>>>> Paul Durrant
>>>> Sent: 25 September 2017 16:00
>>>> To: 'Jan Beulich' <JBeulich@suse.com>
>>>> Cc: Andrew Cooper <Andrew.Cooper3@citrix.com>; xen-
>>>> devel@lists.xenproject.org
>>>> Subject: Re: [Xen-devel] [PATCH v7 02/12] x86/mm: add
>>>> HYPERVISOR_memory_op to acquire guest resources
>>>>
>>>>> -----Original Message-----
>>>>> From: Jan Beulich [mailto:JBeulich@suse.com]
>>>>> Sent: 25 September 2017 15:23
>>>>> To: Paul Durrant <Paul.Durrant@citrix.com>
>>>>> Cc: Andrew Cooper <Andrew.Cooper3@citrix.com>; xen-
>>>>> devel@lists.xenproject.org
>>>>> Subject: Re: [PATCH v7 02/12] x86/mm: add HYPERVISOR_memory_op
>> to
>>>>> acquire guest resources
>>>>>
>>>>>>>> On 18.09.17 at 17:31, <paul.durrant@citrix.com> wrote:
>>>>>> Certain memory resources associated with a guest are not necessarily
>>>>>> present in the guest P2M and so are not necessarily available to be
>>>>>> foreign-mapped by a tools domain unless they are inserted, which
>> risks
>>>>>> shattering a super-page mapping.
>>>>> Btw., I'm additionally having trouble seeing this shattering of a
>>>>> superpage: For one, xc_core_arch_get_scratch_gpfn() could be
>>>>> a little less simplistic. And then even with the currently chosen
>>>>> value (outside of the range of valid GFNs at that point in time)
>>>>> there shouldn't be a larger page to be shattered, as there should
>>>>> be no mapping at all at that index. But perhaps I'm just blind and
>>>>> don't see the obvious ...
>>>> The shattering was Andrew's observation. Andrew, can you comment?
>>>>
>>> Andrew commented verbally on this. It's not actually a shattering as such...
>>> The issue, apparently, is that adding the 4k grant table frame into the guest
>>> p2m will potentially cause creation of all layers of page table but removing
>>> it again will only remove the L1 entry. Thus it is no longer possible to use
>>> a superpage for that mapping at any point subsequently.
>> First of all - what would cause a mapping to appear at that slot (or in
>> a range covering that slot).

???

At the moment, the toolstack's *only* way of editing the grant table of
an HVM guest is to add it into the p2m, map the gfn, write two values,
and unmap it.  That is how a 4k mapping gets added, which forces an
allocation or shattering to cause a L1 table to exist.

This is a kludge and is architecturally unclean.

>>  And then, while re-combining contiguous
>> mappings indeed doesn't exist right now, replacing a non-leaf entry
>> (page table) with a large page is very well supported (see e.g.
>> ept_set_entry(), which even has a comment to that effect).

I don't see anything equivalent in the NPT or IOMMU logic.

>>  Hence
>> I continue to be confused why we need a new mechanism for
>> seeding the grant tables.
> I'll have to defer to Andrew to answer at this point.

Joao's improvements for network transmit require a trusted backend to be
able to map the full grant table.

~Andrew
Jan Beulich Sept. 27, 2017, 12:56 p.m. UTC | #9
>>> On 27.09.17 at 13:34, <andrew.cooper3@citrix.com> wrote:
> On 26/09/17 13:49, Paul Durrant wrote:
>>> -----Original Message-----
>>> From: Jan Beulich [mailto:JBeulich@suse.com]
>>> Sent: 26 September 2017 13:35
>>> To: Andrew Cooper <Andrew.Cooper3@citrix.com>; Paul Durrant
>>> <Paul.Durrant@citrix.com>
>>> Cc: xen-devel@lists.xenproject.org 
>>> Subject: RE: [PATCH v7 02/12] x86/mm: add HYPERVISOR_memory_op to
>>> acquire guest resources
>>>
>>>>>> On 26.09.17 at 14:20, <Paul.Durrant@citrix.com> wrote:
>>>>>  -----Original Message-----
>>>>> From: Xen-devel [mailto:xen-devel-bounces@lists.xen.org] On Behalf Of
>>>>> Paul Durrant
>>>>> Sent: 25 September 2017 16:00
>>>>> To: 'Jan Beulich' <JBeulich@suse.com>
>>>>> Cc: Andrew Cooper <Andrew.Cooper3@citrix.com>; xen-
>>>>> devel@lists.xenproject.org 
>>>>> Subject: Re: [Xen-devel] [PATCH v7 02/12] x86/mm: add
>>>>> HYPERVISOR_memory_op to acquire guest resources
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Jan Beulich [mailto:JBeulich@suse.com]
>>>>>> Sent: 25 September 2017 15:23
>>>>>> To: Paul Durrant <Paul.Durrant@citrix.com>
>>>>>> Cc: Andrew Cooper <Andrew.Cooper3@citrix.com>; xen-
>>>>>> devel@lists.xenproject.org 
>>>>>> Subject: Re: [PATCH v7 02/12] x86/mm: add HYPERVISOR_memory_op
>>> to
>>>>>> acquire guest resources
>>>>>>
>>>>>>>>> On 18.09.17 at 17:31, <paul.durrant@citrix.com> wrote:
>>>>>>> Certain memory resources associated with a guest are not necessarily
>>>>>>> present in the guest P2M and so are not necessarily available to be
>>>>>>> foreign-mapped by a tools domain unless they are inserted, which
>>> risks
>>>>>>> shattering a super-page mapping.
>>>>>> Btw., I'm additionally having trouble seeing this shattering of a
>>>>>> superpage: For one, xc_core_arch_get_scratch_gpfn() could be
>>>>>> a little less simplistic. And then even with the currently chosen
>>>>>> value (outside of the range of valid GFNs at that point in time)
>>>>>> there shouldn't be a larger page to be shattered, as there should
>>>>>> be no mapping at all at that index. But perhaps I'm just blind and
>>>>>> don't see the obvious ...
>>>>> The shattering was Andrew's observation. Andrew, can you comment?
>>>>>
>>>> Andrew commented verbally on this. It's not actually a shattering as such...
>>>> The issue, apparently, is that adding the 4k grant table frame into the guest
>>>> p2m will potentially cause creation of all layers of page table but removing
>>>> it again will only remove the L1 entry. Thus it is no longer possible to use
>>>> a superpage for that mapping at any point subsequently.
>>> First of all - what would cause a mapping to appear at that slot (or in
>>> a range covering that slot).
> 
> ???
> 
> At the moment, the toolstack's *only* way of editing the grant table of
> an HVM guest is to add it into the p2m, map the gfn, write two values,
> and unmap it.  That is how a 4k mapping gets added, which forces an
> allocation or shattering to cause a L1 table to exist.
> 
> This is a kludge and is architecturally unclean.

Well, if the grant table related parts of series here was presented
to be simply cleaning up a kludge, I'd probably be fine. But so far
it has been claimed that there are other bad effects, besides this
just being (as I would call it) sub-optimal.

>>>  And then, while re-combining contiguous
>>> mappings indeed doesn't exist right now, replacing a non-leaf entry
>>> (page table) with a large page is very well supported (see e.g.
>>> ept_set_entry(), which even has a comment to that effect).
> 
> I don't see anything equivalent in the NPT or IOMMU logic.

Look for intermediate_entry in p2m_pt_set_entry(). In AMD
IOMMU code see iommu_merge_pages(). For VT-d I agree.

Jan
diff mbox

Patch

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index cb0189efae..c8f50f3bb0 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -4768,6 +4768,107 @@  int xenmem_add_to_physmap_one(
     return rc;
 }
 
+static int xenmem_acquire_grant_table(struct domain *d,
+                                      unsigned long frame,
+                                      unsigned long nr_frames,
+                                      unsigned long mfn_list[])
+{
+    unsigned int i;
+
+    /*
+     * Iterate through the list backwards so that gnttab_get_frame() is
+     * first called for the highest numbered frame. This means that the
+     * out-of-bounds check will be done on the first iteration and, if
+     * the table needs to grow, it will only grow once.
+     */
+    i = nr_frames;
+    while ( i-- != 0 )
+    {
+        mfn_t mfn = gnttab_get_frame(d, frame + i);
+
+        if ( mfn_eq(mfn, INVALID_MFN) )
+            return -EINVAL;
+
+        mfn_list[i] = mfn_x(mfn);
+    }
+
+    return 0;
+}
+
+static int xenmem_acquire_resource(xen_mem_acquire_resource_t *xmar)
+{
+    struct domain *d, *currd = current->domain;
+    unsigned long *mfn_list;
+    int rc;
+
+    if ( xmar->nr_frames == 0 )
+        return -EINVAL;
+
+    d = rcu_lock_domain_by_any_id(xmar->domid);
+    if ( d == NULL )
+        return -ESRCH;
+
+    rc = xsm_domain_memory_map(XSM_TARGET, d);
+    if ( rc )
+        goto out;
+
+    mfn_list = xmalloc_array(unsigned long, xmar->nr_frames);
+
+    rc = -ENOMEM;
+    if ( !mfn_list )
+        goto out;
+
+    switch ( xmar->type )
+    {
+    case XENMEM_resource_grant_table:
+        rc = -EINVAL;
+        if ( xmar->id ) /* must be zero for grant_table */
+            break;
+
+        rc = xenmem_acquire_grant_table(d, xmar->frame, xmar->nr_frames,
+                                        mfn_list);
+        break;
+
+    default:
+        rc = -EOPNOTSUPP;
+        break;
+    }
+
+    if ( rc )
+        goto free_and_out;
+
+    if ( !paging_mode_translate(currd) )
+    {
+        if ( copy_to_guest_offset(xmar->gmfn_list, 0, mfn_list,
+                                  xmar->nr_frames) )
+            rc = -EFAULT;
+    }
+    else
+    {
+        unsigned int i;
+
+        for ( i = 0; i < xmar->nr_frames; i++ )
+        {
+            xen_pfn_t gfn;
+
+            rc = -EFAULT;
+            if ( copy_from_guest_offset(&gfn, xmar->gmfn_list, i, 1) )
+                goto free_and_out;
+
+            rc = set_foreign_p2m_entry(currd, gfn, _mfn(mfn_list[i]));
+            if ( rc )
+                goto free_and_out;
+        }
+    }
+
+ free_and_out:
+    xfree(mfn_list);
+
+ out:
+    rcu_unlock_domain(d);
+    return rc;
+}
+
 long arch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
 {
     int rc;
@@ -4990,6 +5091,16 @@  long arch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
         return rc;
     }
 
+    case XENMEM_acquire_resource:
+    {
+        xen_mem_acquire_resource_t xmar;
+
+        if ( copy_from_guest(&xmar, arg, 1) )
+            return -EFAULT;
+
+        return xenmem_acquire_resource(&xmar);
+    }
+
     default:
         return subarch_memory_op(cmd, arg);
     }
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index 0b479105b9..d0f8fc249b 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -1121,8 +1121,7 @@  static int set_typed_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
 }
 
 /* Set foreign mfn in the given guest's p2m table. */
-static int set_foreign_p2m_entry(struct domain *d, unsigned long gfn,
-                                 mfn_t mfn)
+int set_foreign_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
 {
     return set_typed_p2m_entry(d, gfn, mfn, PAGE_ORDER_4K, p2m_map_foreign,
                                p2m_get_hostp2m(d)->default_access);
diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
index 9a4d335ee0..dfd00a9432 100644
--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -3607,38 +3607,58 @@  int mem_sharing_gref_to_gfn(struct grant_table *gt, grant_ref_t ref,
 }
 #endif
 
-int gnttab_map_frame(struct domain *d, unsigned long idx, gfn_t gfn,
-                     mfn_t *mfn)
-{
-    int rc = 0;
 
-    grant_write_lock(d->grant_table);
+static mfn_t gnttab_get_frame_locked(struct domain *d, unsigned long idx)
+{
+    struct grant_table *gt = d->grant_table;
+    mfn_t mfn = INVALID_MFN;
 
-    if ( d->grant_table->gt_version == 0 )
-        d->grant_table->gt_version = 1;
+    if ( gt->gt_version == 0 )
+        gt->gt_version = 1;
 
-    if ( d->grant_table->gt_version == 2 &&
+    if ( gt->gt_version == 2 &&
          (idx & XENMAPIDX_grant_table_status) )
     {
         idx &= ~XENMAPIDX_grant_table_status;
-        if ( idx < nr_status_frames(d->grant_table) )
-            *mfn = _mfn(virt_to_mfn(d->grant_table->status[idx]));
-        else
-            rc = -EINVAL;
+        if ( idx < nr_status_frames(gt) )
+            mfn = _mfn(virt_to_mfn(gt->status[idx]));
     }
     else
     {
-        if ( (idx >= nr_grant_frames(d->grant_table)) &&
+        if ( (idx >= nr_grant_frames(gt)) &&
              (idx < max_grant_frames) )
             gnttab_grow_table(d, idx + 1);
 
-        if ( idx < nr_grant_frames(d->grant_table) )
-            *mfn = _mfn(virt_to_mfn(d->grant_table->shared_raw[idx]));
-        else
-            rc = -EINVAL;
+        if ( idx < nr_grant_frames(gt) )
+            mfn = _mfn(virt_to_mfn(gt->shared_raw[idx]));
     }
 
-    gnttab_set_frame_gfn(d, idx, gfn);
+    return mfn;
+}
+
+mfn_t gnttab_get_frame(struct domain *d, unsigned long idx)
+{
+    mfn_t mfn;
+
+    grant_write_lock(d->grant_table);
+    mfn = gnttab_get_frame_locked(d, idx);
+    grant_write_unlock(d->grant_table);
+
+    return mfn;
+}
+
+int gnttab_map_frame(struct domain *d, unsigned long idx, gfn_t gfn,
+                     mfn_t *mfn)
+{
+    int rc = 0;
+
+    grant_write_lock(d->grant_table);
+
+    *mfn = gnttab_get_frame_locked(d, idx);
+    if ( mfn_eq(*mfn, INVALID_MFN) )
+        rc = -EINVAL;
+    else
+        gnttab_set_frame_gfn(d, idx, gfn);
 
     grant_write_unlock(d->grant_table);
 
diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h
index 10cdfc09a9..4eff0458bc 100644
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -613,6 +613,9 @@  void p2m_memory_type_changed(struct domain *d);
 int p2m_is_logdirty_range(struct p2m_domain *, unsigned long start,
                           unsigned long end);
 
+/* Set foreign entry in the p2m table (for priv-mapping) */
+int set_foreign_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn);
+
 /* Set mmio addresses in the p2m table (for pass-through) */
 int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
                        unsigned int order, p2m_access_t access);
diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
index 29386df98b..9bf58e7384 100644
--- a/xen/include/public/memory.h
+++ b/xen/include/public/memory.h
@@ -650,7 +650,43 @@  struct xen_vnuma_topology_info {
 typedef struct xen_vnuma_topology_info xen_vnuma_topology_info_t;
 DEFINE_XEN_GUEST_HANDLE(xen_vnuma_topology_info_t);
 
-/* Next available subop number is 28 */
+#if defined(__XEN__) || defined(__XEN_TOOLS__)
+
+/*
+ * Get the pages for a particular guest resource, so that they can be
+ * mapped directly by a tools domain.
+ */
+#define XENMEM_acquire_resource 28
+struct xen_mem_acquire_resource {
+    /* IN - the domain whose resource is to be mapped */
+    domid_t domid;
+    /* IN - the type of resource (defined below) */
+    uint16_t type;
+
+#define XENMEM_resource_grant_table 0
+
+    /*
+     * IN - a type-specific resource identifier, which must be zero
+     *      unless stated otherwise.
+     */
+    uint32_t id;
+    /* IN - number of (4K) frames of the resource to be mapped */
+    uint32_t nr_frames;
+    /* IN - the index of the initial frame to be mapped */
+    uint64_aligned_t frame;
+    /* IN/OUT - If the tools domain is PV then, upon return, gmfn_list
+     *          will be populated with the MFNs of the resource.
+     *          If the tools domain is HVM then it is expected that, on
+     *          entry, gmfn_list will be populated with a list of GFNs
+     *          that will be mapped to the MFNs of the resource.
+     */
+    XEN_GUEST_HANDLE(xen_pfn_t) gmfn_list;
+};
+typedef struct xen_mem_acquire_resource xen_mem_acquire_resource_t;
+
+#endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */
+
+/* Next available subop number is 29 */
 
 #endif /* __XEN_PUBLIC_MEMORY_H__ */
 
diff --git a/xen/include/xen/grant_table.h b/xen/include/xen/grant_table.h
index 43ec6c4d80..f9e89375bb 100644
--- a/xen/include/xen/grant_table.h
+++ b/xen/include/xen/grant_table.h
@@ -136,6 +136,7 @@  static inline unsigned int grant_to_status_frames(int grant_frames)
 int mem_sharing_gref_to_gfn(struct grant_table *gt, grant_ref_t ref,
                             gfn_t *gfn, uint16_t *status);
 
+mfn_t gnttab_get_frame(struct domain *d, unsigned long idx);
 int gnttab_map_frame(struct domain *d, unsigned long idx, gfn_t gfn,
                      mfn_t *mfn);