
[v2,19/20] x86/mem_sharing: reset a fork

Message ID: 22a6cb7bd5593ed38cf6f66c26c4734a04718e1a.1576697796.git.tamas.lengyel@intel.com
State: Superseded
Series: VM forking

Commit Message

Tamas K Lengyel Dec. 18, 2019, 7:40 p.m. UTC
Implement a hypercall that allows a fork to shed all memory that got allocated
for it during its execution and reload its vCPU context from the parent VM.
This allows the forked VM to reset into the same state the parent VM is in,
faster than creating a new fork would be. Measurements show about a 2x
speedup during normal fuzzing operations. Performance may vary depending on
how much memory got allocated for the forked VM. If it has been completely
deduplicated from the parent VM then creating a new fork would likely be more
performant.

Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
---
 xen/arch/x86/mm/mem_sharing.c | 105 ++++++++++++++++++++++++++++++++++
 xen/include/public/memory.h   |   1 +
 2 files changed, 106 insertions(+)
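
For illustration, a minimal caller-side sketch of how the new operation
might be issued through the public interface added by this patch. Only
XENMEM_sharing_op_fork_reset and the xen_mem_sharing_op fields come from
this series; the xc_memshr_memop() helper name is an assumption about the
toolstack plumbing:

    /* Hypothetical caller sketch, not part of this patch. */
    xen_mem_sharing_op_t mso = {
        .op     = XENMEM_sharing_op_fork_reset,
        .domain = fork_domid,   /* the fork; Xen resolves the parent itself */
    };

    /* Issued as a XENMEM_sharing_op memory_op, e.g. via a libxc-style
     * wrapper (assumed name/signature). */
    rc = xc_memshr_memop(xch, fork_domid, &mso);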

Comments

Julien Grall Dec. 18, 2019, 10 p.m. UTC | #1
Hi Tamas,

On 18/12/2019 19:40, Tamas K Lengyel wrote:
> Implement hypercall that allows a fork to shed all memory that got allocated
> for it during its execution and re-load its vCPU context from the parent VM.
> This allows the forked VM to reset into the same state the parent VM is in a
> faster way then creating a new fork would be. Measurements show about a 2x
> speedup during normal fuzzing operations. Performance may vary depending how
> much memory got allocated for the forked VM. If it has been completely
> deduplicated from the parent VM then creating a new fork would likely be more
> performant.
> 
> Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
> ---
>   xen/arch/x86/mm/mem_sharing.c | 105 ++++++++++++++++++++++++++++++++++
>   xen/include/public/memory.h   |   1 +
>   2 files changed, 106 insertions(+)
> 
> diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
> index e93ad2ec5a..4735a334b9 100644
> --- a/xen/arch/x86/mm/mem_sharing.c
> +++ b/xen/arch/x86/mm/mem_sharing.c
> @@ -1622,6 +1622,87 @@ static int mem_sharing_fork(struct domain *d, struct domain *cd)
>       return 0;
>   }
>   
> +struct gfn_free;
> +struct gfn_free {
> +    struct gfn_free *next;
> +    struct page_info *page;
> +    gfn_t gfn;
> +};
> +
> +static int mem_sharing_fork_reset(struct domain *d, struct domain *cd)
> +{
> +    int rc;
> +
> +    struct p2m_domain* p2m = p2m_get_hostp2m(cd);
> +    struct gfn_free *list = NULL;
> +    struct page_info *page;
> +
> +    page_list_for_each(page, &cd->page_list)

AFAICT, your domain is not paused, so it would be possible to have pages 
added/removed from that list behind your back.

You also have multiple loops over the page_list in this function. Given that 
the page_list can be quite big, this is a recipe for hogging the pCPU while 
holding the RCU lock on the domain for the vCPU running this call.

> +    {
> +        mfn_t mfn = page_to_mfn(page);
> +        if ( mfn_valid(mfn) )
> +        {
> +            p2m_type_t p2mt;
> +            p2m_access_t p2ma;
> +            gfn_t gfn = mfn_to_gfn(cd, mfn);
> +            mfn = __get_gfn_type_access(p2m, gfn_x(gfn), &p2mt, &p2ma,
> +                                        0, NULL, false);
> +            if ( p2m_is_ram(p2mt) )
> +            {
> +                struct gfn_free *gfn_free;
> +                if ( !get_page(page, cd) )
> +                    goto err_reset;
> +
> +                /*
> +                 * We can't free the page while iterating over the page_list
> +                 * so we build a separate list to loop over.
> +                 *
> +                 * We want to iterate over the page_list instead of checking
> +                 * gfn from 0 to max_gfn because this is ~10x faster.
> +                 */
> +                gfn_free = xmalloc(struct gfn_free);

If I did the math right, for a 4G guest this will require ~24MB of 
memory. Actually, is it really necessary to do this allocation just for a 
short period of time?
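
(Back-of-the-envelope check of that figure, assuming 4 KiB pages and a 
24-byte struct gfn_free on x86-64, i.e. two 8-byte pointers plus an 8-byte 
gfn_t: a 4 GiB guest has 4 GiB / 4 KiB = 1,048,576 pages, and 
1,048,576 * 24 bytes is roughly 24 MiB if every page ends up on the list.)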

What are you trying to achieve by iterating twice on the GFN? Wouldn't 
it be easier to pause the domain?

> +                if ( !gfn_free )
> +                    goto err_reset;
> +
> +                gfn_free->gfn = gfn;
> +                gfn_free->page = page;
> +                gfn_free->next = list;
> +                list = gfn_free;
> +            }
> +        }
> +    }
> +
> +    while ( list )
> +    {
> +        struct gfn_free *next = list->next;
> +
> +        rc = p2m->set_entry(p2m, list->gfn, INVALID_MFN, PAGE_ORDER_4K,
> +                            p2m_invalid, p2m_access_rwx, -1);
> +        put_page_alloc_ref(list->page);
> +        put_page(list->page);
> +
> +        xfree(list);
> +        list = next;
> +
> +        ASSERT(!rc);
> +    }
> +
> +    if ( (rc = fork_hvm(d, cd)) )
> +        return rc;
> +
> + err_reset:
> +    while ( list )
> +    {
> +        struct gfn_free *next = list->next;
> +
> +        put_page(list->page);
> +        xfree(list);
> +        list = next;
> +    }
> +
> +    return 0;
> +}
> +

Cheers,
Tamas K Lengyel Dec. 18, 2019, 10:33 p.m. UTC | #2
On Wed, Dec 18, 2019 at 3:00 PM Julien Grall <julien@xen.org> wrote:
>
> Hi Tamas,
>
> On 18/12/2019 19:40, Tamas K Lengyel wrote:
> > Implement hypercall that allows a fork to shed all memory that got allocated
> > for it during its execution and re-load its vCPU context from the parent VM.
> > This allows the forked VM to reset into the same state the parent VM is in a
> > faster way then creating a new fork would be. Measurements show about a 2x
> > speedup during normal fuzzing operations. Performance may vary depending how
> > much memory got allocated for the forked VM. If it has been completely
> > deduplicated from the parent VM then creating a new fork would likely be more
> > performant.
> >
> > Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
> > ---
> >   xen/arch/x86/mm/mem_sharing.c | 105 ++++++++++++++++++++++++++++++++++
> >   xen/include/public/memory.h   |   1 +
> >   2 files changed, 106 insertions(+)
> >
> > diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
> > index e93ad2ec5a..4735a334b9 100644
> > --- a/xen/arch/x86/mm/mem_sharing.c
> > +++ b/xen/arch/x86/mm/mem_sharing.c
> > @@ -1622,6 +1622,87 @@ static int mem_sharing_fork(struct domain *d, struct domain *cd)
> >       return 0;
> >   }
> >
> > +struct gfn_free;
> > +struct gfn_free {
> > +    struct gfn_free *next;
> > +    struct page_info *page;
> > +    gfn_t gfn;
> > +};
> > +
> > +static int mem_sharing_fork_reset(struct domain *d, struct domain *cd)
> > +{
> > +    int rc;
> > +
> > +    struct p2m_domain* p2m = p2m_get_hostp2m(cd);
> > +    struct gfn_free *list = NULL;
> > +    struct page_info *page;
> > +
> > +    page_list_for_each(page, &cd->page_list)
>
> AFAICT, your domain is not paused, so it would be possible to have page
> added/remove in that list behind your back.

Well, it's not that it's not paused, it's just that I haven't added a
sanity check to make sure it is. The toolstack can (and should) pause
it, so that sanity check would be warranted.

>
> You also have multiple loop on the page_list in this function. Given the
> number of page_list can be quite big, this is a call for hogging the
> pCPU and an RCU lock on the domain vCPU running this call.

There is just one loop over the page_list itself; the second loop is over
the internal list being built here, which is a subset. That list should in
fact be small (in our tests usually <100 entries). Granted, the list can
grow larger, but in those cases it's likely better to just discard the fork
and create a new one. So in my opinion adding a hypercall continuation to
this is not needed.

>
> > +    {
> > +        mfn_t mfn = page_to_mfn(page);
> > +        if ( mfn_valid(mfn) )
> > +        {
> > +            p2m_type_t p2mt;
> > +            p2m_access_t p2ma;
> > +            gfn_t gfn = mfn_to_gfn(cd, mfn);
> > +            mfn = __get_gfn_type_access(p2m, gfn_x(gfn), &p2mt, &p2ma,
> > +                                        0, NULL, false);
> > +            if ( p2m_is_ram(p2mt) )
> > +            {
> > +                struct gfn_free *gfn_free;
> > +                if ( !get_page(page, cd) )
> > +                    goto err_reset;
> > +
> > +                /*
> > +                 * We can't free the page while iterating over the page_list
> > +                 * so we build a separate list to loop over.
> > +                 *
> > +                 * We want to iterate over the page_list instead of checking
> > +                 * gfn from 0 to max_gfn because this is ~10x faster.
> > +                 */
> > +                gfn_free = xmalloc(struct gfn_free);
>
> If I did the math right, for a 4G guest this will require at ~24MB of
> memory. Actually, is it really necessary to do the allocation for a
> short period of time?

If you have a fully deduplicated fork then you should not be using
this function to begin with. You get better performance by throwing
that one away and creating a new one. As for using xmalloc here, I'm
not sure what other way I have to build a list of pages that need to
be freed. I can't free the page itself while I'm iterating on the
page_list (that I'm aware of). The only other option available is
calling __get_gfn_type_access with gfn=0..max_gfn, which will be
extremely slow because you have to loop over a lot of holes.

>
> What are you trying to achieve by iterating twice on the GFN? Wouldn't
> it be easier to pause the domain?

I'm not sure what you mean, where do you see me iterating twice on the
gfn? And what does pausing have to do with it?

Tamas
Julien Grall Dec. 18, 2019, 11:01 p.m. UTC | #3
Hi,

On 18/12/2019 22:33, Tamas K Lengyel wrote:
> On Wed, Dec 18, 2019 at 3:00 PM Julien Grall <julien@xen.org> wrote:
>>
>> Hi Tamas,
>>
>> On 18/12/2019 19:40, Tamas K Lengyel wrote:
>>> Implement hypercall that allows a fork to shed all memory that got allocated
>>> for it during its execution and re-load its vCPU context from the parent VM.
>>> This allows the forked VM to reset into the same state the parent VM is in a
>>> faster way then creating a new fork would be. Measurements show about a 2x
>>> speedup during normal fuzzing operations. Performance may vary depending how
>>> much memory got allocated for the forked VM. If it has been completely
>>> deduplicated from the parent VM then creating a new fork would likely be more
>>> performant.
>>>
>>> Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
>>> ---
>>>    xen/arch/x86/mm/mem_sharing.c | 105 ++++++++++++++++++++++++++++++++++
>>>    xen/include/public/memory.h   |   1 +
>>>    2 files changed, 106 insertions(+)
>>>
>>> diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
>>> index e93ad2ec5a..4735a334b9 100644
>>> --- a/xen/arch/x86/mm/mem_sharing.c
>>> +++ b/xen/arch/x86/mm/mem_sharing.c
>>> @@ -1622,6 +1622,87 @@ static int mem_sharing_fork(struct domain *d, struct domain *cd)
>>>        return 0;
>>>    }
>>>
>>> +struct gfn_free;
>>> +struct gfn_free {
>>> +    struct gfn_free *next;
>>> +    struct page_info *page;
>>> +    gfn_t gfn;
>>> +};
>>> +
>>> +static int mem_sharing_fork_reset(struct domain *d, struct domain *cd)
>>> +{
>>> +    int rc;
>>> +
>>> +    struct p2m_domain* p2m = p2m_get_hostp2m(cd);
>>> +    struct gfn_free *list = NULL;
>>> +    struct page_info *page;
>>> +
>>> +    page_list_for_each(page, &cd->page_list)
>>
>> AFAICT, your domain is not paused, so it would be possible to have page
>> added/remove in that list behind your back.
> 
> Well, it's not that it's not paused, it's just that I haven't added a
> sanity check to make sure it is. The toolstack can (and should) pause
> it, so that sanity check would be warranted.
I have only read the hypervisor part, so I didn't know what the 
toolstack has done.

> 
>>
>> You also have multiple loop on the page_list in this function. Given the
>> number of page_list can be quite big, this is a call for hogging the
>> pCPU and an RCU lock on the domain vCPU running this call.
> 
> There is just one loop over page_list itself, the second loop is on
> the internal list that is being built here which will be a subset. The
> list itself in fact should be small (in our tests usually <100).

For a start, nothing in this function tells me that there will be only 
100 pages. But then, I don't think it is right to implement your 
hypercall based only on the "normal" scenario. You should also think about 
the "worst" case scenario.

In this case the worst case scenario is having hundreds of pages in the page_list.

> Granted the list can grow larger, but in those cases its likely better
> to just discard the fork and create a new one. So in my opinion adding
> a hypercall continuation to this not needed

How would the caller know that? What would happen if the caller ends up 
calling this with a growing list?

> 
>>
>>> +    {
>>> +        mfn_t mfn = page_to_mfn(page);
>>> +        if ( mfn_valid(mfn) )
>>> +        {
>>> +            p2m_type_t p2mt;
>>> +            p2m_access_t p2ma;
>>> +            gfn_t gfn = mfn_to_gfn(cd, mfn);
>>> +            mfn = __get_gfn_type_access(p2m, gfn_x(gfn), &p2mt, &p2ma,
>>> +                                        0, NULL, false);
>>> +            if ( p2m_is_ram(p2mt) )
>>> +            {
>>> +                struct gfn_free *gfn_free;
>>> +                if ( !get_page(page, cd) )
>>> +                    goto err_reset;
>>> +
>>> +                /*
>>> +                 * We can't free the page while iterating over the page_list
>>> +                 * so we build a separate list to loop over.
>>> +                 *
>>> +                 * We want to iterate over the page_list instead of checking
>>> +                 * gfn from 0 to max_gfn because this is ~10x faster.
>>> +                 */
>>> +                gfn_free = xmalloc(struct gfn_free);
>>
>> If I did the math right, for a 4G guest this will require at ~24MB of
>> memory. Actually, is it really necessary to do the allocation for a
>> short period of time?
> 
> If you have a fully deduplicated fork then you should not be using
> this function to begin with. You get better performance my throwing
> that one away and creating a new one.

How does a user know when/how this can be called? But then, as said above, 
this may be called by mistake... So I still think you need to be prepared 
for the worst case.

> As for using xmalloc here, I'm
> not sure what other way I have to build a list of pages that need to
> be freed. I can't free the page itself while I'm iterating on
> page_list (that I'm aware of). The only other option available is
> calling __get_gfn_type_access with gfn=0..max_gfn which will be
> extremely slow because you have to loop over a lot of holes.
You can use page_list_for_each_safe(). This is already used by functions 
such as relinquish_memory().
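
A rough, untested sketch of that suggestion, reusing the names from the
patch above; error handling and the exact reference-count pairing are
assumed to mirror the original loop:

    struct page_info *page, *tmp;

    page_list_for_each_safe(page, tmp, &cd->page_list)
    {
        p2m_type_t p2mt;
        p2m_access_t p2ma;
        mfn_t mfn = page_to_mfn(page);
        gfn_t gfn;

        if ( !mfn_valid(mfn) )
            continue;

        gfn = mfn_to_gfn(cd, mfn);
        mfn = __get_gfn_type_access(p2m, gfn_x(gfn), &p2mt, &p2ma,
                                    0, NULL, false);
        if ( !p2m_is_ram(p2mt) || !get_page(page, cd) )
            continue;

        /* The _safe iterator caches the next entry in 'tmp', so the page
         * can be dropped right here and no private list is needed. */
        rc = p2m->set_entry(p2m, gfn, INVALID_MFN, PAGE_ORDER_4K,
                            p2m_invalid, p2m_access_rwx, -1);
        ASSERT(!rc);
        put_page_alloc_ref(page);
        put_page(page);
    }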

> 
>>
>> What are you trying to achieve by iterating twice on the GFN? Wouldn't
>> it be easier to pause the domain?
> 
> I'm not sure what you mean, where do you see me iterating twice on the
> gfn? And what does pausing have to do with it?

It was unclear why you decided to use a double loop here. You explained 
it above, so this can be discarded.

Cheers,
Tamas K Lengyel Dec. 19, 2019, 12:15 a.m. UTC | #4
On Wed, Dec 18, 2019 at 4:02 PM Julien Grall <julien@xen.org> wrote:
>
> Hi,
>
> On 18/12/2019 22:33, Tamas K Lengyel wrote:
> > On Wed, Dec 18, 2019 at 3:00 PM Julien Grall <julien@xen.org> wrote:
> >>
> >> Hi Tamas,
> >>
> >> On 18/12/2019 19:40, Tamas K Lengyel wrote:
> >>> Implement hypercall that allows a fork to shed all memory that got allocated
> >>> for it during its execution and re-load its vCPU context from the parent VM.
> >>> This allows the forked VM to reset into the same state the parent VM is in a
> >>> faster way then creating a new fork would be. Measurements show about a 2x
> >>> speedup during normal fuzzing operations. Performance may vary depending how
> >>> much memory got allocated for the forked VM. If it has been completely
> >>> deduplicated from the parent VM then creating a new fork would likely be more
> >>> performant.
> >>>
> >>> Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
> >>> ---
> >>>    xen/arch/x86/mm/mem_sharing.c | 105 ++++++++++++++++++++++++++++++++++
> >>>    xen/include/public/memory.h   |   1 +
> >>>    2 files changed, 106 insertions(+)
> >>>
> >>> diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
> >>> index e93ad2ec5a..4735a334b9 100644
> >>> --- a/xen/arch/x86/mm/mem_sharing.c
> >>> +++ b/xen/arch/x86/mm/mem_sharing.c
> >>> @@ -1622,6 +1622,87 @@ static int mem_sharing_fork(struct domain *d, struct domain *cd)
> >>>        return 0;
> >>>    }
> >>>
> >>> +struct gfn_free;
> >>> +struct gfn_free {
> >>> +    struct gfn_free *next;
> >>> +    struct page_info *page;
> >>> +    gfn_t gfn;
> >>> +};
> >>> +
> >>> +static int mem_sharing_fork_reset(struct domain *d, struct domain *cd)
> >>> +{
> >>> +    int rc;
> >>> +
> >>> +    struct p2m_domain* p2m = p2m_get_hostp2m(cd);
> >>> +    struct gfn_free *list = NULL;
> >>> +    struct page_info *page;
> >>> +
> >>> +    page_list_for_each(page, &cd->page_list)
> >>
> >> AFAICT, your domain is not paused, so it would be possible to have page
> >> added/remove in that list behind your back.
> >
> > Well, it's not that it's not paused, it's just that I haven't added a
> > sanity check to make sure it is. The toolstack can (and should) pause
> > it, so that sanity check would be warranted.
> I have only read the hypervisor part, so I didn't know what the
> toolstack has done.

I've added the same enforced VM-paused check that is present in
the fork hypercall handler.
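
For reference, a minimal sketch of what such a sanity check might look like
at the top of the handler; pause_count is just the obvious candidate from
struct domain, and the field actually checked in the updated patch may differ:

    /* Illustrative only: refuse to reset a fork that is not paused. */
    if ( !atomic_read(&cd->pause_count) )
        return -EBUSY;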

>
> >
> >>
> >> You also have multiple loop on the page_list in this function. Given the
> >> number of page_list can be quite big, this is a call for hogging the
> >> pCPU and an RCU lock on the domain vCPU running this call.
> >
> > There is just one loop over page_list itself, the second loop is on
> > the internal list that is being built here which will be a subset. The
> > list itself in fact should be small (in our tests usually <100).
>
> For a first, nothing in this function tells me that there will be only
> 100 pages. But then, I don't think this is right to implement your
> hypercall based only the  "normal" scenario. You should also think about
> the "worst" case scenario.
>
> In this case the worst case scenario is have hundreds of page in page_list.

Well, this is only an experimental system that's completely disabled
by default. I think it is fair to assume that people who make use of it
will know what they are doing.

>
> > Granted the list can grow larger, but in those cases its likely better
> > to just discard the fork and create a new one. So in my opinion adding
> > a hypercall continuation to this not needed
>
> How would the caller know it? What would happen if the caller ends up to
> call this with a growing list.

The caller knows by virtue of knowing how long the VM was executed
for. In the use case this is targeted at, the VM was executing only for
a couple of seconds at most. Usually much less than that (we get about
~80 resets/s with AFL). During that time it's extremely unlikely you
get more than ~100 pages deduplicated (that is, written to). But
even if there are more pages, it just means the hypercall might take a
bit longer to run for that iteration. I don't see any issue with not
breaking up this hypercall with a continuation even under the worst-case
situation, though. But if others feel strongly about having a
continuation for this I don't really mind adding it.

>
> >
> >>
> >>> +    {
> >>> +        mfn_t mfn = page_to_mfn(page);
> >>> +        if ( mfn_valid(mfn) )
> >>> +        {
> >>> +            p2m_type_t p2mt;
> >>> +            p2m_access_t p2ma;
> >>> +            gfn_t gfn = mfn_to_gfn(cd, mfn);
> >>> +            mfn = __get_gfn_type_access(p2m, gfn_x(gfn), &p2mt, &p2ma,
> >>> +                                        0, NULL, false);
> >>> +            if ( p2m_is_ram(p2mt) )
> >>> +            {
> >>> +                struct gfn_free *gfn_free;
> >>> +                if ( !get_page(page, cd) )
> >>> +                    goto err_reset;
> >>> +
> >>> +                /*
> >>> +                 * We can't free the page while iterating over the page_list
> >>> +                 * so we build a separate list to loop over.
> >>> +                 *
> >>> +                 * We want to iterate over the page_list instead of checking
> >>> +                 * gfn from 0 to max_gfn because this is ~10x faster.
> >>> +                 */
> >>> +                gfn_free = xmalloc(struct gfn_free);
> >>
> >> If I did the math right, for a 4G guest this will require at ~24MB of
> >> memory. Actually, is it really necessary to do the allocation for a
> >> short period of time?
> >
> > If you have a fully deduplicated fork then you should not be using
> > this function to begin with. You get better performance my throwing
> > that one away and creating a new one.
>
> How a user knows when/how this can be called? But then, as said above,
> this may be called by mistake... So I still think you need to be prepare
> for the worst case.

See my answer above.

>
> > As for using xmalloc here, I'm
> > not sure what other way I have to build a list of pages that need to
> > be freed. I can't free the page itself while I'm iterating on
> > page_list (that I'm aware of). The only other option available is
> > calling __get_gfn_type_access with gfn=0..max_gfn which will be
> > extremely slow because you have to loop over a lot of holes.
> You can use page_list_for_each_safe(). This is already used by function
> such as relinquish_memory().

Neat, will check it out! Would certainly simplify things not having to
build a private list.

>
> >
> >>
> >> What are you trying to achieve by iterating twice on the GFN? Wouldn't
> >> it be easier to pause the domain?
> >
> > I'm not sure what you mean, where do you see me iterating twice on the
> > gfn? And what does pausing have to do with it?
>
> It was unclear why you decided to use a double loop here. You explained
> it above, so this can be discarded.

OK, cool.

Thanks,
Tamas
Julien Grall Dec. 19, 2019, 7:45 a.m. UTC | #5
Hi Tamas,

On 19/12/2019 00:15, Tamas K Lengyel wrote:
> On Wed, Dec 18, 2019 at 4:02 PM Julien Grall <julien@xen.org> wrote:
>>
>> Hi,
>>
>> On 18/12/2019 22:33, Tamas K Lengyel wrote:
>>> On Wed, Dec 18, 2019 at 3:00 PM Julien Grall <julien@xen.org> wrote:
>>>>
>>>> Hi Tamas,
>>>>
>>>> On 18/12/2019 19:40, Tamas K Lengyel wrote:
>>>>> Implement hypercall that allows a fork to shed all memory that got allocated
>>>>> for it during its execution and re-load its vCPU context from the parent VM.
>>>>> This allows the forked VM to reset into the same state the parent VM is in a
>>>>> faster way then creating a new fork would be. Measurements show about a 2x
>>>>> speedup during normal fuzzing operations. Performance may vary depending how
>>>>> much memory got allocated for the forked VM. If it has been completely
>>>>> deduplicated from the parent VM then creating a new fork would likely be more
>>>>> performant.
>>>>>
>>>>> Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
>>>>> ---
>>>>>     xen/arch/x86/mm/mem_sharing.c | 105 ++++++++++++++++++++++++++++++++++
>>>>>     xen/include/public/memory.h   |   1 +
>>>>>     2 files changed, 106 insertions(+)
>>>>>
>>>>> diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
>>>>> index e93ad2ec5a..4735a334b9 100644
>>>>> --- a/xen/arch/x86/mm/mem_sharing.c
>>>>> +++ b/xen/arch/x86/mm/mem_sharing.c
>>>>> @@ -1622,6 +1622,87 @@ static int mem_sharing_fork(struct domain *d, struct domain *cd)
>>>>>         return 0;
>>>>>     }
>>>>>
>>>>> +struct gfn_free;
>>>>> +struct gfn_free {
>>>>> +    struct gfn_free *next;
>>>>> +    struct page_info *page;
>>>>> +    gfn_t gfn;
>>>>> +};
>>>>> +
>>>>> +static int mem_sharing_fork_reset(struct domain *d, struct domain *cd)
>>>>> +{
>>>>> +    int rc;
>>>>> +
>>>>> +    struct p2m_domain* p2m = p2m_get_hostp2m(cd);
>>>>> +    struct gfn_free *list = NULL;
>>>>> +    struct page_info *page;
>>>>> +
>>>>> +    page_list_for_each(page, &cd->page_list)
>>>>
>>>> AFAICT, your domain is not paused, so it would be possible to have page
>>>> added/remove in that list behind your back.
>>>
>>> Well, it's not that it's not paused, it's just that I haven't added a
>>> sanity check to make sure it is. The toolstack can (and should) pause
>>> it, so that sanity check would be warranted.
>> I have only read the hypervisor part, so I didn't know what the
>> toolstack has done.
> 
> I've added the same enforced VM paused operation that is present for
> the fork hypercall handler.
> 
>>
>>>
>>>>
>>>> You also have multiple loop on the page_list in this function. Given the
>>>> number of page_list can be quite big, this is a call for hogging the
>>>> pCPU and an RCU lock on the domain vCPU running this call.
>>>
>>> There is just one loop over page_list itself, the second loop is on
>>> the internal list that is being built here which will be a subset. The
>>> list itself in fact should be small (in our tests usually <100).
>>
>> For a first, nothing in this function tells me that there will be only
>> 100 pages. But then, I don't think this is right to implement your
>> hypercall based only the  "normal" scenario. You should also think about
>> the "worst" case scenario.
>>
>> In this case the worst case scenario is have hundreds of page in page_list.
> 
> Well, this is only an experimental system that's completely disabled
> by default. Making the assumption that people who make use of it will
> know what they are doing I think is fair.

I assume that if you submit this new hypercall upstream then there is a 
longer-term plan to have more people use it and potentially make it 
"stable". If not, then it raises the question why this is pushed upstream...

In any case, all the known assumptions should be documented so they can 
be fixed rather than forgotten until they are rediscovered via an XSA.

> 
>>
>>> Granted the list can grow larger, but in those cases its likely better
>>> to just discard the fork and create a new one. So in my opinion adding
>>> a hypercall continuation to this not needed
>>
>> How would the caller know it? What would happen if the caller ends up to
>> call this with a growing list.
> 
> The caller knows by virtue of knowing how long the VM was executed
> for. In the usecase this is targeted at the VM was executing only for
> a couple seconds at most. Usually much less then that (we get about
> ~80 resets/s with AFL). During that time its extremely unlikely you
> get more then a ~100 pages deduplicated (that is, written to). But
> even if there are more pages, it just means the hypercall might take a
> bit longer to run for that iteration.

I assume that if you upstream the code then you want more people to use it 
(otherwise what's the point?). In this case, you will likely have people 
who have heard about the feature and want to test it but don't know the internals.

Such users need to know how this can be called safely without reading the 
implementation. In other words, some documentation for your hypercall is 
needed.

> I don't see any issue with not
> breaking up this hypercall with continuation even under the worst case
> situation though.

Xen only supports voluntary preemption, which means that a hypercall can 
only be preempted if there is code for it. Otherwise the preemption will 
mostly only happen when returning to the guest.

In other words, the vCPU executing the hypercall may go past its 
timeslice and prevent other vCPUs from running.

There are other problems with long-running hypercalls. Anyway, in short, 
if you ever want to get your code supported then you will need the 
hypercall to be broken down.
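
For context, the usual voluntary-preemption pattern looks roughly like the
sketch below. This is illustrative only, not code from this series, and the
'done' counter would have to be carried across the continuation, which is
exactly the interface question raised later in the thread:

    unsigned long done = 0;

    page_list_for_each_safe(page, tmp, &cd->page_list)
    {
        /* Check for pending softirq work every 256 pages or so. */
        if ( !(++done & 0xff) && hypercall_preempt_check() )
        {
            rc = -ERESTART;  /* arrange for the hypercall to be re-issued */
            break;
        }

        /* ... reset one page as in the original loop ... */
    }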

> But if others feel that strongly as well about
> having to have continuation for this I don't really mind adding it.

I don't think the continuation work is going to be difficult, but if you 
want to delay it, then the minimum is to document such assumptions for 
your users.

Cheers,
Jan Beulich Dec. 19, 2019, 11:06 a.m. UTC | #6
On 19.12.2019 01:15, Tamas K Lengyel wrote:
> On Wed, Dec 18, 2019 at 4:02 PM Julien Grall <julien@xen.org> wrote:
>> On 18/12/2019 22:33, Tamas K Lengyel wrote:
>>> On Wed, Dec 18, 2019 at 3:00 PM Julien Grall <julien@xen.org> wrote:
>>>> You also have multiple loop on the page_list in this function. Given the
>>>> number of page_list can be quite big, this is a call for hogging the
>>>> pCPU and an RCU lock on the domain vCPU running this call.
>>>
>>> There is just one loop over page_list itself, the second loop is on
>>> the internal list that is being built here which will be a subset. The
>>> list itself in fact should be small (in our tests usually <100).
>>
>> For a first, nothing in this function tells me that there will be only
>> 100 pages. But then, I don't think this is right to implement your
>> hypercall based only the  "normal" scenario. You should also think about
>> the "worst" case scenario.
>>
>> In this case the worst case scenario is have hundreds of page in page_list.
> 
> Well, this is only an experimental system that's completely disabled
> by default. Making the assumption that people who make use of it will
> know what they are doing I think is fair.

FWIW I'm with Julien here: The preferred course of action is to make
the operation safe against abuse. The minimum requirement is to
document obvious missing pieces for this to become supported code.

Jan
Tamas K Lengyel Dec. 19, 2019, 4:02 p.m. UTC | #7
On Thu, Dec 19, 2019 at 4:05 AM Jan Beulich <jbeulich@suse.com> wrote:
>
> On 19.12.2019 01:15, Tamas K Lengyel wrote:
> > On Wed, Dec 18, 2019 at 4:02 PM Julien Grall <julien@xen.org> wrote:
> >> On 18/12/2019 22:33, Tamas K Lengyel wrote:
> >>> On Wed, Dec 18, 2019 at 3:00 PM Julien Grall <julien@xen.org> wrote:
> >>>> You also have multiple loop on the page_list in this function. Given the
> >>>> number of page_list can be quite big, this is a call for hogging the
> >>>> pCPU and an RCU lock on the domain vCPU running this call.
> >>>
> >>> There is just one loop over page_list itself, the second loop is on
> >>> the internal list that is being built here which will be a subset. The
> >>> list itself in fact should be small (in our tests usually <100).
> >>
> >> For a first, nothing in this function tells me that there will be only
> >> 100 pages. But then, I don't think this is right to implement your
> >> hypercall based only the  "normal" scenario. You should also think about
> >> the "worst" case scenario.
> >>
> >> In this case the worst case scenario is have hundreds of page in page_list.
> >
> > Well, this is only an experimental system that's completely disabled
> > by default. Making the assumption that people who make use of it will
> > know what they are doing I think is fair.
>
> FWIW I'm with Julien here: The preferred course of action is to make
> the operation safe against abuse. The minimum requirement is to
> document obvious missing pieces for this to become supported code.

That's perfectly fine by me.

Tamas
Tamas K Lengyel Dec. 19, 2019, 4:11 p.m. UTC | #8
> > Well, this is only an experimental system that's completely disabled
> > by default. Making the assumption that people who make use of it will
> > know what they are doing I think is fair.
>
> I assume that if you submit to upstream this new hypercall then there is
> longer plan to have more people to use it and potentially making
> "stable". If not, then it raises the question why this is pushed upstream...

It's being pushed upstream so other people can make use of it, even if
it's not "production quality". Beyond what is being sent here there
are no longer-term plans from Intel (at this point) to support this in
any way. The alternative would be that we just release a fork (or just
the patches) and walk away. If the Xen community wants to make the
announcement that only code that will have long-term support and is
"stable" is accepted upstream, that's IMHO drastically going to reduce
people's interest in sharing anything.

> In any case, all the known assumptions should be documented so they can
> be fixed rather than forgotten until it is rediscovered via an XSA.

Fair enough.

> >
> >>
> >>> Granted the list can grow larger, but in those cases its likely better
> >>> to just discard the fork and create a new one. So in my opinion adding
> >>> a hypercall continuation to this not needed
> >>
> >> How would the caller know it? What would happen if the caller ends up to
> >> call this with a growing list.
> >
> > The caller knows by virtue of knowing how long the VM was executed
> > for. In the usecase this is targeted at the VM was executing only for
> > a couple seconds at most. Usually much less then that (we get about
> > ~80 resets/s with AFL). During that time its extremely unlikely you
> > get more then a ~100 pages deduplicated (that is, written to). But
> > even if there are more pages, it just means the hypercall might take a
> > bit longer to run for that iteration.
>
> I assume if you upstream the code then you want more people to use it
> (otherwise what's the point?). In this case, you will likely have people
> that heard about the feature, wants to test but don't know the internal.
>
> Such users need to know how this can be call safely without reading the
> implementation. In other words, some documentation for your hypercall is
> needed.

Sure.

>
> > I don't see any issue with not
> > breaking up this hypercall with continuation even under the worst case
> > situation though.
>
> Xen only supports voluntary preemption, this means that an hypercall can
> only be preempted if there is code for it. Otherwise the preemption will
> mostly only happen when returning to the guest.
>
> In other words, the vCPU executing the hypercall may go past its
> timeslice and prevent other vCPU to run.
>
> There are other problem with long running hypercalls. Anyway, in short,
> if you ever want to get you code supported then you will need the
> hypercall to be broken down.
>
> > But if others feel that strongly as well about
> > having to have continuation for this I don't really mind adding it.
>
> I don't think the continuation work is going to be difficult, but if you
> want to delay it, then the minimum is to document such assumptions for
> your users.

I just don't see a use for it because it will never actually execute.
So to me it just looks like unnecessary dead glue. But documenting the
assumption under which this hypercall should execute is perfectly
fair.

Tamas
Julien Grall Dec. 19, 2019, 4:57 p.m. UTC | #9
Hi,

On 19/12/2019 16:11, Tamas K Lengyel wrote:
>>> Well, this is only an experimental system that's completely disabled
>>> by default. Making the assumption that people who make use of it will
>>> know what they are doing I think is fair.
>>
>> I assume that if you submit to upstream this new hypercall then there is
>> longer plan to have more people to use it and potentially making
>> "stable". If not, then it raises the question why this is pushed upstream...
> 
> It's being pushed upstream so other people can make use of it, even if
> it's not "production quality". Beyond what is being sent here there
> are no longer term plans from Intel (at this point) to support this in
> any way. 

So if this is merged, who is going to maintain it?

> The alternative would be that we just release a fork (or just
> the patches) and walk away.
>  If the Xen community wants to make the
> announcement that only code that will have long term support and is
> "stable" is accepted upstream that's IMHO drastically going to reduce
> people's interest to share anything.

Sharing is one thing, but if this code is not at least minimally 
maintained then it is likely the code will not be functional in a year's time.

Don't get me wrong, this is a cool feature to have, but you make it 
sound like this is going to be dumped in Xen and never going to be 
touched again. How is this going to be beneficial for the community?

> 
>> In any case, all the known assumptions should be documented so they can
>> be fixed rather than forgotten until it is rediscovered via an XSA.
> 
> Fair enough.
> 
>>>
>>>>
>>>>> Granted the list can grow larger, but in those cases its likely better
>>>>> to just discard the fork and create a new one. So in my opinion adding
>>>>> a hypercall continuation to this not needed
>>>>
>>>> How would the caller know it? What would happen if the caller ends up to
>>>> call this with a growing list.
>>>
>>> The caller knows by virtue of knowing how long the VM was executed
>>> for. In the usecase this is targeted at the VM was executing only for
>>> a couple seconds at most. Usually much less then that (we get about
>>> ~80 resets/s with AFL). During that time its extremely unlikely you
>>> get more then a ~100 pages deduplicated (that is, written to). But
>>> even if there are more pages, it just means the hypercall might take a
>>> bit longer to run for that iteration.
>>
>> I assume if you upstream the code then you want more people to use it
>> (otherwise what's the point?). In this case, you will likely have people
>> that heard about the feature, wants to test but don't know the internal.
>>
>> Such users need to know how this can be call safely without reading the
>> implementation. In other words, some documentation for your hypercall is
>> needed.
> 
> Sure.
> 
>>
>>> I don't see any issue with not
>>> breaking up this hypercall with continuation even under the worst case
>>> situation though.
>>
>> Xen only supports voluntary preemption, this means that an hypercall can
>> only be preempted if there is code for it. Otherwise the preemption will
>> mostly only happen when returning to the guest.
>>
>> In other words, the vCPU executing the hypercall may go past its
>> timeslice and prevent other vCPU to run.
>>
>> There are other problem with long running hypercalls. Anyway, in short,
>> if you ever want to get you code supported then you will need the
>> hypercall to be broken down.
>>
>>> But if others feel that strongly as well about
>>> having to have continuation for this I don't really mind adding it.
>>
>> I don't think the continuation work is going to be difficult, but if you
>> want to delay it, then the minimum is to document such assumptions for
>> your users.
> 
> I just don't see a use for it because it will never actually execute.

That's a very narrow view of how your hypercall can be used. I know that 
you said it should only be called early, but imagine for a moment the 
user decides to call it much later in the fork process.

> So to me it just looks like unnecessary dead glue. 

Try to call the hypercall after enough deduplication has happened (maybe 
20min). Alternatively, give me access to your machine with the code and 
I can show you how it can be misused ;).

> But documenting the
> assumption under which this hypercall should execute is perfectly
> fair.

You might want to think about how the interface would look with 
preemption, so that the day you decide to introduce preemption you don't 
have to create a new hypercall.

Cheers,
Tamas K Lengyel Dec. 19, 2019, 5:23 p.m. UTC | #10
On Thu, Dec 19, 2019 at 9:58 AM Julien Grall <julien@xen.org> wrote:
>
> Hi,
>
> On 19/12/2019 16:11, Tamas K Lengyel wrote:
> >>> Well, this is only an experimental system that's completely disabled
> >>> by default. Making the assumption that people who make use of it will
> >>> know what they are doing I think is fair.
> >>
> >> I assume that if you submit to upstream this new hypercall then there is
> >> longer plan to have more people to use it and potentially making
> >> "stable". If not, then it raises the question why this is pushed upstream...
> >
> > It's being pushed upstream so other people can make use of it, even if
> > it's not "production quality". Beyond what is being sent here there
> > are no longer term plans from Intel (at this point) to support this in
> > any way.
>
> So if this is merged, who is going to maintain it?

It falls under mem_sharing, for which I'm listed as the "Odd Fixes"
maintainer, work I do in my personal free time. The status there
isn't changing with this new feature.

>
> > The alternative would be that we just release a fork (or just
> > the patches) and walk away.
> >  If the Xen community wants to make the
> > announcement that only code that will have long term support and is
> > "stable" is accepted upstream that's IMHO drastically going to reduce
> > people's interest to share anything.
>
> Sharing is one thing, but if this code is not at least a minimum
> maintained then it is likely the code will not be functional in a year time.

Surprisingly, mem_sharing has had only minor bitrot in the last ~5 years,
during which time it has been pretty much abandoned.

>
> Don't get me wrong, this is a cool feature to have but you make it
> sounds like this is going to be dumped in Xen and never going to be
> touched again. How is this going to be beneficial for the community?

It adds an experimental feature to Xen that people can try out and,
well, experiment with! It may be useful, it may not be. You yourself
said that this is a cool feature. The fact that there is a JIRA ticket
tracking this also shows there is community interest in having such a
feature available. If down the road that changes and this proves to be
dead code, it can be removed. I don't think that will be necessary,
since this isn't even compiled by default anymore, though. But anyway,
it makes more sense to get it upstream than to carry it out of tree,
because it gets more exposure that way and more people may try it out.
Having it in-tree also means that in a couple of releases people who are
interested in the feature don't have to go through the process of
rebasing the patchset onto newer versions of Xen, since it's already
in-tree.

> >>> But if others feel that strongly as well about
> >>> having to have continuation for this I don't really mind adding it.
> >>
> >> I don't think the continuation work is going to be difficult, but if you
> >> want to delay it, then the minimum is to document such assumptions for
> >> your users.
> >
> > I just don't see a use for it because it will never actually execute.
>
> That's a very narrow view of how your hypercall can be used. I know that
> you said that should only be called early, but imagine for a moment the
> user decide to call it much later in the fork process.
>
> > So to me it just looks like unnecessary dead glue.
>
> Try to call the hypercall after enough deduplication happen (maybe
> 20min). Alternatively, give me access to your machine with the code and
> I can show how it can be misused ;).

It will hang for a bit for sure, and Linux in dom0 will complain that a
CPU is stuck. But it will eventually finish; it's not like it's doing
all that much. And anyway, if you notice that happening when you call
it, that is an obvious clue that you shouldn't be using it in the
situation you are using it in. Having a continuation would hide that.

>
> > But documenting the
> > assumption under which this hypercall should execute is perfectly
> > fair.
>
> You might want to think how the interface would look like with the
> preemption. So the day you decide to introduce preemption you don't have
> to create a new hypercall.

Why would you need to introduce a new hypercall if preemption becomes
necessary? This is not a stable interface. It can be removed/changed
completely from one Xen release to the next.

Tamas
Julien Grall Dec. 19, 2019, 5:38 p.m. UTC | #11
On 19/12/2019 17:23, Tamas K Lengyel wrote:
> On Thu, Dec 19, 2019 at 9:58 AM Julien Grall <julien@xen.org> wrote:
>>
>> Hi,
>>
>> On 19/12/2019 16:11, Tamas K Lengyel wrote:
>>>>> Well, this is only an experimental system that's completely disabled
>>>>> by default. Making the assumption that people who make use of it will
>>>>> know what they are doing I think is fair.
>>>>
>>>> I assume that if you submit to upstream this new hypercall then there is
>>>> longer plan to have more people to use it and potentially making
>>>> "stable". If not, then it raises the question why this is pushed upstream...
>>>
>>> It's being pushed upstream so other people can make use of it, even if
>>> it's not "production quality". Beyond what is being sent here there
>>> are no longer term plans from Intel (at this point) to support this in
>>> any way.
>>
>> So if this is merged, who is going to maintain it?
> 
> It falls under mem_sharing for which I'm listed as "Odd Fixes"
> maintainer, which I do in my personal free time. The status there
> isn't changing with this new feature.
> 
>>
>>> The alternative would be that we just release a fork (or just
>>> the patches) and walk away.
>>>   If the Xen community wants to make the
>>> announcement that only code that will have long term support and is
>>> "stable" is accepted upstream that's IMHO drastically going to reduce
>>> people's interest to share anything.
>>
>> Sharing is one thing, but if this code is not at least a minimum
>> maintained then it is likely the code will not be functional in a year time.
> 
> Surprisingly mem_sharing had only minor bitrots in the last ~5 years
> in which time it has been pretty much abandoned.
This falls under "minimally maintained". That wasn't clear from your 
previous statement that there will be no support.

[...]

> 
>>>>> But if others feel that strongly as well about
>>>>> having to have continuation for this I don't really mind adding it.
>>>>
>>>> I don't think the continuation work is going to be difficult, but if you
>>>> want to delay it, then the minimum is to document such assumptions for
>>>> your users.
>>>
>>> I just don't see a use for it because it will never actually execute.
>>
>> That's a very narrow view of how your hypercall can be used. I know that
>> you said that should only be called early, but imagine for a moment the
>> user decide to call it much later in the fork process.
>>
>>> So to me it just looks like unnecessary dead glue.
>>
>> Try to call the hypercall after enough deduplication happen (maybe
>> 20min). Alternatively, give me access to your machine with the code and
>> I can show how it can be misused ;).
> 
> It will hang for a bit for sure and Linux in dom0 will complain that a
> CPU is stuck. But it will eventually finish. It's not like it's doing
> all that much. And anyway, if you notice that happening when you call
> it it will be an obvious clue that you shouldn't be using it under the
> situation you are using it under. Having continuation would hide that.

I am not going to argue more as this is an experimental feature. But 
this will be a showstopper if we ever consider mem_sharing to be 
supported (or even security supported).

Meanwhile please document the assumption.

> 
>>
>>> But documenting the
>>> assumption under which this hypercall should execute is perfectly
>>> fair.
>>
>> You might want to think how the interface would look like with the
>> preemption. So the day you decide to introduce preemption you don't have
>> to create a new hypercall.
> 
> Why would you need to introduce a new hypercall if preemption becomes
> necessary? This is not a stable interfaces. It can be removed/changed
> completely from one Xen release to the next.

Sorry, I didn't realize the mem_sharing code was not a stable interface.

Cheers,
Tamas K Lengyel Dec. 19, 2019, 6 p.m. UTC | #12
> >>> The alternative would be that we just release a fork (or just
> >>> the patches) and walk away.
> >>>   If the Xen community wants to make the
> >>> announcement that only code that will have long term support and is
> >>> "stable" is accepted upstream that's IMHO drastically going to reduce
> >>> people's interest to share anything.
> >>
> >> Sharing is one thing, but if this code is not at least a minimum
> >> maintained then it is likely the code will not be functional in a year time.
> >
> > Surprisingly mem_sharing had only minor bitrots in the last ~5 years
> > in which time it has been pretty much abandoned.
> This falls under a "minimum maintained". This wasn't clear from your
> previous statement stating there will be no support.

Sure, I meant there is no support from Intel (i.e. it's not part of my
job description nor do I get paid to support this long-term). I
usually check during the RC test days that it's at least functional by
doing some manual testing. But it's pretty ad hoc when and if I do
that (hence the Odd Fixes status).

> >
> >>>>> But if others feel that strongly as well about
> >>>>> having to have continuation for this I don't really mind adding it.
> >>>>
> >>>> I don't think the continuation work is going to be difficult, but if you
> >>>> want to delay it, then the minimum is to document such assumptions for
> >>>> your users.
> >>>
> >>> I just don't see a use for it because it will never actually execute.
> >>
> >> That's a very narrow view of how your hypercall can be used. I know that
> >> you said that should only be called early, but imagine for a moment the
> >> user decide to call it much later in the fork process.
> >>
> >>> So to me it just looks like unnecessary dead glue.
> >>
> >> Try to call the hypercall after enough deduplication happen (maybe
> >> 20min). Alternatively, give me access to your machine with the code and
> >> I can show how it can be misused ;).
> >
> > It will hang for a bit for sure and Linux in dom0 will complain that a
> > CPU is stuck. But it will eventually finish. It's not like it's doing
> > all that much. And anyway, if you notice that happening when you call
> > it it will be an obvious clue that you shouldn't be using it under the
> > situation you are using it under. Having continuation would hide that.
>
> I am not going to argue more as this is an experimental feature. But
> this will be a showstopper if we ever consider mem_sharing to be
> supported (or even security supported).
>
> Meanwhile please document the assumption.

Ack, already did.

Thanks,
Tamas

Patch

diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
index e93ad2ec5a..4735a334b9 100644
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -1622,6 +1622,87 @@  static int mem_sharing_fork(struct domain *d, struct domain *cd)
     return 0;
 }
 
+struct gfn_free;
+struct gfn_free {
+    struct gfn_free *next;
+    struct page_info *page;
+    gfn_t gfn;
+};
+
+static int mem_sharing_fork_reset(struct domain *d, struct domain *cd)
+{
+    int rc;
+
+    struct p2m_domain* p2m = p2m_get_hostp2m(cd);
+    struct gfn_free *list = NULL;
+    struct page_info *page;
+
+    page_list_for_each(page, &cd->page_list)
+    {
+        mfn_t mfn = page_to_mfn(page);
+        if ( mfn_valid(mfn) )
+        {
+            p2m_type_t p2mt;
+            p2m_access_t p2ma;
+            gfn_t gfn = mfn_to_gfn(cd, mfn);
+            mfn = __get_gfn_type_access(p2m, gfn_x(gfn), &p2mt, &p2ma,
+                                        0, NULL, false);
+            if ( p2m_is_ram(p2mt) )
+            {
+                struct gfn_free *gfn_free;
+                if ( !get_page(page, cd) )
+                    goto err_reset;
+
+                /*
+                 * We can't free the page while iterating over the page_list
+                 * so we build a separate list to loop over.
+                 *
+                 * We want to iterate over the page_list instead of checking
+                 * gfn from 0 to max_gfn because this is ~10x faster.
+                 */
+                gfn_free = xmalloc(struct gfn_free);
+                if ( !gfn_free )
+                    goto err_reset;
+
+                gfn_free->gfn = gfn;
+                gfn_free->page = page;
+                gfn_free->next = list;
+                list = gfn_free;
+            }
+        }
+    }
+
+    while ( list )
+    {
+        struct gfn_free *next = list->next;
+
+        rc = p2m->set_entry(p2m, list->gfn, INVALID_MFN, PAGE_ORDER_4K,
+                            p2m_invalid, p2m_access_rwx, -1);
+        put_page_alloc_ref(list->page);
+        put_page(list->page);
+
+        xfree(list);
+        list = next;
+
+        ASSERT(!rc);
+    }
+
+    if ( (rc = fork_hvm(d, cd)) )
+        return rc;
+
+ err_reset:
+    while ( list )
+    {
+        struct gfn_free *next = list->next;
+
+        put_page(list->page);
+        xfree(list);
+        list = next;
+    }
+
+    return 0;
+}
+
 int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
 {
     int rc;
@@ -1905,6 +1986,30 @@  int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
             rcu_unlock_domain(pd);
             break;
         }
+
+        case XENMEM_sharing_op_fork_reset:
+        {
+            struct domain *pd;
+
+            rc = -EINVAL;
+            if ( mso.u.fork._pad[0] || mso.u.fork._pad[1] ||
+                 mso.u.fork._pad[2] )
+                 goto out;
+
+            rc = -ENOSYS;
+            if ( !d->parent )
+                goto out;
+
+            rc = rcu_lock_live_remote_domain_by_id(d->parent->domain_id, &pd);
+            if ( rc )
+                goto out;
+
+            rc = mem_sharing_fork_reset(pd, d);
+
+            rcu_unlock_domain(pd);
+            break;
+        }
+
         default:
             rc = -ENOSYS;
             break;
diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
index 90a3f4498e..e3d063e22e 100644
--- a/xen/include/public/memory.h
+++ b/xen/include/public/memory.h
@@ -483,6 +483,7 @@  DEFINE_XEN_GUEST_HANDLE(xen_mem_access_op_t);
 #define XENMEM_sharing_op_audit             7
 #define XENMEM_sharing_op_range_share       8
 #define XENMEM_sharing_op_fork              9
+#define XENMEM_sharing_op_fork_reset        10
 
 #define XENMEM_SHARING_OP_S_HANDLE_INVALID  (-10)
 #define XENMEM_SHARING_OP_C_HANDLE_INVALID  (-9)