diff mbox series

[v2,4/8] x86/mem-sharing: copy GADDR based shared guest areas

Message ID dad36e4c-4529-6836-c50e-7c5febb8eea4@suse.com (mailing list archive)
State New, archived
Headers show
Series runstate/time area registration by (guest) physical address | expand

Commit Message

Jan Beulich Jan. 23, 2023, 2:55 p.m. UTC
In preparation of the introduction of new vCPU operations allowing to
register the respective areas (one of the two is x86-specific) by
guest-physical address, add the necessary fork handling (with the
backing function yet to be filled in).

Signed-off-by: Jan Beulich <jbeulich@suse.com>

Comments

Tamas K Lengyel Jan. 23, 2023, 4:09 p.m. UTC | #1
On Mon, Jan 23, 2023 at 9:55 AM Jan Beulich <jbeulich@suse.com> wrote:
>
> In preparation of the introduction of new vCPU operations allowing to
> register the respective areas (one of the two is x86-specific) by
> guest-physical address, add the necessary fork handling (with the
> backing function yet to be filled in).
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> --- a/xen/arch/x86/mm/mem_sharing.c
> +++ b/xen/arch/x86/mm/mem_sharing.c
> @@ -1653,6 +1653,65 @@ static void copy_vcpu_nonreg_state(struc
>      hvm_set_nonreg_state(cd_vcpu, &nrs);
>  }
>
> +static int copy_guest_area(struct guest_area *cd_area,
> +                           const struct guest_area *d_area,
> +                           struct vcpu *cd_vcpu,
> +                           const struct domain *d)
> +{
> +    mfn_t d_mfn, cd_mfn;
> +
> +    if ( !d_area->pg )
> +        return 0;
> +
> +    d_mfn = page_to_mfn(d_area->pg);
> +
> +    /* Allocate & map a page for the area if it hasn't been already. */
> +    if ( !cd_area->pg )
> +    {
> +        gfn_t gfn = mfn_to_gfn(d, d_mfn);
> +        struct p2m_domain *p2m = p2m_get_hostp2m(cd_vcpu->domain);
> +        p2m_type_t p2mt;
> +        p2m_access_t p2ma;
> +        unsigned int offset;
> +        int ret;
> +
> +        cd_mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL, NULL);
> +        if ( mfn_eq(cd_mfn, INVALID_MFN) )
> +        {
> +            struct page_info *pg = alloc_domheap_page(cd_vcpu->domain,
0);
> +
> +            if ( !pg )
> +                return -ENOMEM;
> +
> +            cd_mfn = page_to_mfn(pg);
> +            set_gpfn_from_mfn(mfn_x(cd_mfn), gfn_x(gfn));
> +
> +            ret = p2m->set_entry(p2m, gfn, cd_mfn, PAGE_ORDER_4K,
p2m_ram_rw,
> +                                 p2m->default_access, -1);
> +            if ( ret )
> +                return ret;
> +        }
> +        else if ( p2mt != p2m_ram_rw )
> +            return -EBUSY;
> +
> +        /*
> +         * Simply specify the entire range up to the end of the page.
All the
> +         * function uses it for is a check for not crossing page
boundaries.
> +         */
> +        offset = PAGE_OFFSET(d_area->map);
> +        ret = map_guest_area(cd_vcpu, gfn_to_gaddr(gfn) + offset,
> +                             PAGE_SIZE - offset, cd_area, NULL);
> +        if ( ret )
> +            return ret;
> +    }
> +    else
> +        cd_mfn = page_to_mfn(cd_area->pg);

Everything to this point seems to be non mem-sharing/forking related. Could
these live somewhere else? There must be some other place where allocating
these areas happens already for non-fork VMs so it would make sense to just
refactor that code to be callable from here.

> +
> +    copy_domain_page(cd_mfn, d_mfn);
> +
> +    return 0;
> +}
> +
>  static int copy_vpmu(struct vcpu *d_vcpu, struct vcpu *cd_vcpu)
>  {
>      struct vpmu_struct *d_vpmu = vcpu_vpmu(d_vcpu);
> @@ -1745,6 +1804,16 @@ static int copy_vcpu_settings(struct dom
>              copy_domain_page(new_vcpu_info_mfn, vcpu_info_mfn);
>          }
>
> +        /* Same for the (physically registered) runstate and time info
areas. */
> +        ret = copy_guest_area(&cd_vcpu->runstate_guest_area,
> +                              &d_vcpu->runstate_guest_area, cd_vcpu, d);
> +        if ( ret )
> +            return ret;
> +        ret = copy_guest_area(&cd_vcpu->arch.time_guest_area,
> +                              &d_vcpu->arch.time_guest_area, cd_vcpu, d);
> +        if ( ret )
> +            return ret;
> +
>          ret = copy_vpmu(d_vcpu, cd_vcpu);
>          if ( ret )
>              return ret;
> @@ -1987,7 +2056,10 @@ int mem_sharing_fork_reset(struct domain
>
>   state:
>      if ( reset_state )
> +    {
>          rc = copy_settings(d, pd);
> +        /* TBD: What to do here with -ERESTART? */

Where is ERESTART coming from?
Jan Beulich Jan. 23, 2023, 4:24 p.m. UTC | #2
On 23.01.2023 17:09, Tamas K Lengyel wrote:
> On Mon, Jan 23, 2023 at 9:55 AM Jan Beulich <jbeulich@suse.com> wrote:
>> --- a/xen/arch/x86/mm/mem_sharing.c
>> +++ b/xen/arch/x86/mm/mem_sharing.c
>> @@ -1653,6 +1653,65 @@ static void copy_vcpu_nonreg_state(struc
>>      hvm_set_nonreg_state(cd_vcpu, &nrs);
>>  }
>>
>> +static int copy_guest_area(struct guest_area *cd_area,
>> +                           const struct guest_area *d_area,
>> +                           struct vcpu *cd_vcpu,
>> +                           const struct domain *d)
>> +{
>> +    mfn_t d_mfn, cd_mfn;
>> +
>> +    if ( !d_area->pg )
>> +        return 0;
>> +
>> +    d_mfn = page_to_mfn(d_area->pg);
>> +
>> +    /* Allocate & map a page for the area if it hasn't been already. */
>> +    if ( !cd_area->pg )
>> +    {
>> +        gfn_t gfn = mfn_to_gfn(d, d_mfn);
>> +        struct p2m_domain *p2m = p2m_get_hostp2m(cd_vcpu->domain);
>> +        p2m_type_t p2mt;
>> +        p2m_access_t p2ma;
>> +        unsigned int offset;
>> +        int ret;
>> +
>> +        cd_mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL, NULL);
>> +        if ( mfn_eq(cd_mfn, INVALID_MFN) )
>> +        {
>> +            struct page_info *pg = alloc_domheap_page(cd_vcpu->domain,
> 0);
>> +
>> +            if ( !pg )
>> +                return -ENOMEM;
>> +
>> +            cd_mfn = page_to_mfn(pg);
>> +            set_gpfn_from_mfn(mfn_x(cd_mfn), gfn_x(gfn));
>> +
>> +            ret = p2m->set_entry(p2m, gfn, cd_mfn, PAGE_ORDER_4K,
> p2m_ram_rw,
>> +                                 p2m->default_access, -1);
>> +            if ( ret )
>> +                return ret;
>> +        }
>> +        else if ( p2mt != p2m_ram_rw )
>> +            return -EBUSY;
>> +
>> +        /*
>> +         * Simply specify the entire range up to the end of the page.
> All the
>> +         * function uses it for is a check for not crossing page
> boundaries.
>> +         */
>> +        offset = PAGE_OFFSET(d_area->map);
>> +        ret = map_guest_area(cd_vcpu, gfn_to_gaddr(gfn) + offset,
>> +                             PAGE_SIZE - offset, cd_area, NULL);
>> +        if ( ret )
>> +            return ret;
>> +    }
>> +    else
>> +        cd_mfn = page_to_mfn(cd_area->pg);
> 
> Everything to this point seems to be non mem-sharing/forking related. Could
> these live somewhere else? There must be some other place where allocating
> these areas happens already for non-fork VMs so it would make sense to just
> refactor that code to be callable from here.

It is the "copy" aspect which makes this mem-sharing (or really fork)
specific. Plus in the end this is no different from what you have
there right now for copying the vCPU info area. In the final patch
that other code gets removed by re-using the code here.

I also haven't been able to spot anything that could be factored
out (and one might expect that if there was something, then the vCPU
info area copying should also already have used it). map_guest_area()
is all that is used for other purposes as well.

>> +
>> +    copy_domain_page(cd_mfn, d_mfn);
>> +
>> +    return 0;
>> +}
>> +
>>  static int copy_vpmu(struct vcpu *d_vcpu, struct vcpu *cd_vcpu)
>>  {
>>      struct vpmu_struct *d_vpmu = vcpu_vpmu(d_vcpu);
>> @@ -1745,6 +1804,16 @@ static int copy_vcpu_settings(struct dom
>>              copy_domain_page(new_vcpu_info_mfn, vcpu_info_mfn);
>>          }
>>
>> +        /* Same for the (physically registered) runstate and time info
> areas. */
>> +        ret = copy_guest_area(&cd_vcpu->runstate_guest_area,
>> +                              &d_vcpu->runstate_guest_area, cd_vcpu, d);
>> +        if ( ret )
>> +            return ret;
>> +        ret = copy_guest_area(&cd_vcpu->arch.time_guest_area,
>> +                              &d_vcpu->arch.time_guest_area, cd_vcpu, d);
>> +        if ( ret )
>> +            return ret;
>> +
>>          ret = copy_vpmu(d_vcpu, cd_vcpu);
>>          if ( ret )
>>              return ret;
>> @@ -1987,7 +2056,10 @@ int mem_sharing_fork_reset(struct domain
>>
>>   state:
>>      if ( reset_state )
>> +    {
>>          rc = copy_settings(d, pd);
>> +        /* TBD: What to do here with -ERESTART? */
> 
> Where does ERESTART coming from?

From map_guest_area()'s attempt to acquire the hypercall deadlock mutex,
in order to then pause the subject vCPU. I suppose that in the forking
case it may already be paused, but then there's no way map_guest_area()
could know. Looking at the pause count is fragile, as there's no
guarantee that the vCPU won't be unpaused while we're still doing work on
it. Hence I view such checks as only suitable for assertions.

Jan
Tamas K Lengyel Jan. 23, 2023, 6:32 p.m. UTC | #3
On Mon, Jan 23, 2023 at 11:24 AM Jan Beulich <jbeulich@suse.com> wrote:
>
> On 23.01.2023 17:09, Tamas K Lengyel wrote:
> > On Mon, Jan 23, 2023 at 9:55 AM Jan Beulich <jbeulich@suse.com> wrote:
> >> --- a/xen/arch/x86/mm/mem_sharing.c
> >> +++ b/xen/arch/x86/mm/mem_sharing.c
> >> @@ -1653,6 +1653,65 @@ static void copy_vcpu_nonreg_state(struc
> >>      hvm_set_nonreg_state(cd_vcpu, &nrs);
> >>  }
> >>
> >> +static int copy_guest_area(struct guest_area *cd_area,
> >> +                           const struct guest_area *d_area,
> >> +                           struct vcpu *cd_vcpu,
> >> +                           const struct domain *d)
> >> +{
> >> +    mfn_t d_mfn, cd_mfn;
> >> +
> >> +    if ( !d_area->pg )
> >> +        return 0;
> >> +
> >> +    d_mfn = page_to_mfn(d_area->pg);
> >> +
> >> +    /* Allocate & map a page for the area if it hasn't been already.
*/
> >> +    if ( !cd_area->pg )
> >> +    {
> >> +        gfn_t gfn = mfn_to_gfn(d, d_mfn);
> >> +        struct p2m_domain *p2m = p2m_get_hostp2m(cd_vcpu->domain);
> >> +        p2m_type_t p2mt;
> >> +        p2m_access_t p2ma;
> >> +        unsigned int offset;
> >> +        int ret;
> >> +
> >> +        cd_mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL,
NULL);
> >> +        if ( mfn_eq(cd_mfn, INVALID_MFN) )
> >> +        {
> >> +            struct page_info *pg = alloc_domheap_page(cd_vcpu->domain,
> > 0);
> >> +
> >> +            if ( !pg )
> >> +                return -ENOMEM;
> >> +
> >> +            cd_mfn = page_to_mfn(pg);
> >> +            set_gpfn_from_mfn(mfn_x(cd_mfn), gfn_x(gfn));
> >> +
> >> +            ret = p2m->set_entry(p2m, gfn, cd_mfn, PAGE_ORDER_4K,
> > p2m_ram_rw,
> >> +                                 p2m->default_access, -1);
> >> +            if ( ret )
> >> +                return ret;
> >> +        }
> >> +        else if ( p2mt != p2m_ram_rw )
> >> +            return -EBUSY;
> >> +
> >> +        /*
> >> +         * Simply specify the entire range up to the end of the page.
> > All the
> >> +         * function uses it for is a check for not crossing page
> > boundaries.
> >> +         */
> >> +        offset = PAGE_OFFSET(d_area->map);
> >> +        ret = map_guest_area(cd_vcpu, gfn_to_gaddr(gfn) + offset,
> >> +                             PAGE_SIZE - offset, cd_area, NULL);
> >> +        if ( ret )
> >> +            return ret;
> >> +    }
> >> +    else
> >> +        cd_mfn = page_to_mfn(cd_area->pg);
> >
> > Everything to this point seems to be non mem-sharing/forking related.
Could
> > these live somewhere else? There must be some other place where
allocating
> > these areas happens already for non-fork VMs so it would make sense to
just
> > refactor that code to be callable from here.
>
> It is the "copy" aspect with makes this mem-sharing (or really fork)
> specific. Plus in the end this is no different from what you have
> there right now for copying the vCPU info area. In the final patch
> that other code gets removed by re-using the code here.

Yes, the copy part is fork-specific. Arguably if there was a way to do the
allocation of the page for vcpu_info I would prefer that being elsewhere,
but while the only requirement is allocate-page and copy from parent I'm OK
with that logic being in here because it's really straightforward. But now
you also do extra sanity checks here which are harder to comprehend in this
context alone. What if extra sanity checks will be needed in the future? Or
the sanity checks in the future diverge from where this happens for normal
VMs because someone overlooks this needing to be synched here too?

> I also haven't been able to spot anything that could be factored
> out (and one might expect that if there was something, then the vCPU
> info area copying should also already have used it). map_guest_area()
> is all that is used for other purposes as well.

Well, there must be a location where all this happens for normal VMs as
well, no? Why not factor that code so that it can be called from here, so
that we don't have to track sanity check requirements in two different
locations? Or for normal VMs that sanity checking bit isn't required? If
so, why?

> >> +
> >> +    copy_domain_page(cd_mfn, d_mfn);
> >> +
> >> +    return 0;
> >> +}
> >> +
> >>  static int copy_vpmu(struct vcpu *d_vcpu, struct vcpu *cd_vcpu)
> >>  {
> >>      struct vpmu_struct *d_vpmu = vcpu_vpmu(d_vcpu);
> >> @@ -1745,6 +1804,16 @@ static int copy_vcpu_settings(struct dom
> >>              copy_domain_page(new_vcpu_info_mfn, vcpu_info_mfn);
> >>          }
> >>
> >> +        /* Same for the (physically registered) runstate and time info
> > areas. */
> >> +        ret = copy_guest_area(&cd_vcpu->runstate_guest_area,
> >> +                              &d_vcpu->runstate_guest_area, cd_vcpu,
d);
> >> +        if ( ret )
> >> +            return ret;
> >> +        ret = copy_guest_area(&cd_vcpu->arch.time_guest_area,
> >> +                              &d_vcpu->arch.time_guest_area, cd_vcpu,
d);
> >> +        if ( ret )
> >> +            return ret;
> >> +
> >>          ret = copy_vpmu(d_vcpu, cd_vcpu);
> >>          if ( ret )
> >>              return ret;
> >> @@ -1987,7 +2056,10 @@ int mem_sharing_fork_reset(struct domain
> >>
> >>   state:
> >>      if ( reset_state )
> >> +    {
> >>          rc = copy_settings(d, pd);
> >> +        /* TBD: What to do here with -ERESTART? */
> >
> > Where does ERESTART coming from?
>
> From map_guest_area()'s attempt to acquire the hypercall deadlock mutex,
> in order to then pause the subject vCPU. I suppose that in the forking
> case it may already be paused, but then there's no way map_guest_area()
> could know. Looking at the pause count is fragile, as there's no
> guarantee that the vCPU may be unpaused while we're still doing work on
> it. Hence I view such checks as only suitable for assertions.

Since map_guest_area is only used to sanity check, and it only happens when
the page is being setup for the fork, why can't the sanity check be done on
the parent? The parent is guaranteed to be paused when forks are active so
there is no ERESTART concern and from the looks of it if there is a concern
the sanity check is looking for it would be visible on the parent just as
well as the fork. But I would like to understand why that sanity checking
is required in the first place.

Thanks,
Tamas
Jan Beulich Jan. 24, 2023, 11:19 a.m. UTC | #4
On 23.01.2023 19:32, Tamas K Lengyel wrote:
> On Mon, Jan 23, 2023 at 11:24 AM Jan Beulich <jbeulich@suse.com> wrote:
>> On 23.01.2023 17:09, Tamas K Lengyel wrote:
>>> On Mon, Jan 23, 2023 at 9:55 AM Jan Beulich <jbeulich@suse.com> wrote:
>>>> --- a/xen/arch/x86/mm/mem_sharing.c
>>>> +++ b/xen/arch/x86/mm/mem_sharing.c
>>>> @@ -1653,6 +1653,65 @@ static void copy_vcpu_nonreg_state(struc
>>>>      hvm_set_nonreg_state(cd_vcpu, &nrs);
>>>>  }
>>>>
>>>> +static int copy_guest_area(struct guest_area *cd_area,
>>>> +                           const struct guest_area *d_area,
>>>> +                           struct vcpu *cd_vcpu,
>>>> +                           const struct domain *d)
>>>> +{
>>>> +    mfn_t d_mfn, cd_mfn;
>>>> +
>>>> +    if ( !d_area->pg )
>>>> +        return 0;
>>>> +
>>>> +    d_mfn = page_to_mfn(d_area->pg);
>>>> +
>>>> +    /* Allocate & map a page for the area if it hasn't been already.
> */
>>>> +    if ( !cd_area->pg )
>>>> +    {
>>>> +        gfn_t gfn = mfn_to_gfn(d, d_mfn);
>>>> +        struct p2m_domain *p2m = p2m_get_hostp2m(cd_vcpu->domain);
>>>> +        p2m_type_t p2mt;
>>>> +        p2m_access_t p2ma;
>>>> +        unsigned int offset;
>>>> +        int ret;
>>>> +
>>>> +        cd_mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL,
> NULL);
>>>> +        if ( mfn_eq(cd_mfn, INVALID_MFN) )
>>>> +        {
>>>> +            struct page_info *pg = alloc_domheap_page(cd_vcpu->domain,
>>> 0);
>>>> +
>>>> +            if ( !pg )
>>>> +                return -ENOMEM;
>>>> +
>>>> +            cd_mfn = page_to_mfn(pg);
>>>> +            set_gpfn_from_mfn(mfn_x(cd_mfn), gfn_x(gfn));
>>>> +
>>>> +            ret = p2m->set_entry(p2m, gfn, cd_mfn, PAGE_ORDER_4K,
>>> p2m_ram_rw,
>>>> +                                 p2m->default_access, -1);
>>>> +            if ( ret )
>>>> +                return ret;
>>>> +        }
>>>> +        else if ( p2mt != p2m_ram_rw )
>>>> +            return -EBUSY;
>>>> +
>>>> +        /*
>>>> +         * Simply specify the entire range up to the end of the page.
>>> All the
>>>> +         * function uses it for is a check for not crossing page
>>> boundaries.
>>>> +         */
>>>> +        offset = PAGE_OFFSET(d_area->map);
>>>> +        ret = map_guest_area(cd_vcpu, gfn_to_gaddr(gfn) + offset,
>>>> +                             PAGE_SIZE - offset, cd_area, NULL);
>>>> +        if ( ret )
>>>> +            return ret;
>>>> +    }
>>>> +    else
>>>> +        cd_mfn = page_to_mfn(cd_area->pg);
>>>
>>> Everything to this point seems to be non mem-sharing/forking related.
> Could
>>> these live somewhere else? There must be some other place where
> allocating
>>> these areas happens already for non-fork VMs so it would make sense to
> just
>>> refactor that code to be callable from here.
>>
>> It is the "copy" aspect with makes this mem-sharing (or really fork)
>> specific. Plus in the end this is no different from what you have
>> there right now for copying the vCPU info area. In the final patch
>> that other code gets removed by re-using the code here.
> 
> Yes, the copy part is fork-specific. Arguably if there was a way to do the
> allocation of the page for vcpu_info I would prefer that being elsewhere,
> but while the only requirement is allocate-page and copy from parent I'm OK
> with that logic being in here because it's really straight forward. But now
> you also do extra sanity checks here which are harder to comprehend in this
> context alone.

What sanity checks are you talking about (also below, where you claim
map_guest_area() would be used only to sanity check)?

> What if extra sanity checks will be needed in the future? Or
> the sanity checks in the future diverge from where this happens for normal
> VMs because someone overlooks this needing to be synched here too?
> 
>> I also haven't been able to spot anything that could be factored
>> out (and one might expect that if there was something, then the vCPU
>> info area copying should also already have used it). map_guest_area()
>> is all that is used for other purposes as well.
> 
> Well, there must be a location where all this happens for normal VMs as
> well, no?

That's map_guest_area(). What is needed here but not elsewhere is the
populating of the GFN underlying the to-be-mapped area. That's the code
being added here, mirroring what you need to do for the vCPU info page.
Similar code isn't needed elsewhere because the guest invoked operation
is purely a "map" - the underlying pages are already expected to be
populated (which of course we check, or else we wouldn't know what page
to actually map).

> Why not factor that code so that it can be called from here, so
> that we don't have to track sanity check requirements in two different
> locations? Or for normal VMs that sanity checking bit isn't required? If
> so, why?

As per above, I'm afraid that I'm lost with these questions. I simply
don't know what you're talking about.

>>>> +
>>>> +    copy_domain_page(cd_mfn, d_mfn);
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>>  static int copy_vpmu(struct vcpu *d_vcpu, struct vcpu *cd_vcpu)
>>>>  {
>>>>      struct vpmu_struct *d_vpmu = vcpu_vpmu(d_vcpu);
>>>> @@ -1745,6 +1804,16 @@ static int copy_vcpu_settings(struct dom
>>>>              copy_domain_page(new_vcpu_info_mfn, vcpu_info_mfn);
>>>>          }
>>>>
>>>> +        /* Same for the (physically registered) runstate and time info
>>> areas. */
>>>> +        ret = copy_guest_area(&cd_vcpu->runstate_guest_area,
>>>> +                              &d_vcpu->runstate_guest_area, cd_vcpu,
> d);
>>>> +        if ( ret )
>>>> +            return ret;
>>>> +        ret = copy_guest_area(&cd_vcpu->arch.time_guest_area,
>>>> +                              &d_vcpu->arch.time_guest_area, cd_vcpu,
> d);
>>>> +        if ( ret )
>>>> +            return ret;
>>>> +
>>>>          ret = copy_vpmu(d_vcpu, cd_vcpu);
>>>>          if ( ret )
>>>>              return ret;
>>>> @@ -1987,7 +2056,10 @@ int mem_sharing_fork_reset(struct domain
>>>>
>>>>   state:
>>>>      if ( reset_state )
>>>> +    {
>>>>          rc = copy_settings(d, pd);
>>>> +        /* TBD: What to do here with -ERESTART? */
>>>
>>> Where does ERESTART coming from?
>>
>> From map_guest_area()'s attempt to acquire the hypercall deadlock mutex,
>> in order to then pause the subject vCPU. I suppose that in the forking
>> case it may already be paused, but then there's no way map_guest_area()
>> could know. Looking at the pause count is fragile, as there's no
>> guarantee that the vCPU may be unpaused while we're still doing work on
>> it. Hence I view such checks as only suitable for assertions.
> 
> Since map_guest_area is only used to sanity check, and it only happens when
> the page is being setup for the fork, why can't the sanity check be done on
> the parent?

As above, I'm afraid I simply don't understand what you're asking.

> The parent is guaranteed to be paused when forks are active so
> there is no ERESTART concern and from the looks of it if there is a concern
> the sanity check is looking for it would be visible on the parent just as
> well as the fork.

The parent being paused isn't of interest to map_guest_area(). It's the
subject vcpu (i.e. in the forked instance) where we require this. Thinking
of it - the forked domain wasn't started yet, was it? We could then avoid
the pausing (and the acquiring of the hypercall deadlock mutex) based on
->creation_finished still being "false", or even simply based on
v->domain != current->domain. Then there wouldn't be any chance anymore of
-ERESTART making it here.

> But I would like to understand why that sanity checking
> is required in the first place.

See further up.

Jan
Tamas K Lengyel Jan. 25, 2023, 3:34 p.m. UTC | #5
On Tue, Jan 24, 2023 at 6:19 AM Jan Beulich <jbeulich@suse.com> wrote:
>
> On 23.01.2023 19:32, Tamas K Lengyel wrote:
> > On Mon, Jan 23, 2023 at 11:24 AM Jan Beulich <jbeulich@suse.com> wrote:
> >> On 23.01.2023 17:09, Tamas K Lengyel wrote:
> >>> On Mon, Jan 23, 2023 at 9:55 AM Jan Beulich <jbeulich@suse.com> wrote:
> >>>> --- a/xen/arch/x86/mm/mem_sharing.c
> >>>> +++ b/xen/arch/x86/mm/mem_sharing.c
> >>>> @@ -1653,6 +1653,65 @@ static void copy_vcpu_nonreg_state(struc
> >>>>      hvm_set_nonreg_state(cd_vcpu, &nrs);
> >>>>  }
> >>>>
> >>>> +static int copy_guest_area(struct guest_area *cd_area,
> >>>> +                           const struct guest_area *d_area,
> >>>> +                           struct vcpu *cd_vcpu,
> >>>> +                           const struct domain *d)
> >>>> +{
> >>>> +    mfn_t d_mfn, cd_mfn;
> >>>> +
> >>>> +    if ( !d_area->pg )
> >>>> +        return 0;
> >>>> +
> >>>> +    d_mfn = page_to_mfn(d_area->pg);
> >>>> +
> >>>> +    /* Allocate & map a page for the area if it hasn't been already.
> > */
> >>>> +    if ( !cd_area->pg )
> >>>> +    {
> >>>> +        gfn_t gfn = mfn_to_gfn(d, d_mfn);
> >>>> +        struct p2m_domain *p2m = p2m_get_hostp2m(cd_vcpu->domain);
> >>>> +        p2m_type_t p2mt;
> >>>> +        p2m_access_t p2ma;
> >>>> +        unsigned int offset;
> >>>> +        int ret;
> >>>> +
> >>>> +        cd_mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL,
> > NULL);
> >>>> +        if ( mfn_eq(cd_mfn, INVALID_MFN) )
> >>>> +        {
> >>>> +            struct page_info *pg =
alloc_domheap_page(cd_vcpu->domain,
> >>> 0);
> >>>> +
> >>>> +            if ( !pg )
> >>>> +                return -ENOMEM;
> >>>> +
> >>>> +            cd_mfn = page_to_mfn(pg);
> >>>> +            set_gpfn_from_mfn(mfn_x(cd_mfn), gfn_x(gfn));
> >>>> +
> >>>> +            ret = p2m->set_entry(p2m, gfn, cd_mfn, PAGE_ORDER_4K,
> >>> p2m_ram_rw,
> >>>> +                                 p2m->default_access, -1);
> >>>> +            if ( ret )
> >>>> +                return ret;
> >>>> +        }
> >>>> +        else if ( p2mt != p2m_ram_rw )
> >>>> +            return -EBUSY;
> >>>> +
> >>>> +        /*
> >>>> +         * Simply specify the entire range up to the end of the
page.
> >>> All the
> >>>> +         * function uses it for is a check for not crossing page
> >>> boundaries.
> >>>> +         */
> >>>> +        offset = PAGE_OFFSET(d_area->map);
> >>>> +        ret = map_guest_area(cd_vcpu, gfn_to_gaddr(gfn) + offset,
> >>>> +                             PAGE_SIZE - offset, cd_area, NULL);
> >>>> +        if ( ret )
> >>>> +            return ret;
> >>>> +    }
> >>>> +    else
> >>>> +        cd_mfn = page_to_mfn(cd_area->pg);
> >>>
> >>> Everything to this point seems to be non mem-sharing/forking related.
> > Could
> >>> these live somewhere else? There must be some other place where
> > allocating
> >>> these areas happens already for non-fork VMs so it would make sense to
> > just
> >>> refactor that code to be callable from here.
> >>
> >> It is the "copy" aspect with makes this mem-sharing (or really fork)
> >> specific. Plus in the end this is no different from what you have
> >> there right now for copying the vCPU info area. In the final patch
> >> that other code gets removed by re-using the code here.
> >
> > Yes, the copy part is fork-specific. Arguably if there was a way to do
the
> > allocation of the page for vcpu_info I would prefer that being
elsewhere,
> > but while the only requirement is allocate-page and copy from parent
I'm OK
> > with that logic being in here because it's really straight forward. But
now
> > you also do extra sanity checks here which are harder to comprehend in
this
> > context alone.
>
> What sanity checks are you talking about (also below, where you claim
> map_guest_area() would be used only to sanity check)?

Did I misread your comment above "All the function uses it for is a check
for not crossing page boundaries"? That sounds to me like a simple sanity
check, unclear why it matters though and why only for forks.

>
> > What if extra sanity checks will be needed in the future? Or
> > the sanity checks in the future diverge from where this happens for
normal
> > VMs because someone overlooks this needing to be synched here too?
> >
> >> I also haven't been able to spot anything that could be factored
> >> out (and one might expect that if there was something, then the vCPU
> >> info area copying should also already have used it). map_guest_area()
> >> is all that is used for other purposes as well.
> >
> > Well, there must be a location where all this happens for normal VMs as
> > well, no?
>
> That's map_guest_area(). What is needed here but not elsewhere is the
> populating of the GFN underlying the to-be-mapped area. That's the code
> being added here, mirroring what you need to do for the vCPU info page.
> Similar code isn't needed elsewhere because the guest invoked operation
> is purely a "map" - the underlying pages are already expected to be
> populated (which of course we check, or else we wouldn't know what page
> to actually map).

Populated by what and when?

>
> > Why not factor that code so that it can be called from here, so
> > that we don't have to track sanity check requirements in two different
> > locations? Or for normal VMs that sanity checking bit isn't required? If
> > so, why?
>
> As per above, I'm afraid that I'm lost with these questions. I simply
> don't know what you're talking about.

You are adding code here that allocates memory and copies the content of
similarly allocated memory from the parent. You perform extra sanity checks
for unknown reasons that seem to be only needed here. It is unclear why you
are doing that and why can't the same code-paths that allocate this memory
for the parent be simply reused so the only thing left to do here would be
to copy the content from the parent?

Tamas
Jan Beulich Jan. 26, 2023, 8:13 a.m. UTC | #6
On 25.01.2023 16:34, Tamas K Lengyel wrote:
> On Tue, Jan 24, 2023 at 6:19 AM Jan Beulich <jbeulich@suse.com> wrote:
>>
>> On 23.01.2023 19:32, Tamas K Lengyel wrote:
>>> On Mon, Jan 23, 2023 at 11:24 AM Jan Beulich <jbeulich@suse.com> wrote:
>>>> On 23.01.2023 17:09, Tamas K Lengyel wrote:
>>>>> On Mon, Jan 23, 2023 at 9:55 AM Jan Beulich <jbeulich@suse.com> wrote:
>>>>>> --- a/xen/arch/x86/mm/mem_sharing.c
>>>>>> +++ b/xen/arch/x86/mm/mem_sharing.c
>>>>>> @@ -1653,6 +1653,65 @@ static void copy_vcpu_nonreg_state(struc
>>>>>>      hvm_set_nonreg_state(cd_vcpu, &nrs);
>>>>>>  }
>>>>>>
>>>>>> +static int copy_guest_area(struct guest_area *cd_area,
>>>>>> +                           const struct guest_area *d_area,
>>>>>> +                           struct vcpu *cd_vcpu,
>>>>>> +                           const struct domain *d)
>>>>>> +{
>>>>>> +    mfn_t d_mfn, cd_mfn;
>>>>>> +
>>>>>> +    if ( !d_area->pg )
>>>>>> +        return 0;
>>>>>> +
>>>>>> +    d_mfn = page_to_mfn(d_area->pg);
>>>>>> +
>>>>>> +    /* Allocate & map a page for the area if it hasn't been already.
>>> */
>>>>>> +    if ( !cd_area->pg )
>>>>>> +    {
>>>>>> +        gfn_t gfn = mfn_to_gfn(d, d_mfn);
>>>>>> +        struct p2m_domain *p2m = p2m_get_hostp2m(cd_vcpu->domain);
>>>>>> +        p2m_type_t p2mt;
>>>>>> +        p2m_access_t p2ma;
>>>>>> +        unsigned int offset;
>>>>>> +        int ret;
>>>>>> +
>>>>>> +        cd_mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL,
>>> NULL);
>>>>>> +        if ( mfn_eq(cd_mfn, INVALID_MFN) )
>>>>>> +        {
>>>>>> +            struct page_info *pg =
> alloc_domheap_page(cd_vcpu->domain,
>>>>> 0);
>>>>>> +
>>>>>> +            if ( !pg )
>>>>>> +                return -ENOMEM;
>>>>>> +
>>>>>> +            cd_mfn = page_to_mfn(pg);
>>>>>> +            set_gpfn_from_mfn(mfn_x(cd_mfn), gfn_x(gfn));
>>>>>> +
>>>>>> +            ret = p2m->set_entry(p2m, gfn, cd_mfn, PAGE_ORDER_4K,
>>>>> p2m_ram_rw,
>>>>>> +                                 p2m->default_access, -1);
>>>>>> +            if ( ret )
>>>>>> +                return ret;
>>>>>> +        }
>>>>>> +        else if ( p2mt != p2m_ram_rw )
>>>>>> +            return -EBUSY;
>>>>>> +
>>>>>> +        /*
>>>>>> +         * Simply specify the entire range up to the end of the
> page.
>>>>> All the
>>>>>> +         * function uses it for is a check for not crossing page
>>>>> boundaries.
>>>>>> +         */
>>>>>> +        offset = PAGE_OFFSET(d_area->map);
>>>>>> +        ret = map_guest_area(cd_vcpu, gfn_to_gaddr(gfn) + offset,
>>>>>> +                             PAGE_SIZE - offset, cd_area, NULL);
>>>>>> +        if ( ret )
>>>>>> +            return ret;
>>>>>> +    }
>>>>>> +    else
>>>>>> +        cd_mfn = page_to_mfn(cd_area->pg);
>>>>>
>>>>> Everything to this point seems to be non mem-sharing/forking related.
>>> Could
>>>>> these live somewhere else? There must be some other place where
>>> allocating
>>>>> these areas happens already for non-fork VMs so it would make sense to
>>> just
>>>>> refactor that code to be callable from here.
>>>>
>>>> It is the "copy" aspect which makes this mem-sharing (or really fork)
>>>> specific. Plus in the end this is no different from what you have
>>>> there right now for copying the vCPU info area. In the final patch
>>>> that other code gets removed by re-using the code here.
>>>
>>> Yes, the copy part is fork-specific. Arguably if there was a way to do
> the
>>> allocation of the page for vcpu_info I would prefer that being
> elsewhere,
>>> but while the only requirement is allocate-page and copy from parent
> I'm OK
>>> with that logic being in here because it's really straight forward. But
> now
>>> you also do extra sanity checks here which are harder to comprehend in
> this
>>> context alone.
>>
>> What sanity checks are you talking about (also below, where you claim
>> map_guest_area() would be used only to sanity check)?
> 
> Did I misread your comment above "All the function uses it for is a check
> for not crossing page boundaries"? That sounds to me like a simple sanity
> check, unclear why it matters though and why only for forks.

The comment is about the function's use of the range it is being passed.
It doesn't say in any way that the function is doing only sanity checking.
If the comment wording is ambiguous or unclear, I'm happy to take
improvement suggestions.

>>> What if extra sanity checks will be needed in the future? Or
>>> the sanity checks in the future diverge from where this happens for
> normal
>>> VMs because someone overlooks this needing to be synched here too?
>>>
>>>> I also haven't been able to spot anything that could be factored
>>>> out (and one might expect that if there was something, then the vCPU
>>>> info area copying should also already have used it). map_guest_area()
>>>> is all that is used for other purposes as well.
>>>
>>> Well, there must be a location where all this happens for normal VMs as
>>> well, no?
>>
>> That's map_guest_area(). What is needed here but not elsewhere is the
>> populating of the GFN underlying the to-be-mapped area. That's the code
>> being added here, mirroring what you need to do for the vCPU info page.
>> Similar code isn't needed elsewhere because the guest invoked operation
>> is purely a "map" - the underlying pages are already expected to be
>> populated (which of course we check, or else we wouldn't know what page
>> to actually map).
> 
> Populated by what and when?

Population happens either at domain creation or when the guest is moving
pages around (e.g. during ballooning). What is happening here (also in
the pre-existing code to deal with the vCPU info page) is the minimal
amount of "populate at creation" to meet the prereq for the mapping
operation(s).

>>> Why not factor that code so that it can be called from here, so
>>> that we don't have to track sanity check requirements in two different
>>> locations? Or for normal VMs that sanity checking bit isn't required? If
>>> so, why?
>>
>> As per above, I'm afraid that I'm lost with these questions. I simply
>> don't know what you're talking about.
> 
> You are adding code here that allocates memory and copies the content of
> similarly allocated memory from the parent. You perform extra sanity checks
> for unknown reasons that seem to be only needed here. It is unclear why you
> are doing that and why can't the same code-paths that allocate this memory
> for the parent be simply reused so the only thing left to do here would be
> to copy the content from the parent?

No, I'm not "adding code" in the sense that I read your comments so far.
Such code was already there (and, as pointed out somewhere, in slightly
broken form). Yes, I'm introducing a 2nd instance of this, but just to
then (in the last patch) remove the original (slightly broken) instance.
So across the entire series this is merely code movement (with
adjustments).

Jan
Tamas K Lengyel Jan. 26, 2023, 3:41 p.m. UTC | #7
On Thu, Jan 26, 2023 at 3:14 AM Jan Beulich <jbeulich@suse.com> wrote:
>
> On 25.01.2023 16:34, Tamas K Lengyel wrote:
> > On Tue, Jan 24, 2023 at 6:19 AM Jan Beulich <jbeulich@suse.com> wrote:
> >>
> >> On 23.01.2023 19:32, Tamas K Lengyel wrote:
> >>> On Mon, Jan 23, 2023 at 11:24 AM Jan Beulich <jbeulich@suse.com>
wrote:
> >>>> On 23.01.2023 17:09, Tamas K Lengyel wrote:
> >>>>> On Mon, Jan 23, 2023 at 9:55 AM Jan Beulich <jbeulich@suse.com>
wrote:
> >>>>>> --- a/xen/arch/x86/mm/mem_sharing.c
> >>>>>> +++ b/xen/arch/x86/mm/mem_sharing.c
> >>>>>> @@ -1653,6 +1653,65 @@ static void copy_vcpu_nonreg_state(struc
> >>>>>>      hvm_set_nonreg_state(cd_vcpu, &nrs);
> >>>>>>  }
> >>>>>>
> >>>>>> +static int copy_guest_area(struct guest_area *cd_area,
> >>>>>> +                           const struct guest_area *d_area,
> >>>>>> +                           struct vcpu *cd_vcpu,
> >>>>>> +                           const struct domain *d)
> >>>>>> +{
> >>>>>> +    mfn_t d_mfn, cd_mfn;
> >>>>>> +
> >>>>>> +    if ( !d_area->pg )
> >>>>>> +        return 0;
> >>>>>> +
> >>>>>> +    d_mfn = page_to_mfn(d_area->pg);
> >>>>>> +
> >>>>>> +    /* Allocate & map a page for the area if it hasn't been
already.
> >>> */
> >>>>>> +    if ( !cd_area->pg )
> >>>>>> +    {
> >>>>>> +        gfn_t gfn = mfn_to_gfn(d, d_mfn);
> >>>>>> +        struct p2m_domain *p2m = p2m_get_hostp2m(cd_vcpu->domain);
> >>>>>> +        p2m_type_t p2mt;
> >>>>>> +        p2m_access_t p2ma;
> >>>>>> +        unsigned int offset;
> >>>>>> +        int ret;
> >>>>>> +
> >>>>>> +        cd_mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL,
> >>> NULL);
> >>>>>> +        if ( mfn_eq(cd_mfn, INVALID_MFN) )
> >>>>>> +        {
> >>>>>> +            struct page_info *pg =
> > alloc_domheap_page(cd_vcpu->domain,
> >>>>> 0);
> >>>>>> +
> >>>>>> +            if ( !pg )
> >>>>>> +                return -ENOMEM;
> >>>>>> +
> >>>>>> +            cd_mfn = page_to_mfn(pg);
> >>>>>> +            set_gpfn_from_mfn(mfn_x(cd_mfn), gfn_x(gfn));
> >>>>>> +
> >>>>>> +            ret = p2m->set_entry(p2m, gfn, cd_mfn, PAGE_ORDER_4K,
> >>>>> p2m_ram_rw,
> >>>>>> +                                 p2m->default_access, -1);
> >>>>>> +            if ( ret )
> >>>>>> +                return ret;
> >>>>>> +        }
> >>>>>> +        else if ( p2mt != p2m_ram_rw )
> >>>>>> +            return -EBUSY;
> >>>>>> +
> >>>>>> +        /*
> >>>>>> +         * Simply specify the entire range up to the end of the
> > page.
> >>>>> All the
> >>>>>> +         * function uses it for is a check for not crossing page
> >>>>> boundaries.
> >>>>>> +         */
> >>>>>> +        offset = PAGE_OFFSET(d_area->map);
> >>>>>> +        ret = map_guest_area(cd_vcpu, gfn_to_gaddr(gfn) + offset,
> >>>>>> +                             PAGE_SIZE - offset, cd_area, NULL);
> >>>>>> +        if ( ret )
> >>>>>> +            return ret;
> >>>>>> +    }
> >>>>>> +    else
> >>>>>> +        cd_mfn = page_to_mfn(cd_area->pg);
> >>>>>
> >>>>> Everything to this point seems to be non mem-sharing/forking
related.
> >>> Could
> >>>>> these live somewhere else? There must be some other place where
> >>> allocating
> >>>>> these areas happens already for non-fork VMs so it would make sense
to
> >>> just
> >>>>> refactor that code to be callable from here.
> >>>>
> >>>> It is the "copy" aspect which makes this mem-sharing (or really fork)
> >>>> specific. Plus in the end this is no different from what you have
> >>>> there right now for copying the vCPU info area. In the final patch
> >>>> that other code gets removed by re-using the code here.
> >>>
> >>> Yes, the copy part is fork-specific. Arguably if there was a way to do
> > the
> >>> allocation of the page for vcpu_info I would prefer that being
> > elsewhere,
> >>> but while the only requirement is allocate-page and copy from parent
> > I'm OK
> >>> with that logic being in here because it's really straight forward.
But
> > now
> >>> you also do extra sanity checks here which are harder to comprehend in
> > this
> >>> context alone.
> >>
> >> What sanity checks are you talking about (also below, where you claim
> >> map_guest_area() would be used only to sanity check)?
> >
> > Did I misread your comment above "All the function uses it for is a
check
> > for not crossing page boundaries"? That sounds to me like a simple
sanity
> > check, unclear why it matters though and why only for forks.
>
> The comment is about the function's use of the range it is being passed.
> It doesn't say in any way that the function is doing only sanity checking.
> If the comment wording is ambiguous or unclear, I'm happy to take
> improvement suggestions.

Yes, please do, it definitely was confusing while reviewing the patch.

Thanks,
Tamas
Jan Beulich Jan. 26, 2023, 4:48 p.m. UTC | #8
On 26.01.2023 16:41, Tamas K Lengyel wrote:
> On Thu, Jan 26, 2023 at 3:14 AM Jan Beulich <jbeulich@suse.com> wrote:
>>
>> On 25.01.2023 16:34, Tamas K Lengyel wrote:
>>> On Tue, Jan 24, 2023 at 6:19 AM Jan Beulich <jbeulich@suse.com> wrote:
>>>>
>>>> On 23.01.2023 19:32, Tamas K Lengyel wrote:
>>>>> On Mon, Jan 23, 2023 at 11:24 AM Jan Beulich <jbeulich@suse.com>
> wrote:
>>>>>> On 23.01.2023 17:09, Tamas K Lengyel wrote:
>>>>>>> On Mon, Jan 23, 2023 at 9:55 AM Jan Beulich <jbeulich@suse.com>
> wrote:
>>>>>>>> --- a/xen/arch/x86/mm/mem_sharing.c
>>>>>>>> +++ b/xen/arch/x86/mm/mem_sharing.c
>>>>>>>> @@ -1653,6 +1653,65 @@ static void copy_vcpu_nonreg_state(struc
>>>>>>>>      hvm_set_nonreg_state(cd_vcpu, &nrs);
>>>>>>>>  }
>>>>>>>>
>>>>>>>> +static int copy_guest_area(struct guest_area *cd_area,
>>>>>>>> +                           const struct guest_area *d_area,
>>>>>>>> +                           struct vcpu *cd_vcpu,
>>>>>>>> +                           const struct domain *d)
>>>>>>>> +{
>>>>>>>> +    mfn_t d_mfn, cd_mfn;
>>>>>>>> +
>>>>>>>> +    if ( !d_area->pg )
>>>>>>>> +        return 0;
>>>>>>>> +
>>>>>>>> +    d_mfn = page_to_mfn(d_area->pg);
>>>>>>>> +
>>>>>>>> +    /* Allocate & map a page for the area if it hasn't been
> already.
>>>>> */
>>>>>>>> +    if ( !cd_area->pg )
>>>>>>>> +    {
>>>>>>>> +        gfn_t gfn = mfn_to_gfn(d, d_mfn);
>>>>>>>> +        struct p2m_domain *p2m = p2m_get_hostp2m(cd_vcpu->domain);
>>>>>>>> +        p2m_type_t p2mt;
>>>>>>>> +        p2m_access_t p2ma;
>>>>>>>> +        unsigned int offset;
>>>>>>>> +        int ret;
>>>>>>>> +
>>>>>>>> +        cd_mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL,
>>>>> NULL);
>>>>>>>> +        if ( mfn_eq(cd_mfn, INVALID_MFN) )
>>>>>>>> +        {
>>>>>>>> +            struct page_info *pg =
>>> alloc_domheap_page(cd_vcpu->domain,
>>>>>>> 0);
>>>>>>>> +
>>>>>>>> +            if ( !pg )
>>>>>>>> +                return -ENOMEM;
>>>>>>>> +
>>>>>>>> +            cd_mfn = page_to_mfn(pg);
>>>>>>>> +            set_gpfn_from_mfn(mfn_x(cd_mfn), gfn_x(gfn));
>>>>>>>> +
>>>>>>>> +            ret = p2m->set_entry(p2m, gfn, cd_mfn, PAGE_ORDER_4K,
>>>>>>> p2m_ram_rw,
>>>>>>>> +                                 p2m->default_access, -1);
>>>>>>>> +            if ( ret )
>>>>>>>> +                return ret;
>>>>>>>> +        }
>>>>>>>> +        else if ( p2mt != p2m_ram_rw )
>>>>>>>> +            return -EBUSY;
>>>>>>>> +
>>>>>>>> +        /*
>>>>>>>> +         * Simply specify the entire range up to the end of the
>>> page.
>>>>>>> All the
>>>>>>>> +         * function uses it for is a check for not crossing page
>>>>>>> boundaries.
>>>>>>>> +         */
>>>>>>>> +        offset = PAGE_OFFSET(d_area->map);
>>>>>>>> +        ret = map_guest_area(cd_vcpu, gfn_to_gaddr(gfn) + offset,
>>>>>>>> +                             PAGE_SIZE - offset, cd_area, NULL);
>>>>>>>> +        if ( ret )
>>>>>>>> +            return ret;
>>>>>>>> +    }
>>>>>>>> +    else
>>>>>>>> +        cd_mfn = page_to_mfn(cd_area->pg);
>>>>>>>
>>>>>>> Everything to this point seems to be non mem-sharing/forking
> related.
>>>>> Could
>>>>>>> these live somewhere else? There must be some other place where
>>>>> allocating
>>>>>>> these areas happens already for non-fork VMs so it would make sense
> to
>>>>> just
>>>>>>> refactor that code to be callable from here.
>>>>>>
>>>>>> It is the "copy" aspect which makes this mem-sharing (or really fork)
>>>>>> specific. Plus in the end this is no different from what you have
>>>>>> there right now for copying the vCPU info area. In the final patch
>>>>>> that other code gets removed by re-using the code here.
>>>>>
>>>>> Yes, the copy part is fork-specific. Arguably if there was a way to do
>>> the
>>>>> allocation of the page for vcpu_info I would prefer that being
>>> elsewhere,
>>>>> but while the only requirement is allocate-page and copy from parent
>>> I'm OK
>>>>> with that logic being in here because it's really straight forward.
> But
>>> now
>>>>> you also do extra sanity checks here which are harder to comprehend in
>>> this
>>>>> context alone.
>>>>
>>>> What sanity checks are you talking about (also below, where you claim
>>>> map_guest_area() would be used only to sanity check)?
>>>
>>> Did I misread your comment above "All the function uses it for is a
> check
>>> for not crossing page boundaries"? That sounds to me like a simple
> sanity
>>> check, unclear why it matters though and why only for forks.
>>
>> The comment is about the function's use of the range it is being passed.
>> It doesn't say in any way that the function is doing only sanity checking.
>> If the comment wording is ambiguous or unclear, I'm happy to take
>> improvement suggestions.
> 
> Yes, please do, it definitely was confusing while reviewing the patch.

I'm sorry, but what does "please do" mean when I asked for improvement
suggestions? I continue to think the comment is quite clear as is, so
if anything needs adjusting, I'd need to know pretty precisely what it
is that needs adding and/or re-writing. I can't, after all, guess what
your misunderstanding resulted from.

Jan
Tamas K Lengyel Jan. 26, 2023, 5:24 p.m. UTC | #9
On Thu, Jan 26, 2023 at 11:48 AM Jan Beulich <jbeulich@suse.com> wrote:
>
> On 26.01.2023 16:41, Tamas K Lengyel wrote:
> > On Thu, Jan 26, 2023 at 3:14 AM Jan Beulich <jbeulich@suse.com> wrote:
> >>
> >> On 25.01.2023 16:34, Tamas K Lengyel wrote:
> >>> On Tue, Jan 24, 2023 at 6:19 AM Jan Beulich <jbeulich@suse.com> wrote:
> >>>>
> >>>> On 23.01.2023 19:32, Tamas K Lengyel wrote:
> >>>>> On Mon, Jan 23, 2023 at 11:24 AM Jan Beulich <jbeulich@suse.com>
> > wrote:
> >>>>>> On 23.01.2023 17:09, Tamas K Lengyel wrote:
> >>>>>>> On Mon, Jan 23, 2023 at 9:55 AM Jan Beulich <jbeulich@suse.com>
> > wrote:
> >>>>>>>> --- a/xen/arch/x86/mm/mem_sharing.c
> >>>>>>>> +++ b/xen/arch/x86/mm/mem_sharing.c
> >>>>>>>> @@ -1653,6 +1653,65 @@ static void copy_vcpu_nonreg_state(struc
> >>>>>>>>      hvm_set_nonreg_state(cd_vcpu, &nrs);
> >>>>>>>>  }
> >>>>>>>>
> >>>>>>>> +static int copy_guest_area(struct guest_area *cd_area,
> >>>>>>>> +                           const struct guest_area *d_area,
> >>>>>>>> +                           struct vcpu *cd_vcpu,
> >>>>>>>> +                           const struct domain *d)
> >>>>>>>> +{
> >>>>>>>> +    mfn_t d_mfn, cd_mfn;
> >>>>>>>> +
> >>>>>>>> +    if ( !d_area->pg )
> >>>>>>>> +        return 0;
> >>>>>>>> +
> >>>>>>>> +    d_mfn = page_to_mfn(d_area->pg);
> >>>>>>>> +
> >>>>>>>> +    /* Allocate & map a page for the area if it hasn't been
> > already.
> >>>>> */
> >>>>>>>> +    if ( !cd_area->pg )
> >>>>>>>> +    {
> >>>>>>>> +        gfn_t gfn = mfn_to_gfn(d, d_mfn);
> >>>>>>>> +        struct p2m_domain *p2m =
p2m_get_hostp2m(cd_vcpu->domain);
> >>>>>>>> +        p2m_type_t p2mt;
> >>>>>>>> +        p2m_access_t p2ma;
> >>>>>>>> +        unsigned int offset;
> >>>>>>>> +        int ret;
> >>>>>>>> +
> >>>>>>>> +        cd_mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL,
> >>>>> NULL);
> >>>>>>>> +        if ( mfn_eq(cd_mfn, INVALID_MFN) )
> >>>>>>>> +        {
> >>>>>>>> +            struct page_info *pg =
> >>> alloc_domheap_page(cd_vcpu->domain,
> >>>>>>> 0);
> >>>>>>>> +
> >>>>>>>> +            if ( !pg )
> >>>>>>>> +                return -ENOMEM;
> >>>>>>>> +
> >>>>>>>> +            cd_mfn = page_to_mfn(pg);
> >>>>>>>> +            set_gpfn_from_mfn(mfn_x(cd_mfn), gfn_x(gfn));
> >>>>>>>> +
> >>>>>>>> +            ret = p2m->set_entry(p2m, gfn, cd_mfn,
PAGE_ORDER_4K,
> >>>>>>> p2m_ram_rw,
> >>>>>>>> +                                 p2m->default_access, -1);
> >>>>>>>> +            if ( ret )
> >>>>>>>> +                return ret;
> >>>>>>>> +        }
> >>>>>>>> +        else if ( p2mt != p2m_ram_rw )
> >>>>>>>> +            return -EBUSY;
> >>>>>>>> +
> >>>>>>>> +        /*
> >>>>>>>> +         * Simply specify the entire range up to the end of the
> >>> page.
> >>>>>>> All the
> >>>>>>>> +         * function uses it for is a check for not crossing page
> >>>>>>> boundaries.
> >>>>>>>> +         */
> >>>>>>>> +        offset = PAGE_OFFSET(d_area->map);
> >>>>>>>> +        ret = map_guest_area(cd_vcpu, gfn_to_gaddr(gfn) +
offset,
> >>>>>>>> +                             PAGE_SIZE - offset, cd_area, NULL);
> >>>>>>>> +        if ( ret )
> >>>>>>>> +            return ret;
> >>>>>>>> +    }
> >>>>>>>> +    else
> >>>>>>>> +        cd_mfn = page_to_mfn(cd_area->pg);
> >>>>>>>
> >>>>>>> Everything to this point seems to be non mem-sharing/forking
> > related.
> >>>>> Could
> >>>>>>> these live somewhere else? There must be some other place where
> >>>>> allocating
> >>>>>>> these areas happens already for non-fork VMs so it would make
sense
> > to
> >>>>> just
> >>>>>>> refactor that code to be callable from here.
> >>>>>>
> >>>>>> It is the "copy" aspect which makes this mem-sharing (or really
fork)
> >>>>>> specific. Plus in the end this is no different from what you have
> >>>>>> there right now for copying the vCPU info area. In the final patch
> >>>>>> that other code gets removed by re-using the code here.
> >>>>>
> >>>>> Yes, the copy part is fork-specific. Arguably if there was a way to
do
> >>> the
> >>>>> allocation of the page for vcpu_info I would prefer that being
> >>> elsewhere,
> >>>>> but while the only requirement is allocate-page and copy from parent
> >>> I'm OK
> >>>>> with that logic being in here because it's really straight forward.
> > But
> >>> now
> >>>>> you also do extra sanity checks here which are harder to comprehend
in
> >>> this
> >>>>> context alone.
> >>>>
> >>>> What sanity checks are you talking about (also below, where you claim
> >>>> map_guest_area() would be used only to sanity check)?
> >>>
> >>> Did I misread your comment above "All the function uses it for is a
> > check
> >>> for not crossing page boundaries"? That sounds to me like a simple
> > sanity
> >>> check, unclear why it matters though and why only for forks.
> >>
> >> The comment is about the function's use of the range it is being
passed.
> >> It doesn't say in any way that the function is doing only sanity
checking.
> >> If the comment wording is ambiguous or unclear, I'm happy to take
> >> improvement suggestions.
> >
> > Yes, please do, it definitely was confusing while reviewing the patch.
>
> I'm sorry, but what does "please do" mean when I asked for improvement
> suggestions? I continue to think the comment is quite clear as is, so
> if anything needs adjusting, I'd need to know pretty precisely what it
> is that needs adding and/or re-writing. I can't, after all, guess what
> your misunderstanding resulted from.

I meant please do add the extra information you just spelled out in your
previous email. "Map the page into the guest. We pass the entire range to
map_guest_area so that it can check that no cross-page mapping is taking
place (in which case it will fail). If no such issue is present the page is
being mapped and made accessible to the guest."

If that's not what the function is doing, please explain clearly what it
will do.

Tamas
diff mbox series

Patch

--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -1653,6 +1653,65 @@  static void copy_vcpu_nonreg_state(struc
     hvm_set_nonreg_state(cd_vcpu, &nrs);
 }
 
+static int copy_guest_area(struct guest_area *cd_area,
+                           const struct guest_area *d_area,
+                           struct vcpu *cd_vcpu,
+                           const struct domain *d)
+{
+    mfn_t d_mfn, cd_mfn;
+
+    if ( !d_area->pg )
+        return 0;
+
+    d_mfn = page_to_mfn(d_area->pg);
+
+    /* Allocate & map a page for the area if it hasn't been already. */
+    if ( !cd_area->pg )
+    {
+        gfn_t gfn = mfn_to_gfn(d, d_mfn);
+        struct p2m_domain *p2m = p2m_get_hostp2m(cd_vcpu->domain);
+        p2m_type_t p2mt;
+        p2m_access_t p2ma;
+        unsigned int offset;
+        int ret;
+
+        cd_mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL, NULL);
+        if ( mfn_eq(cd_mfn, INVALID_MFN) )
+        {
+            struct page_info *pg = alloc_domheap_page(cd_vcpu->domain, 0);
+
+            if ( !pg )
+                return -ENOMEM;
+
+            cd_mfn = page_to_mfn(pg);
+            set_gpfn_from_mfn(mfn_x(cd_mfn), gfn_x(gfn));
+
+            ret = p2m->set_entry(p2m, gfn, cd_mfn, PAGE_ORDER_4K, p2m_ram_rw,
+                                 p2m->default_access, -1);
+            if ( ret )
+                return ret;
+        }
+        else if ( p2mt != p2m_ram_rw )
+            return -EBUSY;
+
+        /*
+         * Map the page into the guest. Pass the entire remainder of the
+         * page: map_guest_area() uses the size only to check that the area
+         * does not cross a page boundary (and fails if it would).
+         */
+        offset = PAGE_OFFSET(d_area->map);
+        ret = map_guest_area(cd_vcpu, gfn_to_gaddr(gfn) + offset,
+                             PAGE_SIZE - offset, cd_area, NULL);
+        if ( ret )
+            return ret;
+    }
+    else
+        cd_mfn = page_to_mfn(cd_area->pg);
+
+    copy_domain_page(cd_mfn, d_mfn);
+
+    return 0;
+}
+
 static int copy_vpmu(struct vcpu *d_vcpu, struct vcpu *cd_vcpu)
 {
     struct vpmu_struct *d_vpmu = vcpu_vpmu(d_vcpu);
@@ -1745,6 +1804,16 @@  static int copy_vcpu_settings(struct dom
             copy_domain_page(new_vcpu_info_mfn, vcpu_info_mfn);
         }
 
+        /* Same for the (physically registered) runstate and time info areas. */
+        ret = copy_guest_area(&cd_vcpu->runstate_guest_area,
+                              &d_vcpu->runstate_guest_area, cd_vcpu, d);
+        if ( ret )
+            return ret;
+        ret = copy_guest_area(&cd_vcpu->arch.time_guest_area,
+                              &d_vcpu->arch.time_guest_area, cd_vcpu, d);
+        if ( ret )
+            return ret;
+
         ret = copy_vpmu(d_vcpu, cd_vcpu);
         if ( ret )
             return ret;
@@ -1987,7 +2056,10 @@  int mem_sharing_fork_reset(struct domain
 
  state:
     if ( reset_state )
+    {
         rc = copy_settings(d, pd);
+        /* TBD: What to do here with -ERESTART? */
+    }
 
     domain_unpause(d);
 
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -1572,6 +1572,13 @@  void unmap_vcpu_info(struct vcpu *v)
     put_page_and_type(mfn_to_page(mfn));
 }
 
+int map_guest_area(struct vcpu *v, paddr_t gaddr, unsigned int size,
+                   struct guest_area *area,
+                   void (*populate)(void *dst, struct vcpu *v))
+{
+    return -EOPNOTSUPP;
+}
+
 /*
  * This is only intended to be used for domain cleanup (or more generally only
  * with at least the respective vCPU, if it's not the current one, reliably