
[v2] dm_op: Add xendevicemodel_modified_memory_bulk.

Message ID 1490104791-32638-1-git-send-email-jennifer.herbert@citrix.com (mailing list archive)
State New, archived

Commit Message

Jennifer Herbert March 21, 2017, 1:59 p.m. UTC
From: Jennifer Herbert <Jennifer.Herbert@citrix.com>

This new devicemodel library call allows multiple extents of pages to be
marked as modified in a single call.  This is needed for a use case
I'm working on.

The Xen side of the modified_memory call has been modified to accept
an array of extents.  The devicemodel library either provides an array
of length 1, to support the original library function, or allows an
array to be provided via a second function.

Signed-off-by: Jennifer Herbert <Jennifer.Herbert@citrix.com>
---
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Ian Jackson <ian.jackson@eu.citrix.com>
Cc: Wei Liu <wei.liu2@citrix.com>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Cc: Paul Durrant <paul.durrant@citrix.com>
---

This version of the patch removes the need for the 'offset' parameter
by instead reducing nr_extents and working backwards from the end of
the array.

This patch also removes the need to ever write the passed array of
extents back to the guest, instead recording partial progress in a
'pfns_processed' parameter, which replaces the old 'offset' parameter.
As with the 'offset' parameter, 'pfns_processed' should be set to 0 on
calling.
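
For illustration, a device model using the new bulk call might mark two
discontiguous ranges dirty as below (a minimal sketch - the pfn values and
the assumption that xendevicemodel.h exposes the dm_op.h extent type are
illustrative only):

#include <xendevicemodel.h>

/* Minimal sketch: mark two discontiguous pfn ranges as modified with a
 * single hypercall.  The ranges used here are made up. */
static int mark_ranges_dirty(xendevicemodel_handle *dmod, domid_t domid)
{
    struct xen_dm_op_modified_memory_extent extents[2];

    extents[0].first_pfn = 0x100000;   /* first contiguous range */
    extents[0].nr        = 16;
    extents[0].pad       = 0;

    extents[1].first_pfn = 0x200000;   /* second, separate range */
    extents[1].nr        = 4;
    extents[1].pad       = 0;

    return xendevicemodel_modified_memory_bulk(dmod, domid, extents, 2);
}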

 tools/libs/devicemodel/core.c                   |   30 +++++--
 tools/libs/devicemodel/include/xendevicemodel.h |   19 +++-
 xen/arch/x86/hvm/dm.c                           |  110 ++++++++++++++---------
 xen/include/public/hvm/dm_op.h                  |   17 +++-
 4 files changed, 122 insertions(+), 54 deletions(-)

Comments

Jan Beulich March 22, 2017, 10:33 a.m. UTC | #1
>>> On 21.03.17 at 14:59, <jennifer.herbert@citrix.com> wrote:
> This version of this patch removes the need for the 'offset' parameter,
> by instead reducing nr_extents, and working backwards from the end of
> the array.
> 
> This patch also removes the need to ever write back the passed array of
> extents to the guest, but instead putting its partial progress in a 
> 'pfns_processed' parameter, which replaces the old 'offset' parameter.
> As with the 'offest' parameter, 'pfns_processed' should be set to 0 on 
> calling.

So how is the need for 'pfns_processed' better than the prior need
for 'offset'? I continue to not see why you'd need any extra field, as
you can write back to 'first_pfn' and 'nr' just like the old code did. (But
note that I'm not outright objecting to this solution, I'd first like to
understand why it was chosen.)

> --- a/xen/arch/x86/hvm/dm.c
> +++ b/xen/arch/x86/hvm/dm.c
> @@ -119,56 +119,89 @@ static int set_isa_irq_level(struct domain *d, uint8_t isa_irq,
>  }
>  
>  static int modified_memory(struct domain *d,
> -                           struct xen_dm_op_modified_memory *data)
> +                           struct xen_dm_op_modified_memory *header,
> +                           xen_dm_op_buf_t* buf)
>  {
> -    xen_pfn_t last_pfn = data->first_pfn + data->nr - 1;
> -    unsigned int iter = 0;
> +    /* Process maximum of 256 pfns before checking for continuation */
> +    const unsigned int cont_check_interval = 0xff;

Iirc the question was asked on v1 already: 256 (as the comment
says) or 255 (as the value enforces)?

>      int rc = 0;
> -
> -    if ( (data->first_pfn > last_pfn) ||
> -         (last_pfn > domain_get_maximum_gpfn(d)) )
> -        return -EINVAL;
> +    unsigned int rem_extents =  header->nr_extents;
> +    unsigned int batch_rem_pfns = cont_check_interval;
>  
>      if ( !paging_mode_log_dirty(d) )
>          return 0;
>  
> -    while ( iter < data->nr )
> +    if ( (buf->size / sizeof(struct xen_dm_op_modified_memory_extent)) <
> +         header->nr_extents )
> +        return -EINVAL;
> +
> +    while ( rem_extents > 0)
>      {
> -        unsigned long pfn = data->first_pfn + iter;
> -        struct page_info *page;
> +        struct xen_dm_op_modified_memory_extent extent;
> +        unsigned int batch_nr;
> +        xen_pfn_t pfn;
> +        xen_pfn_t end_pfn;
>  
> -        page = get_page_from_gfn(d, pfn, NULL, P2M_UNSHARE);
> -        if ( page )
> +        if ( copy_from_guest_offset(&extent, buf->h, rem_extents - 1, 1) )
> +            return -EFAULT;
> +
> +        pfn = extent.first_pfn + header->pfns_processed;
> +        batch_nr = extent.nr - header->pfns_processed;

Even if this looks to be harmless to the hypervisor, I dislike the
overflows you allow for here.

> @@ -441,13 +474,8 @@ static int dm_op(domid_t domid,
>          struct xen_dm_op_modified_memory *data =
>              &op.u.modified_memory;
>  
> -        const_op = false;
> -
> -        rc = -EINVAL;
> -        if ( data->pad )
> -            break;

Where did this check go?

> --- a/xen/include/public/hvm/dm_op.h
> +++ b/xen/include/public/hvm/dm_op.h
> @@ -237,13 +237,24 @@ struct xen_dm_op_set_pci_link_route {
>   * XEN_DMOP_modified_memory: Notify that a set of pages were modified by
>   *                           an emulator.
>   *
> - * NOTE: In the event of a continuation, the @first_pfn is set to the
> - *       value of the pfn of the remaining set of pages and @nr reduced
> - *       to the size of the remaining set.
> + * DMOP buf 1 contains an array of xen_dm_op_modified_memory_extent with
> + * @nr_extent entries.  The value of  @nr_extent will be reduced on
> + *  continuation.
> + *
> + * @pfns_processed is used for continuation, and not intended to be usefull
> + * to the caller.  It gives the number of pfns already processed (and can
> + * be skipped) in the last extent. This should always be set to zero.
>   */
>  #define XEN_DMOP_modified_memory 11
>  
>  struct xen_dm_op_modified_memory {
> +    /* IN - number of extents. */
> +    uint32_t nr_extents;
> +    /* IN - should be set to 0, and is updated on continuation. */
> +    uint32_t pfns_processed;

I'd prefer if the field (if to be kept) wasn't named after its current
purpose, nor should we state anything beyond it needing to be
zero when first invoking the call. These are implementation details
which callers should not rely on.

Jan
Jennifer Herbert March 22, 2017, 7:55 p.m. UTC | #2
On 22/03/17 10:33, Jan Beulich wrote:
>>>> On 21.03.17 at 14:59, <jennifer.herbert@citrix.com> wrote:
>> This version of this patch removes the need for the 'offset' parameter,
>> by instead reducing nr_extents, and working backwards from the end of
>> the array.
>>
>> This patch also removes the need to ever write back the passed array of
>> extents to the guest, but instead putting its partial progress in a
>> 'pfns_processed' parameter, which replaces the old 'offset' parameter.
>> As with the 'offest' parameter, 'pfns_processed' should be set to 0 on
>> calling.
> So how is the need for 'pfns_processed' better than the prior need
> for 'offset'? I continue to not see why you'd need any extra field, as
> you can write back to 'first_pfn' and 'nr' just like the old code did. (But
> note that I'm not outright objecting to this solution, I'd first like to
> understand why it was chosen.)
>

With v1 of the patch, on continuation, two buffers get written back to
the guest: one to update 'offset', and one to update first_pfn (in the
array).  Although I can't think of an example where this would be a
problem, I think that semi-randomly altering items in the passed array
may not be expected, and if that data were re-used for something, it
could cause a bug.

There is precedent for changing the op buffer, and using pfns_processed
means we never have to change this array, which I think is much cleaner
and more intuitive, and halves the copy-backs.

I considered your suggestion of modifying the handle array, but I think
this would make the code much harder to follow, and it would still
require data to be written back to the guest - just in a different
place.  I thought just reducing nr_extents was the cleaner solution.

I can't see the problem with having an argument for continuation - the
xen_dm_op will remain the same size whether we use the argument space or
not.  If it's just about how the API looks, I find this highly
subjective - it's no secret that continuations happen, and data gets
stashed in these parameters - and so I see no reason why having a
dedicated parameter for this is a problem.

If we want to hide this continuation information, we could define:

struct xen_dm_op_modified_memory {
     /* IN - number of extents. */
     uint32_t nr_extents;
};

struct xen_dm_op_modified_memory_expanded {
     struct xen_dm_op_modified_memory modified_memory;
     uint32_t pfns_processed;
};

and include both structures in the xen_dm_op union.
The caller would use struct xen_dm_op_modified_memory, and due to the
memset(&op, 0, sizeof(op)) the library already does, the value for
pfns_processed would end up as zero.  Xen could use the expanded
structure, and make use of pfns_processed.
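
A minimal stand-alone sketch of how that two-view idea would behave is
below - the stand-in structs and the cut-down xen_dm_op are only
illustrative of the proposal, not the real header:

#include <assert.h>
#include <stdint.h>
#include <string.h>

struct xen_dm_op_modified_memory {
    uint32_t nr_extents;            /* IN - number of extents */
};

struct xen_dm_op_modified_memory_expanded {
    struct xen_dm_op_modified_memory modified_memory;
    uint32_t pfns_processed;        /* continuation state, hidden from callers */
};

/* Cut-down stand-in for xen_dm_op; only the members relevant here. */
struct xen_dm_op {
    uint32_t op;
    union {
        struct xen_dm_op_modified_memory modified_memory;
        struct xen_dm_op_modified_memory_expanded modified_memory_expanded;
    } u;
};

int main(void)
{
    struct xen_dm_op op;

    memset(&op, 0, sizeof(op));             /* what the library already does */
    op.u.modified_memory.nr_extents = 3;    /* caller only uses the short view */

    /* Xen's expanded view of the same storage sees pfns_processed == 0. */
    assert(op.u.modified_memory_expanded.pfns_processed == 0);
    return 0;
}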

Or, we could re-purpose the 'pad' member of struct xen_dm_op, call it
continuation_data, and have this as a general-purpose continuation
variable that the caller wouldn't pay any attention to.

What do you think?

>> --- a/xen/arch/x86/hvm/dm.c
>> +++ b/xen/arch/x86/hvm/dm.c
>> @@ -119,56 +119,89 @@ static int set_isa_irq_level(struct domain *d, uint8_t isa_irq,
>>   }
>>   
>>   static int modified_memory(struct domain *d,
>> -                           struct xen_dm_op_modified_memory *data)
>> +                           struct xen_dm_op_modified_memory *header,
>> +                           xen_dm_op_buf_t* buf)
>>   {
>> -    xen_pfn_t last_pfn = data->first_pfn + data->nr - 1;
>> -    unsigned int iter = 0;
>> +    /* Process maximum of 256 pfns before checking for continuation */
>> +    const unsigned int cont_check_interval = 0xff;
> Iirc the question was asked on v1 already: 256 (as the comment
> says) or 255 (as the value enforces)?

Aw shucks!  I had it as 255, and Paul said "255 or 256?" and I changed
it to 256 without thinking, assuming an off-by-one error and that he
was right.  But, with a second's thought, it should be 255, and Paul was
actually referring to a comment later in the code, which was 256.

>>       int rc = 0;
>>       if ( copy_from_guest_offset(&extent, buf->h, rem_extents - 1, 1) )
>> +            return -EFAULT;
>> +
>> +        pfn = extent.first_pfn + header->pfns_processed;
>> +        batch_nr = extent.nr - header->pfns_processed;
> Even if this looks to be harmless to the hypervisor, I dislike the
> overflows you allow for here.

I'll add a check for (header->pfns_processed > extent.nr), returning 
-EINVAL.

>> @@ -441,13 +474,8 @@ static int dm_op(domid_t domid,
>>           struct xen_dm_op_modified_memory *data =
>>               &op.u.modified_memory;
>>   
>> -        const_op = false;
>> -
>> -        rc = -EINVAL;
>> -        if ( data->pad )
>> -            break;
> Where did this check go?

data->pad is undefined here.  But I could add a check for extent.pad in 
the modified_memory function.
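
For concreteness, both checks might sit just after the extent is copied
in, roughly like this (a sketch only - whether the range check wants > or
>= is settled later in the thread):

        if ( copy_from_guest_offset(&extent, buf->h, rem_extents - 1, 1) )
            return -EFAULT;

        /* Reject unexpected pad bits and an out-of-range continuation value. */
        if ( extent.pad || (header->pfns_processed > extent.nr) )
            return -EINVAL;

        pfn = extent.first_pfn + header->pfns_processed;
        batch_nr = extent.nr - header->pfns_processed;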

>> --- a/xen/include/public/hvm/dm_op.h
>> +++ b/xen/include/public/hvm/dm_op.h
>> @@ -237,13 +237,24 @@ struct xen_dm_op_set_pci_link_route {
>>    * XEN_DMOP_modified_memory: Notify that a set of pages were modified by
>>    *                           an emulator.
>>    *
>> - * NOTE: In the event of a continuation, the @first_pfn is set to the
>> - *       value of the pfn of the remaining set of pages and @nr reduced
>> - *       to the size of the remaining set.
>> + * DMOP buf 1 contains an array of xen_dm_op_modified_memory_extent with
>> + * @nr_extent entries.  The value of  @nr_extent will be reduced on
>> + *  continuation.
>> + *
>> + * @pfns_processed is used for continuation, and not intended to be usefull
>> + * to the caller.  It gives the number of pfns already processed (and can
>> + * be skipped) in the last extent. This should always be set to zero.
>>    */
>>   #define XEN_DMOP_modified_memory 11
>>   
>>   struct xen_dm_op_modified_memory {
>> +    /* IN - number of extents. */
>> +    uint32_t nr_extents;
>> +    /* IN - should be set to 0, and is updated on continuation. */
>> +    uint32_t pfns_processed;
> I'd prefer if the field (if to be kept) wasn't named after its current
> purpose, nor should we state anything beyond it needing to be
> zero when first invoking the call. These are implementation details
> which callers should not rely on.

Assuming we keep it, how about calling it "reserved", with a comment in 
dm.c explaining what it's actually for?

Let me know what you think,

Cheers,

-jenny
Jan Beulich March 23, 2017, 8:35 a.m. UTC | #3
>>> On 22.03.17 at 20:55, <Jennifer.Herbert@citrix.com> wrote:
> On 22/03/17 10:33, Jan Beulich wrote:
>>>>> On 21.03.17 at 14:59, <jennifer.herbert@citrix.com> wrote:
>>> This version of this patch removes the need for the 'offset' parameter,
>>> by instead reducing nr_extents, and working backwards from the end of
>>> the array.
>>>
>>> This patch also removes the need to ever write back the passed array of
>>> extents to the guest, but instead putting its partial progress in a
>>> 'pfns_processed' parameter, which replaces the old 'offset' parameter.
>>> As with the 'offest' parameter, 'pfns_processed' should be set to 0 on
>>> calling.
>> So how is the need for 'pfns_processed' better than the prior need
>> for 'offset'? I continue to not see why you'd need any extra field, as
>> you can write back to 'first_pfn' and 'nr' just like the old code did. (But
>> note that I'm not outright objecting to this solution, I'd first like to
>> understand why it was chosen.)
>>
> 
> With v1 of the patch, on continuation, two buffers get written back to 
> guest, one
> to update 'offset', and one to update first_pfn (in the array). Although 
> I can't think of an example
> where this would be a problem, I think that semi-randomly altering items 
> in the passed
> array may not be expected, and if that data was re-used for something, 
> could cause a bug.
> 
> There is precedence for changing the op buffer,  and using 
> pfns_processed means we never have
> to change this array, which I think is much cleaner, intuitive, and 
> halving the copy backs.
> 
> I considered your suggestion of modifying the handle array, but I think 
> this would make the code much harder to follow,  and still require data 
> written back to the guest - just in a different place. I thought just 
> reducing the nr_extents a cleaner solution.

This backwards processing was a neat idea actually, since here
we definitely don't have any ordering requirement.

> I can't see the problem with having an argument for continuation - the 
> xen_dm_op will remain the same size, if we use the argument space or 
> not.  If its just about how the API looks, I find this highly subjective 
> - its no secret that continuations happen, and data gets stashed in 
> these parameters - and so i see no reason why having a dedicated 
> parameter for this is a problem.
> 
> If we want to hide this continuation information, we could define:
> 
> struct xen_dm_op_modified_memory {
>      /* IN - number of extents. */
>      uint32_t nr_extents;
> }
> 
> struct xen_dm_op_modified_memory modified_memory_expanded {
>      struct xen_dm_op_modified_memory modified_memory;
>      uint32_t  pfns_processed
> }
> 
> and include both structures in the xen_dm_op union.
> The caller would use xen_dm_op_modified_memory modified_memory, and due 
> to the memset(&op, 0, sizeof(op));, the value for pfns_processed would 
> end up as zero.  Xen could use expanded structure, and make use of 
> pfns_processed.

I don't think this would be helpful.

> Or, we could re-purpose the 'pad' member of struct xen_dm_op, call it 
> continuation_data, and have this as a general purpose continuation 
> variable, that the caller wouldn't pay any attention to.
> 
> What do you think?

No, that's not really an option here imo, as in the general case we'd
need this to be a 64-bit value. It just so happens that you can get
away with a 32-bit one because you limit the input value to 32 bits.

>>> --- a/xen/arch/x86/hvm/dm.c
>>> +++ b/xen/arch/x86/hvm/dm.c
>>> @@ -119,56 +119,89 @@ static int set_isa_irq_level(struct domain *d, uint8_t isa_irq,
>>>   }
>>>   
>>>   static int modified_memory(struct domain *d,
>>> -                           struct xen_dm_op_modified_memory *data)
>>> +                           struct xen_dm_op_modified_memory *header,
>>> +                           xen_dm_op_buf_t* buf)
>>>   {
>>> -    xen_pfn_t last_pfn = data->first_pfn + data->nr - 1;
>>> -    unsigned int iter = 0;
>>> +    /* Process maximum of 256 pfns before checking for continuation */
>>> +    const unsigned int cont_check_interval = 0xff;
>> Iirc the question was asked on v1 already: 256 (as the comment
>> says) or 255 (as the value enforces)?
> 
> aw shucks!  I had it as 255, and Paul said "255 or 256?" and I changed 
> it to 256, without thinking, assuming an off by one error, and that he 
> was right.  But, with a seconds thought, it should be 255, and Paul  was 
> actually referring to a comment later in the code, which was 256.

Yet batches of 256 would seem a little more "usual".

>>>       int rc = 0;
>>>       if ( copy_from_guest_offset(&extent, buf->h, rem_extents - 1, 1) )
>>> +            return -EFAULT;
>>> +
>>> +        pfn = extent.first_pfn + header->pfns_processed;
>>> +        batch_nr = extent.nr - header->pfns_processed;
>> Even if this looks to be harmless to the hypervisor, I dislike the
>> overflows you allow for here.
> 
> I'll add a check for (header->pfns_processed > extent.nr), returning 
> -EINVAL.

>= actually, I think.

>>> @@ -441,13 +474,8 @@ static int dm_op(domid_t domid,
>>>           struct xen_dm_op_modified_memory *data =
>>>               &op.u.modified_memory;
>>>   
>>> -        const_op = false;
>>> -
>>> -        rc = -EINVAL;
>>> -        if ( data->pad )
>>> -            break;
>> Where did this check go?
> 
> data->pad is undefined here.  But I could add a check for extent.pad in 
> the modified_memory function.

That's what I'm asking for. Talking of which - is there a way for the
caller to know on which extent an error (if any) has occurred? This
certainly needs to be possible.

>>> --- a/xen/include/public/hvm/dm_op.h
>>> +++ b/xen/include/public/hvm/dm_op.h
>>> @@ -237,13 +237,24 @@ struct xen_dm_op_set_pci_link_route {
>>>    * XEN_DMOP_modified_memory: Notify that a set of pages were modified by
>>>    *                           an emulator.
>>>    *
>>> - * NOTE: In the event of a continuation, the @first_pfn is set to the
>>> - *       value of the pfn of the remaining set of pages and @nr reduced
>>> - *       to the size of the remaining set.
>>> + * DMOP buf 1 contains an array of xen_dm_op_modified_memory_extent with
>>> + * @nr_extent entries.  The value of  @nr_extent will be reduced on
>>> + *  continuation.
>>> + *
>>> + * @pfns_processed is used for continuation, and not intended to be usefull
>>> + * to the caller.  It gives the number of pfns already processed (and can
>>> + * be skipped) in the last extent. This should always be set to zero.
>>>    */
>>>   #define XEN_DMOP_modified_memory 11
>>>   
>>>   struct xen_dm_op_modified_memory {
>>> +    /* IN - number of extents. */
>>> +    uint32_t nr_extents;
>>> +    /* IN - should be set to 0, and is updated on continuation. */
>>> +    uint32_t pfns_processed;
>> I'd prefer if the field (if to be kept) wasn't named after its current
>> purpose, nor should we state anything beyond it needing to be
>> zero when first invoking the call. These are implementation details
>> which callers should not rely on.
> 
> Assuming we keep it, how about calling it "reserved", with a comment in 
> dm.c explaining what its actually for?

Elsewhere we use "opaque", but "reserved" is probably fine too.
However, we may want to name it as an OUT value, for the
error-on-extent indication mentioned above (of course another
option would be to make nr_extent IN/OUT). As an OUT, we're
free to use it for whatever intermediate value, just so long as
upon final return to the caller it has the intended value. (It's
debatable whether it should be IN/OUT, due to the need for it
to be set to zero.)

Jan
Jennifer Herbert March 23, 2017, 2:54 p.m. UTC | #4
On 23/03/17 08:35, Jan Beulich wrote:
>>>> On 22.03.17 at 20:55, <Jennifer.Herbert@citrix.com> wrote:
>>>> --- a/xen/arch/x86/hvm/dm.c
>>>> +++ b/xen/arch/x86/hvm/dm.c
>>>> @@ -119,56 +119,89 @@ static int set_isa_irq_level(struct domain *d, uint8_t isa_irq,
>>>>    }
>>>>    
>>>>    static int modified_memory(struct domain *d,
>>>> -                           struct xen_dm_op_modified_memory *data)
>>>> +                           struct xen_dm_op_modified_memory *header,
>>>> +                           xen_dm_op_buf_t* buf)
>>>>    {
>>>> -    xen_pfn_t last_pfn = data->first_pfn + data->nr - 1;
>>>> -    unsigned int iter = 0;
>>>> +    /* Process maximum of 256 pfns before checking for continuation */
>>>> +    const unsigned int cont_check_interval = 0xff;
>>> Iirc the question was asked on v1 already: 256 (as the comment
>>> says) or 255 (as the value enforces)?
>> aw shucks!  I had it as 255, and Paul said "255 or 256?" and I changed
>> it to 256, without thinking, assuming an off by one error, and that he
>> was right.  But, with a seconds thought, it should be 255, and Paul  was
>> actually referring to a comment later in the code, which was 256.
> Yet batches of 256 would seem a little more "usual".

Ok, I can set cont_check_interval = 0x100; and leave the comment as 256.

>>>> --- a/xen/include/public/hvm/dm_op.h
>>>> +++ b/xen/include/public/hvm/dm_op.h
>>>> @@ -237,13 +237,24 @@ struct xen_dm_op_set_pci_link_route {
>>>>     * XEN_DMOP_modified_memory: Notify that a set of pages were modified by
>>>>     *                           an emulator.
>>>>     *
>>>> - * NOTE: In the event of a continuation, the @first_pfn is set to the
>>>> - *       value of the pfn of the remaining set of pages and @nr reduced
>>>> - *       to the size of the remaining set.
>>>> + * DMOP buf 1 contains an array of xen_dm_op_modified_memory_extent with
>>>> + * @nr_extent entries.  The value of  @nr_extent will be reduced on
>>>> + *  continuation.
>>>> + *
>>>> + * @pfns_processed is used for continuation, and not intended to be usefull
>>>> + * to the caller.  It gives the number of pfns already processed (and can
>>>> + * be skipped) in the last extent. This should always be set to zero.
>>>>     */
>>>>    #define XEN_DMOP_modified_memory 11
>>>>    
>>>>    struct xen_dm_op_modified_memory {
>>>> +    /* IN - number of extents. */
>>>> +    uint32_t nr_extents;
>>>> +    /* IN - should be set to 0, and is updated on continuation. */
>>>> +    uint32_t pfns_processed;
>>> I'd prefer if the field (if to be kept) wasn't named after its current
>>> purpose, nor should we state anything beyond it needing to be
>>> zero when first invoking the call. These are implementation details
>>> which callers should not rely on.
>> Assuming we keep it, how about calling it "reserved", with a comment in
>> dm.c explaining what its actually for?
> Elsewhere we use "opaque", but "reserved" is probably fine too.
> However, we may want to name it as an OUT value, for the
> error-on-extent indication mentioned above (of course another
> option would be to make nr_extent IN/OUT). As an OUT, we're
> free to use it for whatever intermediate value, just so long as
> upon final return to the caller it has the intended value. (It's
> debatable whether it should be IN/OUT, due to the need for it
> to be set to zero.)
>
> Jan

Having an indication of which extent failed seems a sensible idea.  We'd
need that parameter to be initially set to something that can represent
none of the extents, so that if there is an error before we get to
processing the extents, this is clear.

If we used nr_extents, the caller can check whether it's the same value
they passed in.  If the value is the same, then the error occurred
before processing an extent.

If we used error_on_extent (previously known as pfns_processed), we
could not use zero, as it might be the first extent to fail.  In this
case, we'd need to specify that it defaults to 0xFFFFFFFF.  We should
probably define a constant - NO_EXTENT.  Internally, we could just
invert the value when using it as a pfns_processed intermediate.

However, both of these ideas suffer the problem that if a continuation
fails before processing the remaining extents, the returned value would
be incorrect.  When reusing nr_extents, it would appear to be the
previous extent - potentially very confusing.  Using error_on_extent
would give you a totally different value, although very likely one that
is greater than the number of extents passed in.

Both of these schemes can be fixed by only allowing a 31-bit number of
extents and pfns, and having the fields signed, to make it more obvious.
For both schemes, you could return -extent_nr, where only a negative
value indicates an extent number.  This would allow you to have an
initial value of 0 for error_on_extent.

Alternatively, for the error_on_extent scheme, pfns_processed could be
stored as a negative value, which would mean that if a negative value
got returned it referred to no specific extent.  The value would need to
be initialised by the caller as -1.

An alternative to using signed values would be to use a combination of
the two values.  If nr_extents == 0, then error_on_extent holds the
extent number.  It allows full 32-bit values but is kinda odd.
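
Ignoring the library wrapper and looking at the op buffer directly, a
device model using that combined scheme might interpret a failure roughly
as follows (error_on_extent and the helper functions are purely
hypothetical names):

    if ( rc < 0 )
    {
        if ( op.u.modified_memory.nr_extents == 0 )
            /* nr_extents was zeroed: error_on_extent names the bad extent. */
            report_bad_extent(op.u.modified_memory.error_on_extent);
        else
            /* nr_extents unchanged: the call failed before any extent. */
            report_failure_before_any_extent();
    }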


I'm happy to do any of the 4 schemes - they all have pros and cons.

What do you think?

Cheers,

-jenny
Jan Beulich March 23, 2017, 3:58 p.m. UTC | #5
>>> On 23.03.17 at 15:54, <Jennifer.Herbert@citrix.com> wrote:
> On 23/03/17 08:35, Jan Beulich wrote:
>>>>> On 22.03.17 at 20:55, <Jennifer.Herbert@citrix.com> wrote:
>>>>> --- a/xen/arch/x86/hvm/dm.c
>>>>> +++ b/xen/arch/x86/hvm/dm.c
>>>>> @@ -119,56 +119,89 @@ static int set_isa_irq_level(struct domain *d, uint8_t 
> isa_irq,
>>>>>    }
>>>>>    
>>>>>    static int modified_memory(struct domain *d,
>>>>> -                           struct xen_dm_op_modified_memory *data)
>>>>> +                           struct xen_dm_op_modified_memory *header,
>>>>> +                           xen_dm_op_buf_t* buf)
>>>>>    {
>>>>> -    xen_pfn_t last_pfn = data->first_pfn + data->nr - 1;
>>>>> -    unsigned int iter = 0;
>>>>> +    /* Process maximum of 256 pfns before checking for continuation */
>>>>> +    const unsigned int cont_check_interval = 0xff;
>>>> Iirc the question was asked on v1 already: 256 (as the comment
>>>> says) or 255 (as the value enforces)?
>>> aw shucks!  I had it as 255, and Paul said "255 or 256?" and I changed
>>> it to 256, without thinking, assuming an off by one error, and that he
>>> was right.  But, with a seconds thought, it should be 255, and Paul  was
>>> actually referring to a comment later in the code, which was 256.
>> Yet batches of 256 would seem a little more "usual".
> 
> Ok, I can set cont_check_interval = 0x100; and leave the comment as 256.
> 
>>>>> --- a/xen/include/public/hvm/dm_op.h
>>>>> +++ b/xen/include/public/hvm/dm_op.h
>>>>> @@ -237,13 +237,24 @@ struct xen_dm_op_set_pci_link_route {
>>>>>     * XEN_DMOP_modified_memory: Notify that a set of pages were modified by
>>>>>     *                           an emulator.
>>>>>     *
>>>>> - * NOTE: In the event of a continuation, the @first_pfn is set to the
>>>>> - *       value of the pfn of the remaining set of pages and @nr reduced
>>>>> - *       to the size of the remaining set.
>>>>> + * DMOP buf 1 contains an array of xen_dm_op_modified_memory_extent with
>>>>> + * @nr_extent entries.  The value of  @nr_extent will be reduced on
>>>>> + *  continuation.
>>>>> + *
>>>>> + * @pfns_processed is used for continuation, and not intended to be usefull
>>>>> + * to the caller.  It gives the number of pfns already processed (and can
>>>>> + * be skipped) in the last extent. This should always be set to zero.
>>>>>     */
>>>>>    #define XEN_DMOP_modified_memory 11
>>>>>    
>>>>>    struct xen_dm_op_modified_memory {
>>>>> +    /* IN - number of extents. */
>>>>> +    uint32_t nr_extents;
>>>>> +    /* IN - should be set to 0, and is updated on continuation. */
>>>>> +    uint32_t pfns_processed;
>>>> I'd prefer if the field (if to be kept) wasn't named after its current
>>>> purpose, nor should we state anything beyond it needing to be
>>>> zero when first invoking the call. These are implementation details
>>>> which callers should not rely on.
>>> Assuming we keep it, how about calling it "reserved", with a comment in
>>> dm.c explaining what its actually for?
>> Elsewhere we use "opaque", but "reserved" is probably fine too.
>> However, we may want to name it as an OUT value, for the
>> error-on-extent indication mentioned above (of course another
>> option would be to make nr_extent IN/OUT). As an OUT, we're
>> free to use it for whatever intermediate value, just so long as
>> upon final return to the caller it has the intended value. (It's
>> debatable whether it should be IN/OUT, due to the need for it
>> to be set to zero.)
> 
> Having an indication of which extent failed seem a sensible idea. We'd 
> need that
> parameter to be initially set to something that can represent none of 
> the extents,
> such that if there is an error before we get to precessing the extents, 
> this is clear.

I don't think this is a requirement - failure outside of any extent
processing can reasonably be attached to the first extent. The
more that the actual processing of the extent (after reading the
coordinates from guest memory) can't actually fail. With that
observation I'm in fact no longer convinced we really need such
an indication - I simply didn't pay attention to this aspect when
first suggesting it. The more that your backwards processing
would invalidate a common conclusion a caller might draw: Error
on extent N means all extents lower than N were processed
successfully.

So if you wanted to stick to providing this information I'd say
use the opaque (or whatever it's going to be named) field to
provide that status. Switch back to processing extents forwards,
having the opaque field starting out as zero point to the first
extent as the failed one initially. Initial processing during
continuation handling can't fail unless the caller has fiddled with
the hypercall arguments, so I wouldn't see anything wrong with
not providing a reliable error indicator in that case.

Jan
Jennifer Herbert March 23, 2017, 6:44 p.m. UTC | #6
On 23/03/17 15:58, Jan Beulich wrote:
>>>>>> On 22.03.17 at 20:55, <Jennifer.Herbert@citrix.com> wrote:
>>>>>> --- a/xen/arch/x86/hvm/dm.c
>>>>>> +++ b/xen/arch/x86/hvm/dm.c
>>>>>> @@ -119,56 +119,89 @@ static int set_isa_irq_level(struct domain *d, uint8_t
>>
>>>>>> --- a/xen/include/public/hvm/dm_op.h
>>>>>> +++ b/xen/include/public/hvm/dm_op.h
>>>>>> @@ -237,13 +237,24 @@ struct xen_dm_op_set_pci_link_route {
>>>>>>      * XEN_DMOP_modified_memory: Notify that a set of pages were modified by
>>>>>>      *                           an emulator.
>>>>>>      *
>>>>>> - * NOTE: In the event of a continuation, the @first_pfn is set to the
>>>>>> - *       value of the pfn of the remaining set of pages and @nr reduced
>>>>>> - *       to the size of the remaining set.
>>>>>> + * DMOP buf 1 contains an array of xen_dm_op_modified_memory_extent with
>>>>>> + * @nr_extent entries.  The value of  @nr_extent will be reduced on
>>>>>> + *  continuation.
>>>>>> + *
>>>>>> + * @pfns_processed is used for continuation, and not intended to be usefull
>>>>>> + * to the caller.  It gives the number of pfns already processed (and can
>>>>>> + * be skipped) in the last extent. This should always be set to zero.
>>>>>>      */
>>>>>>     #define XEN_DMOP_modified_memory 11
>>>>>>     
>>>>>>     struct xen_dm_op_modified_memory {
>>>>>> +    /* IN - number of extents. */
>>>>>> +    uint32_t nr_extents;
>>>>>> +    /* IN - should be set to 0, and is updated on continuation. */
>>>>>> +    uint32_t pfns_processed;
>>>>> I'd prefer if the field (if to be kept) wasn't named after its current
>>>>> purpose, nor should we state anything beyond it needing to be
>>>>> zero when first invoking the call. These are implementation details
>>>>> which callers should not rely on.
>>>> Assuming we keep it, how about calling it "reserved", with a comment in
>>>> dm.c explaining what its actually for?
>>> Elsewhere we use "opaque", but "reserved" is probably fine too.
>>> However, we may want to name it as an OUT value, for the
>>> error-on-extent indication mentioned above (of course another
>>> option would be to make nr_extent IN/OUT). As an OUT, we're
>>> free to use it for whatever intermediate value, just so long as
>>> upon final return to the caller it has the intended value. (It's
>>> debatable whether it should be IN/OUT, due to the need for it
>>> to be set to zero.)
>> Having an indication of which extent failed seem a sensible idea. We'd
>> need that
>> parameter to be initially set to something that can represent none of
>> the extents,
>> such that if there is an error before we get to precessing the extents,
>> this is clear.
> I don't think this is a requirement - failure outside of any extent
> processing can reasonably be attached to the first extent. The
> more that the actual processing of the extent (after reading the
> coordinates from guest memory) can't actually fail. With that
> observation I'm in fact no longer convinced we really need such
> an indication - I simply didn't pay attention to this aspect when
> first suggesting it. The more that your backwards processing
> would invalidate a common conclusion a caller might draw: Error
> on extent N means all extents lower than N were processed
> successfully.
>
> So if you wanted to stick to providing this information I'd say
> use the opaque (or whatever it's going to be named) field to
> provide that status. Switch back to processing extents forwards,
> having the opaque field starting out as zero point to the first
> extent as the failed one initially. Initial processing during
> continuation handling can't fail unless the caller has fiddled with
> the hypercall arguments, so I wouldn't see anything wrong with
> not providing a reliable error indicator in that case.
>

I was mostly considering it from a debugging perspective.  Any failure
would be due to bad arguments, which would indicate a serious bug, and
trying to continue after such a bug was discovered would seem most
unwise.
If I copied back the buffer in the event of an error, then we could
state that nr_extents points to one after the extent that had the error,
but say it is not defined which, if any, extents would have succeeded.
This would be a trivial change, but would aid any debugging.

I can continue to count extents backwards, saving one parameter.
I can then have an opaque field, about which I say nothing other than
that it should be zero.  This I'd use for pfns_processed.
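
Sketched against the public header, that would leave the struct looking
roughly like this (field naming still to be settled):

struct xen_dm_op_modified_memory {
    /* IN - number of extents. */
    uint32_t nr_extents;
    /* Opaque to the caller; must be set to zero. */
    uint32_t opaque;
};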

How does that sound?


-jenny
Jan Beulich March 24, 2017, 7:21 a.m. UTC | #7
>>> On 23.03.17 at 19:44, <Jennifer.Herbert@citrix.com> wrote:

> 
> On 23/03/17 15:58, Jan Beulich wrote:
>>>>>>> On 22.03.17 at 20:55, <Jennifer.Herbert@citrix.com> wrote:
>>>>>>> --- a/xen/arch/x86/hvm/dm.c
>>>>>>> +++ b/xen/arch/x86/hvm/dm.c
>>>>>>> @@ -119,56 +119,89 @@ static int set_isa_irq_level(struct domain *d, uint8_t
>>>
>>>>>>> --- a/xen/include/public/hvm/dm_op.h
>>>>>>> +++ b/xen/include/public/hvm/dm_op.h
>>>>>>> @@ -237,13 +237,24 @@ struct xen_dm_op_set_pci_link_route {
>>>>>>>      * XEN_DMOP_modified_memory: Notify that a set of pages were modified by
>>>>>>>      *                           an emulator.
>>>>>>>      *
>>>>>>> - * NOTE: In the event of a continuation, the @first_pfn is set to the
>>>>>>> - *       value of the pfn of the remaining set of pages and @nr reduced
>>>>>>> - *       to the size of the remaining set.
>>>>>>> + * DMOP buf 1 contains an array of xen_dm_op_modified_memory_extent with
>>>>>>> + * @nr_extent entries.  The value of  @nr_extent will be reduced on
>>>>>>> + *  continuation.
>>>>>>> + *
>>>>>>> + * @pfns_processed is used for continuation, and not intended to be usefull
>>>>>>> + * to the caller.  It gives the number of pfns already processed (and can
>>>>>>> + * be skipped) in the last extent. This should always be set to zero.
>>>>>>>      */
>>>>>>>     #define XEN_DMOP_modified_memory 11
>>>>>>>     
>>>>>>>     struct xen_dm_op_modified_memory {
>>>>>>> +    /* IN - number of extents. */
>>>>>>> +    uint32_t nr_extents;
>>>>>>> +    /* IN - should be set to 0, and is updated on continuation. */
>>>>>>> +    uint32_t pfns_processed;
>>>>>> I'd prefer if the field (if to be kept) wasn't named after its current
>>>>>> purpose, nor should we state anything beyond it needing to be
>>>>>> zero when first invoking the call. These are implementation details
>>>>>> which callers should not rely on.
>>>>> Assuming we keep it, how about calling it "reserved", with a comment in
>>>>> dm.c explaining what its actually for?
>>>> Elsewhere we use "opaque", but "reserved" is probably fine too.
>>>> However, we may want to name it as an OUT value, for the
>>>> error-on-extent indication mentioned above (of course another
>>>> option would be to make nr_extent IN/OUT). As an OUT, we're
>>>> free to use it for whatever intermediate value, just so long as
>>>> upon final return to the caller it has the intended value. (It's
>>>> debatable whether it should be IN/OUT, due to the need for it
>>>> to be set to zero.)
>>> Having an indication of which extent failed seem a sensible idea. We'd
>>> need that
>>> parameter to be initially set to something that can represent none of
>>> the extents,
>>> such that if there is an error before we get to precessing the extents,
>>> this is clear.
>> I don't think this is a requirement - failure outside of any extent
>> processing can reasonably be attached to the first extent. The
>> more that the actual processing of the extent (after reading the
>> coordinates from guest memory) can't actually fail. With that
>> observation I'm in fact no longer convinced we really need such
>> an indication - I simply didn't pay attention to this aspect when
>> first suggesting it. The more that your backwards processing
>> would invalidate a common conclusion a caller might draw: Error
>> on extent N means all extents lower than N were processed
>> successfully.
>>
>> So if you wanted to stick to providing this information I'd say
>> use the opaque (or whatever it's going to be named) field to
>> provide that status. Switch back to processing extents forwards,
>> having the opaque field starting out as zero point to the first
>> extent as the failed one initially. Initial processing during
>> continuation handling can't fail unless the caller has fiddled with
>> the hypercall arguments, so I wouldn't see anything wrong with
>> not providing a reliable error indicator in that case.
>>
> 
> I was mostly considering it, from a debugging perspective.  Any failure 
> would be due to bad
> arguments, which would indicate a serious bug, and trying to continue 
> after such a bug
> was discovered would seem most unwise.
> If I copied back the buffer on event of error, then we could state that 
> nr_extent would point
> to one after extent that had the error, but say it is not defined which 
> if any extents would have
> succeeded.  This would be a trivial change, but aid with any debugging.
> 
> I can continue to count extents backwards, saving one parameter.
> I can then have an opaque, which i say nothing about, other then it 
> should be zero.
> This i'd use for for pfns_processed.
> 
> How does that sound?

Reasonable. Let's see how that ends up looking.

Jan

Patch

diff --git a/tools/libs/devicemodel/core.c b/tools/libs/devicemodel/core.c
index a85cb49..f9e37a5 100644
--- a/tools/libs/devicemodel/core.c
+++ b/tools/libs/devicemodel/core.c
@@ -434,22 +434,36 @@  int xendevicemodel_track_dirty_vram(
                              dirty_bitmap, (size_t)(nr + 7) / 8);
 }
 
-int xendevicemodel_modified_memory(
-    xendevicemodel_handle *dmod, domid_t domid, uint64_t first_pfn,
-    uint32_t nr)
+int xendevicemodel_modified_memory_bulk(
+    xendevicemodel_handle *dmod, domid_t domid,
+    struct xen_dm_op_modified_memory_extent *extents, uint32_t nr)
 {
     struct xen_dm_op op;
-    struct xen_dm_op_modified_memory *data;
+    struct xen_dm_op_modified_memory *header;
+    size_t extents_size = nr * sizeof(struct xen_dm_op_modified_memory_extent);
 
     memset(&op, 0, sizeof(op));
 
     op.op = XEN_DMOP_modified_memory;
-    data = &op.u.modified_memory;
+    header = &op.u.modified_memory;
 
-    data->first_pfn = first_pfn;
-    data->nr = nr;
+    header->nr_extents = nr;
+    header->pfns_processed = 0;
 
-    return xendevicemodel_op(dmod, domid, 1, &op, sizeof(op));
+    return xendevicemodel_op(dmod, domid, 2, &op, sizeof(op),
+                             extents, extents_size);
+}
+
+int xendevicemodel_modified_memory(
+    xendevicemodel_handle *dmod, domid_t domid, uint64_t first_pfn,
+    uint32_t nr)
+{
+    struct xen_dm_op_modified_memory_extent extent;
+
+    extent.first_pfn = first_pfn;
+    extent.nr = nr;
+
+    return xendevicemodel_modified_memory_bulk(dmod, domid, &extent, 1);
 }
 
 int xendevicemodel_set_mem_type(
diff --git a/tools/libs/devicemodel/include/xendevicemodel.h b/tools/libs/devicemodel/include/xendevicemodel.h
index b3f600e..9c62bf9 100644
--- a/tools/libs/devicemodel/include/xendevicemodel.h
+++ b/tools/libs/devicemodel/include/xendevicemodel.h
@@ -236,8 +236,8 @@  int xendevicemodel_track_dirty_vram(
     uint32_t nr, unsigned long *dirty_bitmap);
 
 /**
- * This function notifies the hypervisor that a set of domain pages
- * have been modified.
+ * This function notifies the hypervisor that a set of contiguous
+ * domain pages have been modified.
  *
  * @parm dmod a handle to an open devicemodel interface.
  * @parm domid the domain id to be serviced
@@ -250,6 +250,21 @@  int xendevicemodel_modified_memory(
     uint32_t nr);
 
 /**
+ * This function notifies the hypervisor that a set of discontiguous
+ * domain pages have been modified.
+ *
+ * @parm dmod a handle to an open devicemodel interface.
+ * @parm domid the domain id to be serviced
+ * @parm extents an array of extent structs, which each hold
+                 a start_pfn and nr (number of pfns).
+ * @parm nr the number of extents in the array
+ * @return 0 on success, -1 on failure.
+ */
+int xendevicemodel_modified_memory_bulk(
+    xendevicemodel_handle *dmod, domid_t domid,
+    struct xen_dm_op_modified_memory_extent extents[], uint32_t nr);
+
+/**
  * This function notifies the hypervisor that a set of domain pages
  * are to be treated in a specific way. (See the definition of
  * hvmmem_type_t).
diff --git a/xen/arch/x86/hvm/dm.c b/xen/arch/x86/hvm/dm.c
index 2122c45..c90af64 100644
--- a/xen/arch/x86/hvm/dm.c
+++ b/xen/arch/x86/hvm/dm.c
@@ -119,56 +119,89 @@  static int set_isa_irq_level(struct domain *d, uint8_t isa_irq,
 }
 
 static int modified_memory(struct domain *d,
-                           struct xen_dm_op_modified_memory *data)
+                           struct xen_dm_op_modified_memory *header,
+                           xen_dm_op_buf_t* buf)
 {
-    xen_pfn_t last_pfn = data->first_pfn + data->nr - 1;
-    unsigned int iter = 0;
+    /* Process maximum of 256 pfns before checking for continuation */
+    const unsigned int cont_check_interval = 0xff;
     int rc = 0;
-
-    if ( (data->first_pfn > last_pfn) ||
-         (last_pfn > domain_get_maximum_gpfn(d)) )
-        return -EINVAL;
+    unsigned int rem_extents =  header->nr_extents;
+    unsigned int batch_rem_pfns = cont_check_interval;
 
     if ( !paging_mode_log_dirty(d) )
         return 0;
 
-    while ( iter < data->nr )
+    if ( (buf->size / sizeof(struct xen_dm_op_modified_memory_extent)) <
+         header->nr_extents )
+        return -EINVAL;
+
+    while ( rem_extents > 0)
     {
-        unsigned long pfn = data->first_pfn + iter;
-        struct page_info *page;
+        struct xen_dm_op_modified_memory_extent extent;
+        unsigned int batch_nr;
+        xen_pfn_t pfn;
+        xen_pfn_t end_pfn;
 
-        page = get_page_from_gfn(d, pfn, NULL, P2M_UNSHARE);
-        if ( page )
+        if ( copy_from_guest_offset(&extent, buf->h, rem_extents - 1, 1) )
+            return -EFAULT;
+
+        pfn = extent.first_pfn + header->pfns_processed;
+        batch_nr = extent.nr - header->pfns_processed;
+
+        if ( batch_nr > batch_rem_pfns )
+        {
+           batch_nr = batch_rem_pfns;
+           header->pfns_processed += batch_nr;
+        }
+        else
         {
-            mfn_t gmfn = _mfn(page_to_mfn(page));
-
-            paging_mark_dirty(d, gmfn);
-            /*
-             * These are most probably not page tables any more
-             * don't take a long time and don't die either.
-             */
-            sh_remove_shadows(d, gmfn, 1, 0);
-            put_page(page);
+            rem_extents--;
+            header->pfns_processed = 0;
         }
 
-        iter++;
+        batch_rem_pfns -= batch_nr;
+        end_pfn = pfn + batch_nr;
+
+        if ( (pfn >= end_pfn) ||
+             (end_pfn > domain_get_maximum_gpfn(d)) )
+            return -EINVAL;
+
+        while ( pfn < end_pfn )
+        {
+            struct page_info *page;
+            page = get_page_from_gfn(d, pfn, NULL, P2M_UNSHARE);
+
+            if ( page )
+            {
+                mfn_t gmfn = _mfn(page_to_mfn(page));
+
+                paging_mark_dirty(d, gmfn);
+                /*
+                 * These are most probably not page tables any more
+                 * don't take a long time and don't die either.
+                 */
+                sh_remove_shadows(d, gmfn, 1, 0);
+                put_page(page);
+            }
+            pfn++;
+        }
 
         /*
-         * Check for continuation every 256th iteration and if the
-         * iteration is not the last.
+         * Check for continuation every 256th pfn and if the
+         * pfn is not the last.
          */
-        if ( (iter < data->nr) && ((iter & 0xff) == 0) &&
-             hypercall_preempt_check() )
+        if ( (batch_rem_pfns == 0) && (rem_extents > 0) )
         {
-            data->first_pfn += iter;
-            data->nr -= iter;
-
-            rc = -ERESTART;
-            break;
+            if ( hypercall_preempt_check() )
+            {
+                header->nr_extents = rem_extents;
+                rc = -ERESTART;
+                break;
+            }
+            batch_rem_pfns = cont_check_interval;
         }
-    }
-
-    return rc;
+   }
+   return rc;
 }
 
 static bool allow_p2m_type_change(p2m_type_t old, p2m_type_t new)
@@ -441,13 +474,8 @@  static int dm_op(domid_t domid,
         struct xen_dm_op_modified_memory *data =
             &op.u.modified_memory;
 
-        const_op = false;
-
-        rc = -EINVAL;
-        if ( data->pad )
-            break;
-
-        rc = modified_memory(d, data);
+        rc = modified_memory(d, data, &bufs[1]);
+        const_op = (rc != -ERESTART);
         break;
     }
 
diff --git a/xen/include/public/hvm/dm_op.h b/xen/include/public/hvm/dm_op.h
index f54cece..57adbf5 100644
--- a/xen/include/public/hvm/dm_op.h
+++ b/xen/include/public/hvm/dm_op.h
@@ -237,13 +237,24 @@  struct xen_dm_op_set_pci_link_route {
  * XEN_DMOP_modified_memory: Notify that a set of pages were modified by
  *                           an emulator.
  *
- * NOTE: In the event of a continuation, the @first_pfn is set to the
- *       value of the pfn of the remaining set of pages and @nr reduced
- *       to the size of the remaining set.
+ * DMOP buf 1 contains an array of xen_dm_op_modified_memory_extent with
+ * @nr_extent entries.  The value of  @nr_extent will be reduced on
+ *  continuation.
+ *
+ * @pfns_processed is used for continuation, and not intended to be usefull
+ * to the caller.  It gives the number of pfns already processed (and can
+ * be skipped) in the last extent. This should always be set to zero.
  */
 #define XEN_DMOP_modified_memory 11
 
 struct xen_dm_op_modified_memory {
+    /* IN - number of extents. */
+    uint32_t nr_extents;
+    /* IN - should be set to 0, and is updated on continuation. */
+    uint32_t pfns_processed;
+};
+
+struct xen_dm_op_modified_memory_extent {
     /* IN - number of contiguous pages modified */
     uint32_t nr;
     uint32_t pad;