
[v2,07/10] vm_event: Add vm_event_ng interface

Message ID 79a1e2aebc55c20f58cb8c925320de202b17d8f2.1563293545.git.ppircalabu@bitdefender.com (mailing list archive)
State New, archived
Series: Per vcpu vm_event channels

Commit Message

Petre Ovidiu PIRCALABU July 16, 2019, 5:06 p.m. UTC
In high-throughput introspection scenarios where lots of monitor
vm_events are generated, the ring buffer can fill up before the monitor
application gets a chance to handle all the requests, thus blocking
other vcpus, which have to wait for a slot to become available.

This patch adds support for a different mechanism to handle synchronous
vm_event requests / responses. As each synchronous request pauses the
vcpu until the corresponding response is handled, it can be stored in
a slotted memory buffer (one per vcpu) shared between the hypervisor and
the controlling domain.

Signed-off-by: Petre Pircalabu <ppircalabu@bitdefender.com>
---
 tools/libxc/include/xenctrl.h |   9 +
 tools/libxc/xc_mem_paging.c   |   9 +-
 tools/libxc/xc_memshr.c       |   9 +-
 tools/libxc/xc_monitor.c      |  23 +-
 tools/libxc/xc_private.h      |  12 +-
 tools/libxc/xc_vm_event.c     | 100 ++++++-
 xen/arch/x86/mm.c             |   7 +
 xen/common/vm_event.c         | 595 +++++++++++++++++++++++++++++++++++-------
 xen/include/public/domctl.h   |  10 +-
 xen/include/public/memory.h   |   2 +
 xen/include/public/vm_event.h |  16 ++
 xen/include/xen/vm_event.h    |  11 +-
 12 files changed, 684 insertions(+), 119 deletions(-)
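
For illustration, a minimal sketch of the per-vcpu slot the commit
message describes; the field and constant names here are hypothetical,
not taken from the patch:

    /* One slot per vcpu, shared between Xen and the monitor domain. */
    struct vm_event_slot
    {
        uint32_t port;   /* event channel used to notify the monitor */
        uint32_t state;  /* hypothetical: IDLE / SUBMIT / FINISH */
        union
        {
            vm_event_request_t  req;  /* written by Xen */
            vm_event_response_t rsp;  /* written by the monitor */
        } u;
    };

Because each vcpu is paused while its request is outstanding, a single
request/response union per vcpu suffices; no producer/consumer ring
indexes are needed.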

Comments

Tamas K Lengyel July 16, 2019, 9:13 p.m. UTC | #1
On Tue, Jul 16, 2019 at 11:06 AM Petre Pircalabu
<ppircalabu@bitdefender.com> wrote:
>
> In high throughput introspection scenarios where lots of monitor
> vm_events are generated, the ring buffer can fill up before the monitor
> application gets a chance to handle all the requests thus blocking
> other vcpus which will have to wait for a slot to become available.
>
> This patch adds support for a different mechanism to handle synchronous
> vm_event requests / responses. As each synchronous request pauses the
> vcpu until the corresponding response is handled, it can be stored in
> a slotted memory buffer (one per vcpu) shared between the hypervisor and
> the controlling domain.
>
> Signed-off-by: Petre Pircalabu <ppircalabu@bitdefender.com>

So this is quite a large patch, perhaps it would be better to split it
into a hypervisor-side patch and then a toolstack-side one. Also,
would it make sense to keep the two implementations in separate files
within Xen (ie. common/vm_event.c and vm_event_ng.c)?

Tamas
Jan Beulich July 17, 2019, 10:06 a.m. UTC | #2
On 16.07.2019 19:06, Petre Pircalabu wrote:
> +static void vm_event_channels_free_buffer(struct vm_event_channels_domain *impl)
>   {
> -    vm_event_ring_resume(to_ring(v->domain->vm_event_monitor));
> +    int i;
> +
> +    vunmap(impl->slots);
> +    impl->slots = NULL;
> +
> +    for ( i = 0; i < impl->nr_frames; i++ )
> +        free_domheap_page(mfn_to_page(impl->mfn[i]));
>   }

What guarantees that there are no mappings left of the pages you free
here? Sharing pages with guests is (so far) an (almost) irreversible
action, i.e. they may generally only be freed upon domain destruction.
See gnttab_unpopulate_status_frames() for a case where we actually
make an attempt at freeing such pages (but where we fail the request
in case references are left in place).

Furthermore, is there any guarantee that the pages you free here
were actually allocated? ->nr_frames gets set ahead of the actual
allocation.

> +int vm_event_ng_get_frames(struct domain *d, unsigned int id,
> +                           unsigned long frame, unsigned int nr_frames,
> +                           xen_pfn_t mfn_list[])
> +{
> +    struct vm_event_domain *ved;
> +    int i;
> +
> +    switch (id )
> +    {
> +    case XEN_VM_EVENT_TYPE_MONITOR:
> +        ved = d->vm_event_monitor;
> +        break;
> +
> +    default:
> +        return -ENOSYS;

Various other error codes might be fine here, but ENOSYS is not
(despite pre-existing misuse elsewhere in the tree).

> +    }
> +
> +    if ( !vm_event_check(ved) )
> +        return -EINVAL;
> +
> +    if ( frame != 0 || nr_frames != to_channels(ved)->nr_frames )
> +        return -EINVAL;

Is there a particular reason for this all-or-nothing model?

> +    spin_lock(&ved->lock);
> +
> +    for ( i = 0; i < to_channels(ved)->nr_frames; i++ )
> +        mfn_list[i] = mfn_x(to_channels(ved)->mfn[i]);
> +
> +    spin_unlock(&ved->lock);

What is the locking good for here? You obviously can't be afraid of
the values becoming stale, as they surely would be by the time the
caller gets to see them (if they can go stale in the first place).

> --- a/xen/include/public/domctl.h
> +++ b/xen/include/public/domctl.h
> @@ -38,7 +38,7 @@
>   #include "hvm/save.h"
>   #include "memory.h"
>   
> -#define XEN_DOMCTL_INTERFACE_VERSION 0x00000011
> +#define XEN_DOMCTL_INTERFACE_VERSION 0x00000012

This looks to be needed only because of ...

> @@ -781,12 +781,20 @@ struct xen_domctl_gdbsx_domstatus {
>   struct xen_domctl_vm_event_op {
>       uint32_t       op;           /* XEN_VM_EVENT_* */
>       uint32_t       type;         /* XEN_VM_EVENT_TYPE_* */
> + /* Use the NG interface */
> +#define _XEN_VM_EVENT_FLAGS_NG_OP         0
> +#define XEN_VM_EVENT_FLAGS_NG_OP          (1U << _XEN_VM_EVENT_FLAGS_NG_OP)
> +    uint32_t       flags;

... this addition. Is the new field really warranted here? Can't
you e.g. simply define a new set of ops (e.g. by setting their
high bits)?

I've omitted all style comments (formatting, plain vs unsigned int
etc) - I'd like to leave that to the VM event maintainers.

Jan
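
A hedged sketch of the alternative Jan suggests, encoding the NG
variants as separate op values (high bit set) instead of adding a
flags field; the names are illustrative only:

    /* Hypothetical: NG variants live in the same op number space as
     * the existing ops, distinguished by the top bit. */
    #define XEN_VM_EVENT_OP_NG_BIT  (1U << 31)
    #define XEN_VM_EVENT_NG_ENABLE  (XEN_VM_EVENT_ENABLE  | XEN_VM_EVENT_OP_NG_BIT)
    #define XEN_VM_EVENT_NG_DISABLE (XEN_VM_EVENT_DISABLE | XEN_VM_EVENT_OP_NG_BIT)
    #define XEN_VM_EVENT_NG_RESUME  (XEN_VM_EVENT_RESUME  | XEN_VM_EVENT_OP_NG_BIT)

Since the struct layout is untouched, no domctl interface version bump
would be needed.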
Petre Ovidiu PIRCALABU July 17, 2019, 12:13 p.m. UTC | #3
On Tue, 2019-07-16 at 15:13 -0600, Tamas K Lengyel wrote:
> On Tue, Jul 16, 2019 at 11:06 AM Petre Pircalabu
> <ppircalabu@bitdefender.com> wrote:
> > 
> > In high throughput introspection scenarios where lots of monitor
> > vm_events are generated, the ring buffer can fill up before the
> > monitor
> > application gets a chance to handle all the requests thus blocking
> > other vcpus which will have to wait for a slot to become available.
> > 
> > This patch adds support for a different mechanism to handle
> > synchronous
> > vm_event requests / responses. As each synchronous request pauses
> > the
> > vcpu until the corresponding response is handled, it can be stored
> > in
> > a slotted memory buffer (one per vcpu) shared between the
> > hypervisor and
> > the controlling domain.
> > 
> > Signed-off-by: Petre Pircalabu <ppircalabu@bitdefender.com>
> 
> So this is quite a large patch, perhaps it would be better to split
> it
> into a hypervisor-side patch and then a toolstack-side one. Also,
> would it make sense to keep the two implementations in separate files
> within Xen (ie. common/vm_event.c and vm_event_ng.c)?
> 
> Tamas
I thought about having 2 separate patches, one for the hypervisor and
one for libxc, but my main concern was not to break "git bisect" (I'm
not sure if there are any tests (other than the build) which need to
pass). The "flags" field was added to vm_event_op, and, if it's not
manually set by the caller, an invalid value might be passed to XEN
(e.g. via xc_vm_event_control).

Initially the new interface was contained in a separate file (it had its
own domctl), but I've copied it to vm_event because I wanted to have all
non-interface functions static (e.g. the "enable" functions for both
transports and vm_event_handle_response). However, I will look into
this again and, if I can find a clean way to minimize the
dependencies, I will split it again.

Many thanks,
Petre
Tamas K Lengyel July 17, 2019, 12:38 p.m. UTC | #4
> I've omitted all style comments (formatting, plain vs unsigned int
> etc) - I'd like to leave that to the VM event maintainers.

Do we have an automated way to check for style issues, like astyle? In
my projects I integrate that into my Travis checks so I don't have to
do it manually (see
https://github.com/tklengyel/drakvuf/blob/master/.travis.yml#L28).
Checking for that kind of stuff manually is far from ideal.

Tamas
Jan Beulich July 17, 2019, 1:12 p.m. UTC | #5
On 17.07.2019 14:38, Tamas K Lengyel wrote:
>> I've omitted all style comments (formatting, plain vs unsigned int
>> etc) - I'd like to leave that to the VM event maintainers.
> 
> Do we have an automated way to check for style issues, like astyle?

We don't; there is some work underway towards that, afaia. But to
be honest, people should be looking at their code/patches before
submitting. Then there would typically be only very few issues left
for reviewers to point out. But there were quite a few here.

Jan
Alexandru Stefan ISAILA July 17, 2019, 1:42 p.m. UTC | #6
> +
> +out:
> +    rc2 = xc_domain_unpause(xch, domain_id);
> +    if ( rc1 || rc2 )
> +    {
> +        if ( rc2 )
> +            PERROR("Unable to pause domain\n");
> +
> +        if ( rc1 == 0 )
> +            rc1 = rc2;

You can use !rc1 here.

> +    }
> +
> +    return rc1;
> +}
> +
> +int xc_vm_event_ng_disable(xc_interface *xch, uint32_t domain_id, int type,
> +                           xenforeignmemory_resource_handle **fres)
> +{
> +    xenforeignmemory_unmap_resource(xch->fmem, *fres);
> +    *fres = NULL;
> +
> +    return xc_vm_event_control(xch, domain_id, XEN_VM_EVENT_DISABLE,
> +                              type, XEN_VM_EVENT_FLAGS_NG_OP, NULL);
> +}
> +



>   
> +static int vm_event_ring_pfn_param(uint32_t type)
> +{
> +    switch( type )
> +    {
> +#ifdef CONFIG_HAS_MEM_PAGING
> +    case XEN_VM_EVENT_TYPE_PAGING:
> +        return HVM_PARAM_PAGING_RING_PFN;
> +#endif
> +    case XEN_VM_EVENT_TYPE_MONITOR:
> +        return HVM_PARAM_MONITOR_RING_PFN;
> +#ifdef CONFIG_HAS_MEM_SHARING
> +    case XEN_VM_EVENT_TYPE_SHARING:
> +        return HVM_PARAM_SHARING_RING_PFN;
> +#endif
> +    };
> +
> +    ASSERT_UNREACHABLE();
> +    return -1;

Blank line before final return...

> +}
> +
> +static int vm_event_pause_flag(uint32_t type)
> +{
> +    switch( type )
> +    {
> +#ifdef CONFIG_HAS_MEM_PAGING
> +    case XEN_VM_EVENT_TYPE_PAGING:
> +        return _VPF_mem_paging;
> +#endif
> +    case XEN_VM_EVENT_TYPE_MONITOR:
> +        return _VPF_mem_access;
> +#ifdef CONFIG_HAS_MEM_SHARING
> +    case XEN_VM_EVENT_TYPE_SHARING:
> +        return _VPF_mem_sharing;
> +#endif
> +    };
> +
> +    ASSERT_UNREACHABLE();
> +    return -1;

here

> +}
> +
> +#ifdef CONFIG_HAS_MEM_PAGING
> +static void mem_paging_notification(struct vcpu *v, unsigned int port);
> +#endif
> +static void monitor_notification(struct vcpu *v, unsigned int port);
> +#ifdef CONFIG_HAS_MEM_SHARING
> +static void mem_sharing_notification(struct vcpu *v, unsigned int port);
> +#endif
> +
> +static xen_event_channel_notification_t vm_event_notification_fn(uint32_t type)
> +{
> +    switch( type )
> +    {
> +#ifdef CONFIG_HAS_MEM_PAGING
> +    case XEN_VM_EVENT_TYPE_PAGING:
> +        return mem_paging_notification;
> +#endif
> +    case XEN_VM_EVENT_TYPE_MONITOR:
> +        return monitor_notification;
> +#ifdef CONFIG_HAS_MEM_SHARING
> +    case XEN_VM_EVENT_TYPE_SHARING:
> +        return mem_sharing_notification;
> +#endif
> +    };
> +
> +    ASSERT_UNREACHABLE();
> +    return NULL;

and here

> +}
> +
> +/*
> + * VM event ring implementation;
> + */

Alex
Petre Ovidiu PIRCALABU July 17, 2019, 2:41 p.m. UTC | #7
On Wed, 2019-07-17 at 10:06 +0000, Jan Beulich wrote:
> On 16.07.2019 19:06, Petre Pircalabu wrote:
> > +static void vm_event_channels_free_buffer(struct
> > vm_event_channels_domain *impl)
> >   {
> > -    vm_event_ring_resume(to_ring(v->domain->vm_event_monitor));
> > +    int i;
> > +
> > +    vunmap(impl->slots);
> > +    impl->slots = NULL;
> > +
> > +    for ( i = 0; i < impl->nr_frames; i++ )
> > +        free_domheap_page(mfn_to_page(impl->mfn[i]));
> >   }
> 
> What guarantees that there are no mappings left of the pages you free
> here? Sharing pages with guests is (so far) an (almost) irreversible
> action, i.e. they may generally only be freed upon domain
> destruction.
> See gnttab_unpopulate_status_frames() for a case where we actually
> make an attempt at freeing such pages (but where we fail the request
> in case references are left in place).
> 
I've manually tested 2 cases and they both work (no crashes):
1: introspected domain starts -> monitor starts -> monitor stops ->
domain stops
2: introspected domain starts -> monitor starts -> domain stops ->
monitor stops.
However, I will take a closer look at gnttab_unpopulate_status_frames
and post a follow-up email.

> Furthermore, is there any guarantee that the pages you free here
> were actually allocated? ->nr_frames gets set ahead of the actual
> allocation.
> 
vm_event_channels_free_buffer is called only from
vm_event_channels_disable. The latter is called only if vm_event_check
succeeds (vm_event_cleanup and vm_event_domctl/VM_EVENT_DISABLE).
vm_event_check will only return true if vm_event_enable succeeds.

> > +int vm_event_ng_get_frames(struct domain *d, unsigned int id,
> > +                           unsigned long frame, unsigned int
> > nr_frames,
> > +                           xen_pfn_t mfn_list[])
> > +{
> > +    struct vm_event_domain *ved;
> > +    int i;
> > +
> > +    switch (id )
> > +    {
> > +    case XEN_VM_EVENT_TYPE_MONITOR:
> > +        ved = d->vm_event_monitor;
> > +        break;
> > +
> > +    default:
> > +        return -ENOSYS;
> 
> Various other error codes might be fine here, but ENOSYS is not
> (despite pre-existing misuse elsewhere in the tree).

vm_event_domctl also returns -ENOSYS if the type is neither
XEN_VM_EVENT_TYPE_PAGING, XEN_VM_EVENT_TYPE_MONITOR, nor
XEN_VM_EVENT_TYPE_SHARING. I've just followed the existing convention.

> 
> > +    }
> > +
> > +    if ( !vm_event_check(ved) )
> > +        return -EINVAL;
> > +
> > +    if ( frame != 0 || nr_frames != to_channels(ved)->nr_frames )
> > +        return -EINVAL;
> 
> Is there a particular reason for this all-or-nothing model?

I've added this extra check due to the way the acquire_resource
interface is designed. In our case, the memory is allocated by
XEN_VM_EVENT_ENABLE and the size is known beforehand: the number of
pages needed to store (vcpus_count * sizeof vm_event_slot) bytes.
However, acquire_resource needs a "nr_frames" parameter, which is
computed in a similar manner in the libxc wrapper.
This check only verifies that userspace is not sending an invalid
parameter (using an ASSERT in this case would have been overkill
because it would crash the whole hypervisor).

> 
> > +    spin_lock(&ved->lock);
> > +
> > +    for ( i = 0; i < to_channels(ved)->nr_frames; i++ )
> > +        mfn_list[i] = mfn_x(to_channels(ved)->mfn[i]);
> > +
> > +    spin_unlock(&ved->lock);
> 
> What is the locking good for here? You obviously can't be afraid of
> the values becoming stale, as they surely would be by the time the
> caller gets to see them (if they can go stale in the first place).
Thanks for pointing this out. I will remove the lock in the next
patchset iteration.
> 
> > --- a/xen/include/public/domctl.h
> > +++ b/xen/include/public/domctl.h
> > @@ -38,7 +38,7 @@
> >   #include "hvm/save.h"
> >   #include "memory.h"
> >   
> > -#define XEN_DOMCTL_INTERFACE_VERSION 0x00000011
> > +#define XEN_DOMCTL_INTERFACE_VERSION 0x00000012
> 
> This looks to be needed only because of ...
> 
> > @@ -781,12 +781,20 @@ struct xen_domctl_gdbsx_domstatus {
> >   struct xen_domctl_vm_event_op {
> >       uint32_t       op;           /* XEN_VM_EVENT_* */
> >       uint32_t       type;         /* XEN_VM_EVENT_TYPE_* */
> > + /* Use the NG interface */
> > +#define _XEN_VM_EVENT_FLAGS_NG_OP         0
> > +#define XEN_VM_EVENT_FLAGS_NG_OP          (1U <<
> > _XEN_VM_EVENT_FLAGS_NG_OP)
> > +    uint32_t       flags;
> 
> ... this addition. Is the new field really warranted here? Can't
> you e.g. simply define a new set of ops (e.g. by setting their
> high bits)?
> 
You are right. Actually only a new op is needed (e.g.
XEN_VM_EVENT_NG_ENABLE).
The only needed addition is the vcpu_id for the resume op, in order to
specify the vcpu which will handle the request in case of the
"channels" vm_event transport. However, this will not affect the
domctl's offsets, hence the interface version increment is not
required. I will change this in the next patchset iteration.
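
As a rough sketch of that plan (field names hypothetical, not from the
patch), the domctl could keep its current layout and only grow a union
member carrying the vcpu_id for the resume op:

    struct xen_domctl_vm_event_op {
        uint32_t op;    /* XEN_VM_EVENT_*, plus a new NG enable op */
        uint32_t type;  /* XEN_VM_EVENT_TYPE_* */
        union {
            struct {
                uint32_t port;  /* out: event channel (ring case) */
            } enable;
            uint32_t vcpu_id;   /* in: vcpu to resume (channels case) */
        } u;
    };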

> I've omitted all style comments (formatting, plain vs unsigned int
> etc) - I'd like to leave that to the VM event maintainers.
I will check the patch again for style errors, but in the meantime, if
something stands out, please let me know and I will fix it asap.
> 
> Jan

Many thanks for your support,
Petre
Petre Ovidiu PIRCALABU July 17, 2019, 2:46 p.m. UTC | #8
On Wed, 2019-07-17 at 16:42 +0300, Alexandru Stefan ISAILA wrote:
> > +
> > +out:
> > +    rc2 = xc_domain_unpause(xch, domain_id);
> > +    if ( rc1 || rc2 )
> > +    {
> > +        if ( rc2 )
> > +            PERROR("Unable to pause domain\n");
> > +
> > +        if ( rc1 == 0 )
> > +            rc1 = rc2;
> 
> You can use !rc1 here.
> 
> > +    }
> > +
> > +    return rc1;
> > +}
> > +
> > +int xc_vm_event_ng_disable(xc_interface *xch, uint32_t domain_id,
> > int type,
> > +                           xenforeignmemory_resource_handle
> > **fres)
> > +{
> > +    xenforeignmemory_unmap_resource(xch->fmem, *fres);
> > +    *fres = NULL;
> > +
> > +    return xc_vm_event_control(xch, domain_id,
> > XEN_VM_EVENT_DISABLE,
> > +                              type, XEN_VM_EVENT_FLAGS_NG_OP,
> > NULL);
> > +}
> > +
> 
> 
> 
> >   
> > +static int vm_event_ring_pfn_param(uint32_t type)
> > +{
> > +    switch( type )
> > +    {
> > +#ifdef CONFIG_HAS_MEM_PAGING
> > +    case XEN_VM_EVENT_TYPE_PAGING:
> > +        return HVM_PARAM_PAGING_RING_PFN;
> > +#endif
> > +    case XEN_VM_EVENT_TYPE_MONITOR:
> > +        return HVM_PARAM_MONITOR_RING_PFN;
> > +#ifdef CONFIG_HAS_MEM_SHARING
> > +    case XEN_VM_EVENT_TYPE_SHARING:
> > +        return HVM_PARAM_SHARING_RING_PFN;
> > +#endif
> > +    };
> > +
> > +    ASSERT_UNREACHABLE();
> > +    return -1;
> 
> Blank line before final return...
> 
> > +}
> > +
> > +static int vm_event_pause_flag(uint32_t type)
> > +{
> > +    switch( type )
> > +    {
> > +#ifdef CONFIG_HAS_MEM_PAGING
> > +    case XEN_VM_EVENT_TYPE_PAGING:
> > +        return _VPF_mem_paging;
> > +#endif
> > +    case XEN_VM_EVENT_TYPE_MONITOR:
> > +        return _VPF_mem_access;
> > +#ifdef CONFIG_HAS_MEM_SHARING
> > +    case XEN_VM_EVENT_TYPE_SHARING:
> > +        return _VPF_mem_sharing;
> > +#endif
> > +    };
> > +
> > +    ASSERT_UNREACHABLE();
> > +    return -1;
> 
> here
> 
> > +}
> > +
> > +#ifdef CONFIG_HAS_MEM_PAGING
> > +static void mem_paging_notification(struct vcpu *v, unsigned int
> > port);
> > +#endif
> > +static void monitor_notification(struct vcpu *v, unsigned int
> > port);
> > +#ifdef CONFIG_HAS_MEM_SHARING
> > +static void mem_sharing_notification(struct vcpu *v, unsigned int
> > port);
> > +#endif
> > +
> > +static xen_event_channel_notification_t
> > vm_event_notification_fn(uint32_t type)
> > +{
> > +    switch( type )
> > +    {
> > +#ifdef CONFIG_HAS_MEM_PAGING
> > +    case XEN_VM_EVENT_TYPE_PAGING:
> > +        return mem_paging_notification;
> > +#endif
> > +    case XEN_VM_EVENT_TYPE_MONITOR:
> > +        return monitor_notification;
> > +#ifdef CONFIG_HAS_MEM_SHARING
> > +    case XEN_VM_EVENT_TYPE_SHARING:
> > +        return mem_sharing_notification;
> > +#endif
> > +    };
> > +
> > +    ASSERT_UNREACHABLE();
> > +    return NULL;
> 
> and here
> 
> > +}
> > +
> > +/*
> > + * VM event ring implementation;
> > + */
> 
> Alex
Thanks for noticing these. I will fix them in the next patch iteration.

Petre
Jan Beulich July 17, 2019, 4:32 p.m. UTC | #9
On 17.07.2019 16:41, Petre Ovidiu PIRCALABU wrote:
> On Wed, 2019-07-17 at 10:06 +0000, Jan Beulich wrote:
>> On 16.07.2019 19:06, Petre Pircalabu wrote:
>>> +static void vm_event_channels_free_buffer(struct
>>> vm_event_channels_domain *impl)
>>>    {
>>> -    vm_event_ring_resume(to_ring(v->domain->vm_event_monitor));
>>> +    int i;
>>> +
>>> +    vunmap(impl->slots);
>>> +    impl->slots = NULL;
>>> +
>>> +    for ( i = 0; i < impl->nr_frames; i++ )
>>> +        free_domheap_page(mfn_to_page(impl->mfn[i]));
>>>    }
>>
>> What guarantees that there are no mappings left of the pages you free
>> here? Sharing pages with guests is (so far) an (almost) irreversible
>> action, i.e. they may generally only be freed upon domain
>> destruction.
>> See gnttab_unpopulate_status_frames() for a case where we actually
>> make an attempt at freeing such pages (but where we fail the request
>> in case references are left in place).
>>
> I've tested manually 2 cases and they both work (no crashes):
> 1: introspected domain starts -> monitor starts -> monitor stops ->
> domain stops
> 2: introspected domain starts -> monitor starts -> domain stops ->
> monitor stops.

Well, testing is important, but doing tests like you describe won't
catch any misbehaving or malicious monitor domain.

> However, I will take a closer look at gnttab_unpopulate_status_frames
> and post a follow up email.

Thanks.

>> Furthermore, is there any guarantee that the pages you free here
>> were actually allocated? ->nr_frames gets set ahead of the actual
>> allocation.
>>
> vm_event_channels_free_buffer is called only from
> vm_event_channels_disable. The latter is called only if vm_event_check
> succeeds ( vm_event_cleanup and vm_event_domctl/VM_EVENT_DISABLE).
> vm_event_check will only return true if vm_event_enable succeeds.

Hmm, looks like I was misled into believing the freeing function
would be called by vm_event_channels_enable(), rather than that
function itself freeing what it might have allocated upon error. So
perhaps the bug is there, not where I thought it would be.

>>> +int vm_event_ng_get_frames(struct domain *d, unsigned int id,
>>> +                           unsigned long frame, unsigned int
>>> nr_frames,
>>> +                           xen_pfn_t mfn_list[])
>>> +{
>>> +    struct vm_event_domain *ved;
>>> +    int i;
>>> +
>>> +    switch (id )
>>> +    {
>>> +    case XEN_VM_EVENT_TYPE_MONITOR:
>>> +        ved = d->vm_event_monitor;
>>> +        break;
>>> +
>>> +    default:
>>> +        return -ENOSYS;
>>
>> Various other error codes might be fine here, but ENOSYS is not
>> (despite pre-existing misuse elsewhere in the tree).
> 
> vm_event_domctl also returns -ENOSYS if the type is neither
> XEN_VM_EVENT_TYPE_PAGING, XEN_VM_EVENT_TYPE_MONITOR, nor
> XEN_VM_EVENT_TYPE_SHARING. I've just followed the existing convention.

Right - that's one of the pre-existing misuses that I was referring
to.

>>> +    }
>>> +
>>> +    if ( !vm_event_check(ved) )
>>> +        return -EINVAL;
>>> +
>>> +    if ( frame != 0 || nr_frames != to_channels(ved)->nr_frames )
>>> +        return -EINVAL;
>>
>> Is there a particular reason for this all-or-nothing model?
> 
> I've added this extra check due to the way acquire_resource interface
> is designed. In our case, the memory is allocated from
> XEN_VM_EVENT_ENABLE and the size is known beforehand: the number of
> pages needed to store (vcpus_count * sizeof vm_event_slot) bytes.
> However the acquire_resource needs a "nr_frames" parameter which is
> computed in a similar manner in the libxc wrapper.

Hmm, maybe I'm not up to date here: Paul, is the general resource
obtaining/mapping logic indeed meant to be an all-or-nothing thing
only?

Jan
Paul Durrant July 17, 2019, 6:42 p.m. UTC | #10
> -----Original Message-----
[snip]
> >>> +    }
> >>> +
> >>> +    if ( !vm_event_check(ved) )
> >>> +        return -EINVAL;
> >>> +
> >>> +    if ( frame != 0 || nr_frames != to_channels(ved)->nr_frames )
> >>> +        return -EINVAL;
> >>
> >> Is there a particular reason for this all-or-nothing model?
> >
> > I've added this extra check due to the way acquire_resource interface
> > is designed. In our case, the memory is allocated from
> > XEN_VM_EVENT_ENABLE and the size is known beforehand: the number of
> > pages needed to store (vcpus_count * sizeof vm_event_slot) bytes.
> > However the acquire_resource needs a "nr_frames" parameter which is
> > computed in a similar manner in the libxc wrapper.
> 
> Hmm, maybe I'm not up to date here: Paul, is the general resource
> obtaining/mapping logic indeed meant to be an all-or-nothing thing
> only?
> 

Not really, no. The intention is that any subsection of the resource
space may be mapped, so as long as frame + nr_frames doesn't exceed the
size of the space there should be no reason to return an error.

  Paul

> Jan
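
A hedged sketch of the subrange check this implies, replacing the
all-or-nothing test quoted above:

    /* Allow mapping any window within the resource; only reject
     * ranges that wrap or extend past the end. */
    if ( frame + nr_frames < frame ||
         frame + nr_frames > to_channels(ved)->nr_frames )
        return -EINVAL;

    for ( i = 0; i < nr_frames; i++ )
        mfn_list[i] = mfn_x(to_channels(ved)->mfn[frame + i]);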
Petre Ovidiu PIRCALABU July 18, 2019, 1:59 p.m. UTC | #11
On Wed, 2019-07-17 at 16:32 +0000, Jan Beulich wrote:
> On 17.07.2019 16:41, Petre Ovidiu PIRCALABU wrote:
> > On Wed, 2019-07-17 at 10:06 +0000, Jan Beulich wrote:
> > > On 16.07.2019 19:06, Petre Pircalabu wrote:
> > > > +static void vm_event_channels_free_buffer(struct
> > > > vm_event_channels_domain *impl)
> > > >    {
> > > > -    vm_event_ring_resume(to_ring(v->domain-
> > > > >vm_event_monitor));
> > > > +    int i;
> > > > +
> > > > +    vunmap(impl->slots);
> > > > +    impl->slots = NULL;
> > > > +
> > > > +    for ( i = 0; i < impl->nr_frames; i++ )
> > > > +        free_domheap_page(mfn_to_page(impl->mfn[i]));
> > > >    }
> > > 
> > > What guarantees that there are no mappings left of the pages you
> > > free
> > > here? Sharing pages with guests is (so far) an (almost)
> > > irreversible
> > > action, i.e. they may generally only be freed upon domain
> > > destruction.
> > > See gnttab_unpopulate_status_frames() for a case where we
> > > actually
> > > make an attempt at freeing such pages (but where we fail the
> > > request
> > > in case references are left in place).
> > > 
> > 
> > I've tested manually 2 cases and they both work (no crashes):
> > 1: introspected domain starts -> monitor starts -> monitor stops ->
> > domain stops
> > 2: introspected domain starts -> monitor starts -> domain stops ->
> > monitor stops.
> 
> Well, testing is important, but doing tests like you describe won't
> catch any misbehaving or malicious monitor domain.
> 
> > However, I will take a closer look at
> > gnttab_unpopulate_status_frames
> > and post a follow up email.
> 
> Thanks.
> 
Hi Jan,

Could you help me clarify some things? Maybe I am approaching the whole
problem incorrectly.

To help explaining things a little better I will use the following
abbreviations:
ID - introspected domain (the domain for which the vm_event requests
are generated)
MD - monitor domain (the domain which handles the requests and posts
the responses)

The legacy approach (ring) is to have a dedicated gfn in the ID (the
ring page), which is mapped by XEN using __map_domain_page_global; the
MD then uses xc_map_foreign_pages to create its mapping and
xc_domain_decrease_reservation_exact to remove the page from the ID's
physmap.
There are a number of problems with this approach, the most impactful
being that guests with a high number of vcpus fill up the ring quite
quickly. This, and the fact that the vm_event_request size increases as
monitor applications become more complex, incurs idle time for vcpus
waiting to post a request.

To alleviate this problem I need to have a number of frames shared
between the hypervisor and the MD. The ID doesn't need to know about
those frames because it will never access this memory area (unlike
ioreq, which intercepts accesses to certain addresses).

Before using xenforeignmemory_map_resource I investigated several
different approaches:
- Allocate the memory in the hypervisor and xc_map_foreign_pages
(doesn't work because you cannot "foreignmap" pages of your own
domain).
- Allocate the memory in the guest, and map the allocated pages in XEN.
To my knowledge there is no such API in linux to do this, and the
monitor application, as a userspace program, is not aware of the actual
gfns for an allocated memory area.

So, at this point the most promising solution is allocating the memory
in XEN, sharing it with ID using share_xen_page_with_guest, and mapping
it with xenforeignmemory_map_resource (with the flag
XENMEM_rsrc_acq_caller_owned set)

To my understanding the cleanup sequence from
gnttab_unpopulate_status_frames basically boils down to:
1. guest_physmap_remove_page 
2. if ( test_and_clear_bit(_PGC_allocated, &pg->count_info) )
       put_page(pg);
3. free_xenheap_page

My current implementation uses vzalloc instead of alloc_xenheap_pages,
and that's why I assumed vunmap and free_domheap_pages would suffice (I
would have called vfree directly, but the temporary linked list that is
used to hold the page references causes free_domheap_pages to crash).

Do I also have to call guest_physmap_remove_page and put_page? (steps
1 and 2)

Many thanks for your support,
Petre
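
For reference, the legacy ring setup described above looks roughly like
this from the MD side (monitor case, error handling elided); these
libxc calls exist, but the snippet is a sketch, not code from this
patch:

    uint64_t param;
    xen_pfn_t ring_pfn;
    void *ring_page;

    /* The ring gfn is advertised via an HVM param. */
    xc_hvm_param_get(xch, domain_id, HVM_PARAM_MONITOR_RING_PFN, &param);
    ring_pfn = param;

    /* The MD maps the page... */
    ring_page = xc_map_foreign_pages(xch, domain_id,
                                     PROT_READ | PROT_WRITE,
                                     &ring_pfn, 1);

    /* ...and removes it from the ID's physmap. */
    xc_domain_decrease_reservation_exact(xch, domain_id, 1, 0, &ring_pfn);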
Jan Beulich July 18, 2019, 2:44 p.m. UTC | #12
On 18.07.2019 15:59, Petre Ovidiu PIRCALABU wrote:
> Before using xenforeignmemory_map_resource I investigated several
> different approaches:
> - Allocate the memory in hypervisor and xc_map_foreign_pages (doesn't
> work because you cannot "foreignmap" pages of your own domain.
> - Allocate the memory in guest, and map the allocated pages in XEN. To
> my knowledge there is no such API in linux to do this and the monitor
> application, as an userspace program, is not aware of the actual gfns
> for an allocated memory area.
> 
> So, at this point the most promising solution is allocating the memory
> in XEN, sharing it with ID using share_xen_page_with_guest, and mapping
> it with xenforeignmemory_map_resource (with the flag
> XENMEM_rsrc_acq_caller_owned set)

Which is fine - that's why Paul had introduced the generic interface.

> To my understanding the cleanup sequence from
> gnttab_unpopulate_status_frames basically boils down to:
> 1. guest_physmap_remove_page
> 2. if ( test_and_clear_bit(_PGC_allocated, &pg->count_info) )
>         put_page(pg);
> 3. free_xenheap_page

You're missing the crucial part of undoing step 2 if you find
that there are still references left to the page.

And then, because of your use of vzalloc(), you can't use
free_xenheap_pages(), as that takes a (direct map) linear address
as input. It has to be free_domheap_pages() in your case.

> My current implementation uses vzalloc instead of alloc_xenheap_pages
> and that's why I assumed vunmap and free_domheap_pages will suffice (I
> would have called vfree directly, but the temporary linked list that is
> used to hold the page references causes free_domheap_pages to crash)
> 
> Do I also have to call guest_physmap_remove_page and put_page? (steps
> 1. and 2.)

guest_physmap_remove_page() needs to be called for translated page-
owning domains if the page was ever added to their physmap. As long
as you avoid adding, you also don't need to remove. I don't recall
though whether a translated domain can access a resource without it
having a representation in its GFN space.

You definitely need to do step 2 (which is to undo the respective
part of share_xen_page_with_guest()), and you _also_ need to clear
the page owner (undoing the other part of what that function has
done). And then, as said above, you need to check that the page
has no references left on it, and correctly deal with the case when
it still has some.

On the whole gnttab_unpopulate_status_frames() is very unfortunate
to have the way it is, including the various domain_crash()-es. It
was merely the only way we could see when dealing with XSA-255. I
would strongly recommend against new code trying to go this same
route, unless we introduce a generic and clean/safe function
inverting share_xen_page_with_guest() (which I've tried in the past
and failed).

Jan
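
For orientation, the shape such an inverse would take, following the
gnttab_unpopulate_status_frames() pattern; as noted above a clean,
generic version of this does not currently exist, so treat this as an
outline of the steps rather than working code (e.g. for domheap pages
the final put_page() may itself free the page, which is part of what
makes a safe generic inverse hard):

    BUG_ON(page_get_owner(pg) != d);

    /* Undo the allocation reference taken when the page was shared. */
    if ( test_and_clear_bit(_PGC_allocated, &pg->count_info) )
        put_page(pg);

    /* Fail (rather than free) if stray references remain; a complete
     * implementation must also undo the step above in that case. */
    if ( pg->count_info & PGC_count_mask )
        return -EBUSY;

    /* Clear the owner set by share_xen_page_with_guest() before
     * handing the page back to the allocator. */
    page_set_owner(pg, NULL);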
Petre Ovidiu PIRCALABU July 18, 2019, 2:55 p.m. UTC | #13
On Thu, 2019-07-18 at 14:44 +0000, Jan Beulich wrote:
> On 18.07.2019 15:59, Petre Ovidiu PIRCALABU wrote:
> > Before using xenforeignmemory_map_resource I investigated several
> > different approaches:
> > - Allocate the memory in hypervisor and xc_map_foreign_pages
> > (doesn't
> > work because you cannot "foreignmap" pages of your own domain.
> > - Allocate the memory in guest, and map the allocated pages in XEN.
> > To
> > my knowledge there is no such API in linux to do this and the
> > monitor
> > application, as an userspace program, is not aware of the actual
> > gfns
> > for an allocated memory area.
> > 
> > So, at this point the most promising solution is allocating the
> > memory
> > in XEN, sharing it with ID using share_xen_page_with_guest, and
> > mapping
> > it with xenforeignmemory_map_resource (with the flag
> > XENMEM_rsrc_acq_caller_owned set)
> 
> Which is fine - that's why Paul had introduced the generic interface.
> 
> > To my understanding the cleanup sequence from
> > gnttab_unpopulate_status_frames basically boils down to:
> > 1. guest_physmap_remove_page
> > 2. if ( test_and_clear_bit(_PGC_allocated, &pg->count_info) )
> >         put_page(pg);
> > 3. free_xenheap_page
> 
> You're missing the crucial part of undoing step 2 if you find
> that there are still references left to the page.
> 
> And then, because of your use of vzalloc(), you can't use
> free_xenheap_pages(), as that takes a (direct map) linear address
> as input. It has to be free_domheap_pages() in your case.
> 
> > My current implementation uses vzalloc instead of
> > alloc_xenheap_pages
> > and that's why I assumed vunmap and free_domheap_pages will suffice
> > (I
> > would have called vfree directly, but the temporary linked list
> > that is
> > used to hold the page references causes free_domheap_pages to
> > crash)
> > 
> > Do I also have to call guest_physmap_remove_page and put_page?
> > (steps
> > 1. and 2.)
> 
> guest_physmap_remove_page() needs to be called for translated page-
> owning domains if the page was ever added to their physmap. As long
> as you avoid adding, you also don't need to remove. I don't recall
> though whether a translated domain can access a resource without it
> having a representation in its GFN space.
> 
> You definitely need to do step 2 (which is to undo the respective
> part of share_xen_page_with_guest(), and you _also_ need to clear
> the page owner (undoing the other part of what that function has
> done). And then, as said above, you need to check that the page
> has no references left on it, and correctly deal with the case when
> it still has some.
> 
> On the whole gnttab_unpopulate_status_frames() is very unfortunate
> to have the way it is, including the various domain_crash()-es. It
> was merely the only way we could see when dealing with XSA-255. I
> would strongly recommend against new code trying to go this same
> route, unless we introduce a generic and clean/safe function
> inverting share_xen_page_with_guest() (which I've tried in the past
> and failed).
> 
> Jan

Thank you very much for explaining this to me. I will update the
cleanup procedure and let you know the outcome.

Best regards,
Petre
Paul Durrant July 19, 2019, 7:56 a.m. UTC | #14
> -----Original Message-----
> From: Petre Ovidiu PIRCALABU <ppircalabu@bitdefender.com>
> Sent: 18 July 2019 14:59
> To: Jan Beulich <JBeulich@suse.com>; Paul Durrant <Paul.Durrant@citrix.com>
> Cc: Julien Grall <julien.grall@arm.com>; Alexandru Stefan ISAILA <aisaila@bitdefender.com>; Razvan
> Cojocaru <rcojocaru@bitdefender.com>; Andrew Cooper <Andrew.Cooper3@citrix.com>; Roger Pau Monne
> <roger.pau@citrix.com>; George Dunlap <George.Dunlap@citrix.com>; Ian Jackson
> <Ian.Jackson@citrix.com>; Stefano Stabellini <sstabellini@kernel.org>; xen-devel@lists.xenproject.org;
> Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>; Tamas K Lengyel <tamas@tklengyel.com>; Tim (Xen.org)
> <tim@xen.org>; Wei Liu <wl@xen.org>
> Subject: Re: [PATCH v2 07/10] vm_event: Add vm_event_ng interface
> 
> On Wed, 2019-07-17 at 16:32 +0000, Jan Beulich wrote:
> > On 17.07.2019 16:41, Petre Ovidiu PIRCALABU wrote:
> > > On Wed, 2019-07-17 at 10:06 +0000, Jan Beulich wrote:
> > > > On 16.07.2019 19:06, Petre Pircalabu wrote:
> > > > > +static void vm_event_channels_free_buffer(struct
> > > > > vm_event_channels_domain *impl)
> > > > >    {
> > > > > -    vm_event_ring_resume(to_ring(v->domain-
> > > > > >vm_event_monitor));
> > > > > +    int i;
> > > > > +
> > > > > +    vunmap(impl->slots);
> > > > > +    impl->slots = NULL;
> > > > > +
> > > > > +    for ( i = 0; i < impl->nr_frames; i++ )
> > > > > +        free_domheap_page(mfn_to_page(impl->mfn[i]));
> > > > >    }
> > > >
> > > > What guarantees that there are no mappings left of the pages you
> > > > free
> > > > here? Sharing pages with guests is (so far) an (almost)
> > > > irreversible
> > > > action, i.e. they may generally only be freed upon domain
> > > > destruction.
> > > > See gnttab_unpopulate_status_frames() for a case where we
> > > > actually
> > > > make an attempt at freeing such pages (but where we fail the
> > > > request
> > > > in case references are left in place).
> > > >
> > >
> > > I've tested manually 2 cases and they both work (no crashes):
> > > 1: introspected domain starts -> monitor starts -> monitor stops ->
> > > domain stops
> > > 2: introspected domain starts -> monitor starts -> domain stops ->
> > > monitor stops.
> >
> > Well, testing is important, but doing tests like you describe won't
> > catch any misbehaving or malicious monitor domain.
> >
> > > However, I will take a closer look at
> > > gnttab_unpopulate_status_frames
> > > and post a follow up email.
> >
> > Thanks.
> >
> Hi Jan,
> 
> Could you help me clarify some things? Maybe am approaching the whole
> problem incorrectly.
> 
> To help explaining things a little better I will use the following
> abbreviations:
> ID - introspected domain (the domain for which the vm_event requests
> are generated)
> MD - monitor domain (the domain which handles the requests and posts
> the responses)
> 
> The legacy approach (ring) is to have a dedicated gfn in ID (ring
> page), which is mapped by XEN using __map_domain_page_global and then
> MD use xc_map_foreign_pages to create the mapping and
> xc_domain_decrease_reservation_exact to remove the page from ID's
> physmap.
> There are a number of problems with this approach, the most impactful
> being that guests with a high number of vcpus fill up the ring quite
> quickly. This, and the fact that the vm_event_request size increases
> as monitor applications become more complex, incurs idle time for
> vcpus waiting to post a request.
> 
> To alleviate this problem I need to have a number of frames shared
> between the hypervisor and MD. The ID doesn't need to know about those
> frames because it will never access this memory area (unlike ioreq who
> intercepts the access to certain addresses).
> 
> Before using xenforeignmemory_map_resource I investigated several
> different approaches:
> - Allocate the memory in hypervisor and xc_map_foreign_pages (doesn't
> work because you cannot "foreignmap" pages of your own domain).
> - Allocate the memory in guest, and map the allocated pages in XEN. To
> my knowledge there is no such API in linux to do this and the monitor
> application, as a userspace program, is not aware of the actual gfns
> for an allocated memory area.
> 
> So, at this point the most promising solution is allocating the memory
> in XEN, sharing it with ID using share_xen_page_with_guest, and mapping
> it with xenforeignmemory_map_resource (with the flag
> XENMEM_rsrc_acq_caller_owned set)

If that page is shared with the ID then XENMEM_rsrc_acq_caller_owned
should *not* be set. Also, that flag is an 'out' flag... the caller
doesn't decide who owns the resource. TBH I regret ever introducing the
flag; it caused a lot of problems, which is why it is no longer used.

  Paul
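
For context, a hedged sketch of how the MD side might map the channel
buffer via the resource interface being discussed; the resource type
XENMEM_resource_vm_event is assumed here to be what the patch adds to
memory.h (error handling elided):

    xenforeignmemory_resource_handle *fres;
    void *buf = NULL;

    fres = xenforeignmemory_map_resource(fmem, domain_id,
                                         XENMEM_resource_vm_event,
                                         XEN_VM_EVENT_TYPE_MONITOR,
                                         0 /* frame */, nr_frames,
                                         &buf,
                                         PROT_READ | PROT_WRITE, 0);

    /* buf now covers nr_frames pages of per-vcpu slots; it is torn
     * down later via xenforeignmemory_unmap_resource(fmem, fres). */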
Jan Beulich July 19, 2019, 8:22 a.m. UTC | #15
On 19.07.2019 09:56, Paul Durrant wrote:
>> From: Petre Ovidiu PIRCALABU <ppircalabu@bitdefender.com>
>> Sent: 18 July 2019 14:59
>>
>> So, at this point the most promising solution is allocating the memory
>> in XEN, sharing it with ID using share_xen_page_with_guest, and mapping
>> it with xenforeignmemory_map_resource (with the flag
>> XENMEM_rsrc_acq_caller_owned set)
> 
> If that page is shared with the ID then XENMEM_rsrc_acq_caller_owned
> should *not* be set. Also, that flag is an 'out' flag... the caller
> doesn't decide who owns the resource.

I had implied that it's really the MD that's meant here, but maybe I
was wrong to do so.

> TBH I regret ever introducing the flag; it caused a lot of problems,
> which is why it is no longer used.

It's a tools-only interface - why don't we drop the flag if you now
think it was a bad idea to introduce it?

Jan
Paul Durrant July 19, 2019, 8:26 a.m. UTC | #16
> -----Original Message-----
> From: Jan Beulich <JBeulich@suse.com>
> Sent: 19 July 2019 09:22
> To: Paul Durrant <Paul.Durrant@citrix.com>
> Cc: 'Petre Ovidiu PIRCALABU' <ppircalabu@bitdefender.com>; Julien Grall <julien.grall@arm.com>;
> Alexandru Stefan ISAILA <aisaila@bitdefender.com>; Razvan Cojocaru <rcojocaru@bitdefender.com>; Andrew
> Cooper <Andrew.Cooper3@citrix.com>; George Dunlap <George.Dunlap@citrix.com>; Ian Jackson
> <Ian.Jackson@citrix.com>; Roger Pau Monne <roger.pau@citrix.com>; Stefano Stabellini
> <sstabellini@kernel.org>; xen-devel@lists.xenproject.org; Konrad Rzeszutek Wilk
> <konrad.wilk@oracle.com>; Tamas K Lengyel <tamas@tklengyel.com>; Tim (Xen.org) <tim@xen.org>; Wei Liu
> <wl@xen.org>
> Subject: Re: [PATCH v2 07/10] vm_event: Add vm_event_ng interface
> 
> On 19.07.2019 09:56, Paul Durrant wrote:
> >> From: Petre Ovidiu PIRCALABU <ppircalabu@bitdefender.com>
> >> Sent: 18 July 2019 14:59
> >>
> >> So, at this point the most promising solution is allocating the memory
> >> in XEN, sharing it with ID using share_xen_page_with_guest, and mapping
> >> it with xenforeignmemory_map_resource (with the flag
> >> XENMEM_rsrc_acq_caller_owned set)
> >
> > If that page is shared with the ID then XENMEM_rsrc_acq_caller_owned
> > should *not* be set. Also, that flag is an 'out' flag... the caller
> > doesn't decide who owns the resource.
> 
> I had implied that it's really MD that's meant here, but maybe I was
> wrong doing so.
> 
> > TBH I regret ever introducing the flag; it caused a lot of problems,
> > which is why it is no longer used.
> 
> It's a tools only interface - why don't we drop the flag if you now
> think it was a bad idea to introduce it?

I was indeed thinking I should find enough tuits to do that in the near future.

  Paul

> 
> Jan
Petre Ovidiu PIRCALABU July 19, 2019, 11:23 a.m. UTC | #17
On Fri, 2019-07-19 at 08:26 +0000, Paul Durrant wrote:
> > -----Original Message-----
> > From: Jan Beulich <JBeulich@suse.com>
> > Sent: 19 July 2019 09:22
> > To: Paul Durrant <Paul.Durrant@citrix.com>
> > Cc: 'Petre Ovidiu PIRCALABU' <ppircalabu@bitdefender.com>;
> > JulienGrall <julien.grall@arm.com>;
> > Alexandru Stefan ISAILA <aisaila@bitdefender.com>; Razvan Cojocaru
> > <rcojocaru@bitdefender.com>; Andrew
> > Cooper <Andrew.Cooper3@citrix.com>; George Dunlap <
> > George.Dunlap@citrix.com>; Ian Jackson
> > <Ian.Jackson@citrix.com>; Roger Pau Monne <roger.pau@citrix.com>;
> > Stefano Stabellini
> > <sstabellini@kernel.org>; xen-devel@lists.xenproject.org;
> > KonradRzeszutek Wilk
> > <konrad.wilk@oracle.com>; Tamas K Lengyel <tamas@tklengyel.com>;
> > Tim (Xen.org) <tim@xen.org>; Wei Liu
> > <wl@xen.org>
> > Subject: Re: [PATCH v2 07/10] vm_event: Add vm_event_ng interface
> > 
> > On 19.07.2019 09:56, Paul Durrant wrote:
> > > > From: Petre Ovidiu PIRCALABU <ppircalabu@bitdefender.com>
> > > > Sent: 18 July 2019 14:59
> > > > 
> > > > So, at this point the most promising solution is allocating the
> > > > memory
> > > > in XEN, sharing it with ID using share_xen_page_with_guest, and
> > > > mapping
> > > > it with xenforeignmemory_map_resource (with the flag
> > > > XENMEM_rsrc_acq_caller_owned set)
> > > 
> > > If that page is shared with the ID then
> > > XENMEM_rsrc_acq_caller_owned
> > > should *not* be set. Also, that flag is an 'out' flag... the
> > > caller
> > > doesn't decide who owns the resource.
> > 
> > I had implied that it's really MD that's meant here, but maybe I
> > was
> > wrong doing so.
> > 
> > > TBH I regret ever introducing the flag; it caused a lot of
> > > problems,
> > > which is why it is no longer used.
> > 
> > It's a tools only interface - why don't we drop the flag if you now
> > think it was a bad idea to introduce it?
> 
> I was indeed thinking I should find enough tuits to do that in the
> near future.
> 
>   Paul
> 
> > 
> > Jan
Sorry, my mistake. I meant to say it's shared with MD. 

Many thanks for your support,
Petre
Paul Durrant July 19, 2019, 12:11 p.m. UTC | #18
> -----Original Message-----
> From: Petre Ovidiu PIRCALABU <ppircalabu@bitdefender.com>
> Sent: 19 July 2019 12:24
> To: Paul Durrant <Paul.Durrant@citrix.com>; 'Jan Beulich' <JBeulich@suse.com>
> Cc: Julien Grall <julien.grall@arm.com>; Alexandru Stefan ISAILA <aisaila@bitdefender.com>; Razvan
> Cojocaru <rcojocaru@bitdefender.com>; Andrew Cooper <Andrew.Cooper3@citrix.com>; George Dunlap
> <George.Dunlap@citrix.com>; Ian Jackson <Ian.Jackson@citrix.com>; Roger Pau Monne
> <roger.pau@citrix.com>; Stefano Stabellini <sstabellini@kernel.org>; xen-devel@lists.xenproject.org;
> Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>; Tamas K Lengyel <tamas@tklengyel.com>; Tim (Xen.org)
> <tim@xen.org>; Wei Liu <wl@xen.org>
> Subject: Re: [PATCH v2 07/10] vm_event: Add vm_event_ng interface
> 
> On Fri, 2019-07-19 at 08:26 +0000, Paul Durrant wrote:
> > > -----Original Message-----
> > > From: Jan Beulich <JBeulich@suse.com>
> > > Sent: 19 July 2019 09:22
> > > To: Paul Durrant <Paul.Durrant@citrix.com>
> > > Cc: 'Petre Ovidiu PIRCALABU' <ppircalabu@bitdefender.com>;
> > > JulienGrall <julien.grall@arm.com>;
> > > Alexandru Stefan ISAILA <aisaila@bitdefender.com>; Razvan Cojocaru
> > > <rcojocaru@bitdefender.com>; Andrew
> > > Cooper <Andrew.Cooper3@citrix.com>; George Dunlap <
> > > George.Dunlap@citrix.com>; Ian Jackson
> > > <Ian.Jackson@citrix.com>; Roger Pau Monne <roger.pau@citrix.com>;
> > > Stefano Stabellini
> > > <sstabellini@kernel.org>; xen-devel@lists.xenproject.org;
> > > KonradRzeszutek Wilk
> > > <konrad.wilk@oracle.com>; Tamas K Lengyel <tamas@tklengyel.com>;
> > > Tim (Xen.org) <tim@xen.org>; Wei Liu
> > > <wl@xen.org>
> > > Subject: Re: [PATCH v2 07/10] vm_event: Add vm_event_ng interface
> > >
> > > On 19.07.2019 09:56, Paul Durrant wrote:
> > > > > From: Petre Ovidiu PIRCALABU <ppircalabu@bitdefender.com>
> > > > > Sent: 18 July 2019 14:59
> > > > >
> > > > > So, at this point the most promising solution is allocating the
> > > > > memory
> > > > > in XEN, sharing it with ID using share_xen_page_with_guest, and
> > > > > mapping
> > > > > it with xenforeignmemory_map_resource (with the flag
> > > > > XENMEM_rsrc_acq_caller_owned set)
> > > >
> > > > If that page is shared with the ID then
> > > > XENMEM_rsrc_acq_caller_owned
> > > > should *not* be set. Also, that flag is an 'out' flag... the
> > > > caller
> > > > doesn't decide who owns the resource.
> > >
> > > I had implied that it's really MD that's meant here, but maybe I
> > > was
> > > wrong doing so.
> > >
> > > > TBH I regret ever introducing the flag; it caused a lot of
> > > > problems,
> > > > which is why it is no longer used.
> > >
> > > It's a tools only interface - why don't we drop the flag if you now
> > > think it was a bad idea to introduce it?
> >
> > I was indeed thinking I should find enough tuits to do that in the
> > near future.
> >
> >   Paul
> >
> > >
> > > Jan
> Sorry, my mistake. I meant to say it's shared with MD.
> 
> Many thanks for your support,

Ok, in that case please share with the ID instead. I found some tuits
this morning and am about to submit a patch.

  Paul
Jan Beulich July 19, 2019, 12:32 p.m. UTC | #19
On 19.07.2019 14:11, Paul Durrant wrote:
>> -----Original Message-----
>> From: Petre Ovidiu PIRCALABU <ppircalabu@bitdefender.com>
>> Sent: 19 July 2019 12:24
>>
>> On Fri, 2019-07-19 at 08:26 +0000, Paul Durrant wrote:
>>>> From: Jan Beulich <JBeulich@suse.com>
>>>> Sent: 19 July 2019 09:22
>>>>
>>>> On 19.07.2019 09:56, Paul Durrant wrote:
>>>>> If that page is shared with the ID then
>>>>> XENMEM_rsrc_acq_caller_owned
>>>>> should *not* be set. Also, that flag is an 'out' flag... the
>>>>> caller
>>>>> doesn't decide who owns the resource.
>>>>
>>>> I had implied that it's really MD that's meant here, but maybe I
>>>> was
>>>> wrong doing so.
>>>>
>>>>> TBH I regret ever introducing the flag; it caused a lot of
>>>>> problems,
>>>> which is why it is no longer used.
>>>>
>>>> It's a tools only interface - why don't we drop the flag if you now
>>>> think it was a bad idea to introduce it?
>>>
>>> I was indeed thinking I should find enough tuits to do that in the
>>> near future.
>>>
>> Sorry, my mistake. I meant to say it's shared with MD.
>>
>> Many thanks for your support,
> 
> Ok, in that case please share with the ID instead.

But that's exactly what we want to avoid: If sharing at all, then
please with the more privileged entity. How would MD access the page
if it's shared with ID (which, aiui, has no business accessing the
page at all)?

Jan
Paul Durrant July 19, 2019, 12:37 p.m. UTC | #20
> -----Original Message-----
> From: Jan Beulich <JBeulich@suse.com>
> Sent: 19 July 2019 13:32
> To: Paul Durrant <Paul.Durrant@citrix.com>
> Cc: 'Petre Ovidiu PIRCALABU' <ppircalabu@bitdefender.com>; Julien Grall <julien.grall@arm.com>;
> Alexandru Stefan ISAILA <aisaila@bitdefender.com>; Razvan Cojocaru <rcojocaru@bitdefender.com>; Andrew
> Cooper <Andrew.Cooper3@citrix.com>; George Dunlap <George.Dunlap@citrix.com>; Ian Jackson
> <Ian.Jackson@citrix.com>; Roger Pau Monne <roger.pau@citrix.com>; Stefano Stabellini
> <sstabellini@kernel.org>; xen-devel@lists.xenproject.org; Konrad Rzeszutek Wilk
> <konrad.wilk@oracle.com>; Tamas K Lengyel <tamas@tklengyel.com>; Tim (Xen.org) <tim@xen.org>; Wei Liu
> <wl@xen.org>
> Subject: Re: [PATCH v2 07/10] vm_event: Add vm_event_ng interface
> 
> On 19.07.2019 14:11, Paul Durrant wrote:
> >> -----Original Message-----
> >> From: Petre Ovidiu PIRCALABU <ppircalabu@bitdefender.com>
> >> Sent: 19 July 2019 12:24
> >>
> >> On Fri, 2019-07-19 at 08:26 +0000, Paul Durrant wrote:
> >>>> From: Jan Beulich <JBeulich@suse.com>
> >>>> Sent: 19 July 2019 09:22
> >>>>
> >>>> On 19.07.2019 09:56, Paul Durrant wrote:
> >>>>> If that page is shared with the ID then
> >>>>> XENMEM_rsrc_acq_caller_owned
> >>>>> should *not* be set. Also, that flag is an 'out' flag... the
> >>>>> caller
> >>>>> doesn't decide who owns the resource.
> >>>>
> >>>> I had implied that it's really MD that's meant here, but maybe I
> >>>> was
> >>>> wrong doing so.
> >>>>
> >>>>> TBH I regret ever introducing the flag; it caused a lot of
> >>>>> problems,
> >>>> which is why it is no longer used.
> >>>>
> >>>> It's a tools only interface - why don't we drop the flag if you now
> >>>> think it was a bad idea to introduce it?
> >>>
> >>> I was indeed thinking I should find enough tuits to do that in the
> >>> near future.
> >>>
> >> Sorry, my mistake. I meant to say it's shared with MD.
> >>
> >> Many thanks for your support,
> >
> > Ok, in that case please share with the ID instead.
> 
> But that's exactly what we want to avoid: If sharing at all, then
> please with the more privileged entity.

Why? We're talking HVM guests only here IIUC so this is equivalent to
IOREQ server... The pages are target-assigned so that foreign mapping
works, but protected from the guest itself because they are never in
the P2M.

> How would MD access the page
> if it's shared with ID (which, aiui, has no business accessing the
> page at all)?
> 

Using foreign mapping in the same way as IOREQ server. Otherwise we are
back into the ref counting and general accounting hell that target
assignment avoids. I agree that a better long-term solution is probably
desirable but I don't honestly know what that would look like.

  Paul

> Jan
Jan Beulich July 19, 2019, 12:59 p.m. UTC | #21
On 19.07.2019 14:37, Paul Durrant wrote:
>> From: Jan Beulich <JBeulich@suse.com>
>> Sent: 19 July 2019 13:32
>>
>> On 19.07.2019 14:11, Paul Durrant wrote:
>>>> -----Original Message-----
>>>> From: Petre Ovidiu PIRCALABU <ppircalabu@bitdefender.com>
>>>> Sent: 19 July 2019 12:24
>>>>
>>>> Sorry, my mistake. I meant to say it's shared with MD.
>>>>
>>>> Many thanks for your support,
>>>
>>> Ok, in that case please share with the ID instead.
>>
>> But that's exactly what we want to avoid: If sharing at all, then
>> please with the more privileged entity.
> 
> Why? We're talking HVM guests only here IIUC so this is equivalent
> to IOREQ server...

Not sure: The main vm_event.c files live in common/ and arch/x86/
respectively, so I thought at least architecturally VM events were
possible for PV as well. If it's indeed HVM-only, then following
the IOREQ server model in its entirety would of course be fine.

Jan
Petre Ovidiu PIRCALABU July 19, 2019, 5:40 p.m. UTC | #22
On Fri, 2019-07-19 at 12:59 +0000, Jan Beulich wrote:
> On 19.07.2019 14:37, Paul Durrant wrote:
> > > From: Jan Beulich <JBeulich@suse.com>
> > > Sent: 19 July 2019 13:32
> > > 
> > > On 19.07.2019 14:11, Paul Durrant wrote:
> > > > > -----Original Message-----
> > > > > From: Petre Ovidiu PIRCALABU <ppircalabu@bitdefender.com>
> > > > > Sent: 19 July 2019 12:24
> > > > > 
> > > > > Sorry, my mistake. I meant to say it's shared with MD.
> > > > > 
> > > > > Many thanks for your support,
> > > > 
> > > > Ok, in that case please share with the ID instead.
> > > 
> > > But that's exactly what we want to avoid: If sharing at all, then
> > > please with the more privileged entity.
> > 
> > Why? We're talking HVM guests only here IIUC so this is equivalent
> > to IOREQ server...
> 
> Not sure: The main vm_event.c files live in common/ and arch/x86/
> respectively, so I thought at least architecturally VM events were
> possible for PV as well. If it's indeed HVM-only, then following
> the IOREQ server model in its entirety would of course be fine.
> 
> Jan

In one of the previous versions of the patchset there was a suggestion
to implement the new vm_event transport using IOREQ, but it was
dropped.

https://lists.xenproject.org/archives/html/xen-devel/2019-04/msg00173.html

Also, unless there isn't a proper way to allocate the necessary pages,
I wouldn't introduce an HVM-only limitation because, other than the HVM
param used to keep track of the ring pfn, the vm_event mechanism is
quite generic.

Many thanks for your support,
Petre
Paul Durrant July 22, 2019, 7:58 a.m. UTC | #23
> -----Original Message-----
> From: Petre Ovidiu PIRCALABU <ppircalabu@bitdefender.com>
> Sent: 19 July 2019 18:40
> To: Jan Beulich <JBeulich@suse.com>; Paul Durrant <Paul.Durrant@citrix.com>; Andrew Cooper
> <Andrew.Cooper3@citrix.com>
> Cc: Julien Grall <julien.grall@arm.com>; Alexandru Stefan ISAILA <aisaila@bitdefender.com>; Razvan
> Cojocaru <rcojocaru@bitdefender.com>; George Dunlap <George.Dunlap@citrix.com>; Ian Jackson
> <Ian.Jackson@citrix.com>; Roger Pau Monne <roger.pau@citrix.com>; Stefano Stabellini
> <sstabellini@kernel.org>; xen-devel@lists.xenproject.org; Konrad Rzeszutek Wilk
> <konrad.wilk@oracle.com>; Tamas K Lengyel <tamas@tklengyel.com>; Tim (Xen.org) <tim@xen.org>; Wei Liu
> <wl@xen.org>
> Subject: Re: [PATCH v2 07/10] vm_event: Add vm_event_ng interface
> 
> On Fri, 2019-07-19 at 12:59 +0000, Jan Beulich wrote:
> > On 19.07.2019 14:37, Paul Durrant wrote:
> > > > From: Jan Beulich <JBeulich@suse.com>
> > > > Sent: 19 July 2019 13:32
> > > >
> > > > On 19.07.2019 14:11, Paul Durrant wrote:
> > > > > > -----Original Message-----
> > > > > > From: Petre Ovidiu PIRCALABU <ppircalabu@bitdefender.com>
> > > > > > Sent: 19 July 2019 12:24
> > > > > >
> > > > > > Sorry, my mistake. I meant to say it's shared with MD.
> > > > > >
> > > > > > Many thanks for your support,
> > > > >
> > > > > Ok, in that case please share with the ID instead.
> > > >
> > > > But that's exactly what we want to avoid: If sharing at all, then
> > > > please with the more privileged entity.
> > >
> > > Why? We're talking HVM guests only here IIUC so this is equivalent
> > > to IOREQ server...
> >
> > Not sure: The main vm_event.c files live in common/ and arch/x86/
> > respectively, so I thought at least architecturally VM events were
> > possible for PV as well. If it's indeed HVM-only, then following
> > the IOREQ server model in its entirety would of course be fine.
> >
> > Jan
> 
> In one of the previous version of the patchset there was a suggestion
> to implement the new vm_event transport using IOREQ, but it was dropped
> .
> 
> https://lists.xenproject.org/archives/html/xen-devel/2019-04/msg00173.html
> 
> Also, unless there isn't a proper way allocate the necessary pages, I
> wouldn't introduce a HVM-only limitation because, other than the HVM
> param used to keep track of the ring pfn, the vm_event mechanism is
> quite generic.

Well, with resource mapping you wouldn't need the HVM param, presumably. IMO it would be best to make this HVM only in the first instance, to avoid blocking progress. This case does highlight the need for a solution to the issue of resource accounting for PV guests. My suggestion is a page type that can be assigned to a guest but cannot be mapped to that guest... something akin to a page table perhaps?

  Paul

> 
> Many thanks for your support,
> Petre
Jan Beulich July 22, 2019, 7:59 a.m. UTC | #24
On 19.07.2019 19:40, Petre Ovidiu PIRCALABU wrote:
> On Fri, 2019-07-19 at 12:59 +0000, Jan Beulich wrote:
>> On 19.07.2019 14:37, Paul Durrant wrote:
>>>> From: Jan Beulich <JBeulich@suse.com>
>>>> Sent: 19 July 2019 13:32
>>>>
>>>> On 19.07.2019 14:11, Paul Durrant wrote:
>>>>>> -----Original Message-----
>>>>>> From: Petre Ovidiu PIRCALABU <ppircalabu@bitdefender.com>
>>>>>> Sent: 19 July 2019 12:24
>>>>>>
>>>>>> Sorry, my mistake. I meant to say it's shared with MD.
>>>>>>
>>>>>> Many thanks for your support,
>>>>>
>>>>> Ok, in that case please share with the ID instead.
>>>>
>>>> But that's exactly what we want to avoid: If sharing at all, then
>>>> please with the more privileged entity.
>>>
>>> Why? We're talking HVM guests only here IIUC so this is equivalent
>>> to IOREQ server...
>>
>> Not sure: The main vm_event.c files live in common/ and arch/x86/
>> respectively, so I thought at least architecturally VM events were
>> possible for PV as well. If it's indeed HVM-only, then following
>> the IOREQ server model in its entirety would of course be fine.
> 
> In one of the previous version of the patchset there was a suggestion
> to implement the new vm_event transport using IOREQ, but it was dropped
> .
> 
> https://lists.xenproject.org/archives/html/xen-devel/2019-04/msg00173.html

And validly so (imo), not least because of also being HVM-specific.

> Also, unless there isn't a proper way allocate the necessary pages, I
> wouldn't introduce a HVM-only limitation because, other than the HVM
> param used to keep track of the ring pfn, the vm_event mechanism is
> quite generic.

By "wouldn't introduce" do you mean "wouldn't want to introduce" or do
you mean to say you in fact wouldn't and I'm not seeing why that is?

Jan
Petre Ovidiu PIRCALABU July 22, 2019, 10:44 a.m. UTC | #25
On Mon, 2019-07-22 at 07:59 +0000, Jan Beulich wrote:
> On 19.07.2019 19:40, Petre Ovidiu PIRCALABU wrote:
> > On Fri, 2019-07-19 at 12:59 +0000, Jan Beulich wrote:
> > > On 19.07.2019 14:37, Paul Durrant wrote:
> > > > > From: Jan Beulich <JBeulich@suse.com>
> > > > > Sent: 19 July 2019 13:32
> > > > > 
> > > > > On 19.07.2019 14:11, Paul Durrant wrote:
> > > > > > > -----Original Message-----
> > > > > > > From: Petre Ovidiu PIRCALABU <ppircalabu@bitdefender.com>
> > > > > > > Sent: 19 July 2019 12:24
> > > > > > > 
> > > > > > > Sorry, my mistake. I meant to say it's shared with MD.
> > > > > > > 
> > > > > > > Many thanks for your support,
> > > > > > 
> > > > > > Ok, in that case please share with the ID instead.
> > > > > 
> > > > > But that's exactly what we want to avoid: If sharing at all,
> > > > > then
> > > > > please with the more privileged entity.
> > > > 
> > > > Why? We're talking HVM guests only here IIUC so this is
> > > > equivalent
> > > > to IOREQ server...
> > > 
> > > Not sure: The main vm_event.c files live in common/ and arch/x86/
> > > respectively, so I thought at least architecturally VM events
> > > were
> > > possible for PV as well. If it's indeed HVM-only, then following
> > > the IOREQ server model in its entirety would of course be fine.
> > 
> > In one of the previous version of the patchset there was a
> > suggestion
> > to implement the new vm_event transport using IOREQ, but it was
> > dropped
> > .
> > 
> > 
https://lists.xenproject.org/archives/html/xen-devel/2019-04/msg00173.html
> 
> And validly so (imo), not the least because of also being HVM
> specific.
> 
> > Also, unless there isn't a proper way allocate the necessary pages,
> > I
> > wouldn't introduce a HVM-only limitation because, other than the
> > HVM
> > param used to keep track of the ring pfn, the vm_event mechanism is
> > quite generic.
> 
> By "wouldn't introduce" do you mean "wouldn't want to introduce" or
> do
> you mean to say you in fact wouldn't and I'm not seeing why that is?
> 
> Jan

Well, I think "I would prefer not to" would have been better. The main
idea is that I wouldn't want to add a limitation to the applicability
of this feature unless it's the only possible solution.

Many thanks for your support,
Petre
Petre Ovidiu PIRCALABU July 31, 2019, 12:53 p.m. UTC | #26
On Thu, 2019-07-18 at 14:44 +0000, Jan Beulich wrote:
> On 18.07.2019 15:59, Petre Ovidiu PIRCALABU wrote:
> > Before using xenforeignmemory_map_resource I investigated several
> > different approaches:
> > - Allocate the memory in hypervisor and xc_map_foreign_pages
> > (doesn't
> > work because you cannot "foreignmap" pages of your own domain.
> > - Allocate the memory in guest, and map the allocated pages in XEN.
> > To
> > my knowledge there is no such API in linux to do this and the
> > monitor
> > application, as an userspace program, is not aware of the actual
> > gfns
> > for an allocated memory area.
> > 
> > So, at this point the most promising solution is allocating the
> > memory
> > in XEN, sharing it with ID using share_xen_page_with_guest, and
> > mapping
> > it with xenforeignmemory_map_resource (with the flag
> > XENMEM_rsrc_acq_caller_owned set)
> 
> Which is fine - that's why Paul had introduced the generic interface.
> 
> > To my understanding the cleanup sequence from
> > gnttab_unpopulate_status_frames basically boils down to:
> > 1. guest_physmap_remove_page
> > 2. if ( test_and_clear_bit(_PGC_allocated, &pg->count_info) )
> >         put_page(pg);
> > 3. free_xenheap_page
> 
> You're missing the crucial part of undoing step 2 if you find
> that there are still references left to the page.
> 
> And then, because of your use of vzalloc(), you can't use
> free_xenheap_pages(), as that takes a (direct map) linear address
> as input. It has to be free_domheap_pages() in your case.
> 
> > My current implementation uses vzalloc instead of
> > alloc_xenheap_pages
> > and that's why I assumed vunmap and free_domheap_pages will suffice
> > (I
> > would have called vfree directly, but the temporary linked list
> > that is
> > used to hold the page references causes free_domheap_pages to
> > crash)
> > 
> > Do I also have to call guest_physmap_remove_page and put_page?
> > (steps
> > 1. and 2.)
> 
> guest_physmap_remove_page() needs to be called for translated page-
> owning domains if the page was ever added to their physmap. As long
> as you avoid adding, you also don't need to remove. I don't recall
> though whether a translated domain can access a resource without it
> having a representation in its GFN space.
> 
I've traced the GFN value for the shared MFN and it's invalid. It's set
to INVALID_M2P_ENTRY in share_xen_page_with_guest, but I was expecting
it to be set to a valid value later on (e.g. xenmem_add_to_physmap).
Am I missing something? 

Many thanks,
Petre
Jan Beulich July 31, 2019, 1:09 p.m. UTC | #27
On 31.07.2019 14:53, Petre Ovidiu PIRCALABU wrote:
> On Thu, 2019-07-18 at 14:44 +0000, Jan Beulich wrote:
>> On 18.07.2019 15:59, Petre Ovidiu PIRCALABU wrote:
>>> Before using xenforeignmemory_map_resource I investigated several
>>> different approaches:
>>> - Allocate the memory in hypervisor and xc_map_foreign_pages
>>> (doesn't
>>> work because you cannot "foreignmap" pages of your own domain.
>>> - Allocate the memory in guest, and map the allocated pages in XEN.
>>> To
>>> my knowledge there is no such API in linux to do this and the
>>> monitor
>>> application, as an userspace program, is not aware of the actual
>>> gfns
>>> for an allocated memory area.
>>>
>>> So, at this point the most promising solution is allocating the
>>> memory
>>> in XEN, sharing it with ID using share_xen_page_with_guest, and
>>> mapping
>>> it with xenforeignmemory_map_resource (with the flag
>>> XENMEM_rsrc_acq_caller_owned set)
>>
>> Which is fine - that's why Paul had introduced the generic interface.
>>
>>> To my understanding the cleanup sequence from
>>> gnttab_unpopulate_status_frames basically boils down to:
>>> 1. guest_physmap_remove_page
>>> 2. if ( test_and_clear_bit(_PGC_allocated, &pg->count_info) )
>>>          put_page(pg);
>>> 3. free_xenheap_page
>>
>> You're missing the crucial part of undoing step 2 if you find
>> that there are still references left to the page.
>>
>> And then, because of your use of vzalloc(), you can't use
>> free_xenheap_pages(), as that takes a (direct map) linear address
>> as input. It has to be free_domheap_pages() in your case.
>>
>>> My current implementation uses vzalloc instead of
>>> alloc_xenheap_pages
>>> and that's why I assumed vunmap and free_domheap_pages will suffice
>>> (I
>>> would have called vfree directly, but the temporary linked list
>>> that is
>>> used to hold the page references causes free_domheap_pages to
>>> crash)
>>>
>>> Do I also have to call guest_physmap_remove_page and put_page?
>>> (steps
>>> 1. and 2.)
>>
>> guest_physmap_remove_page() needs to be called for translated page-
>> owning domains if the page was ever added to their physmap. As long
>> as you avoid adding, you also don't need to remove. I don't recall
>> though whether a translated domain can access a resource without it
>> having a representation in its GFN space.
>>
> I've traced the GFN value for the shared MFN and it's invalid. It's set
> to INVALID_M2P_ENTRY in share_xen_page_with_guest, but I was expecting
> it to be set to valid value later on (e.g. xenmem_add_to_physmap).
> Am I missing something?

By "traced" do you mean "observed" (e.g. by way of logging) or "gone
through the code to verify it can't ever become valid"? In the latter
case you'd have proven what you want/need. Thinking about it though,
seeing how xenmem_add_to_physmap_one() works and assuming you indeed
don't add the page by other means, it looks to me as if you've done
more than is needed; as I've said already, you need to remove the
page only if it had been added.

Jan
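
For reference, the release sequence Jan outlines above could look roughly
like the sketch below. It is a minimal illustration, not code from the
series: it keeps the structure names of the patch, simplifies error paths,
skips guest_physmap_remove_page() on the grounds discussed above (the pages
are never added to a physmap), and borrows the reference-undo pattern from
gnttab_unpopulate_status_frames(), returning -EBUSY so the caller can back
off while the consumer still maps the frames.

    static int vm_event_channels_free_buffer(struct vm_event_channels_domain *impl)
    {
        unsigned int i;

        vunmap(impl->slots);
        impl->slots = NULL;

        for ( i = 0; i < impl->nr_frames; i++ )
        {
            struct page_info *pg = mfn_to_page(impl->mfn[i]);
            struct domain *owner = page_get_owner(pg);

            /* Temporary reference so the page cannot vanish under our feet. */
            if ( !get_page(pg, owner) )
                return -EINVAL;

            /* Step 2: drop the reference taken when sharing the page. */
            if ( test_and_clear_bit(_PGC_allocated, &pg->count_info) )
                put_page(pg);

            if ( (pg->count_info & PGC_count_mask) > 1 )
            {
                /* References remain (e.g. the consumer's mapping):
                 * undo step 2 and report failure, as gnttab does. */
                if ( get_page(pg, owner) )
                    set_bit(_PGC_allocated, &pg->count_info);
                put_page(pg);
                return -EBUSY;
            }

            /* Dropping the temporary (now last) reference frees the page. */
            put_page(pg);
        }

        return 0;
    }

The extra temporary reference is what keeps the intermediate put_page() from
freeing a domheap page while the remaining-reference check still has to run.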
diff mbox series

Patch

diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
index f3af710..1293b0f 100644
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -128,6 +128,7 @@  enum xc_error_code {
 
 typedef enum xc_error_code xc_error_code;
 
+struct xenforeignmemory_resource_handle;
 
 /*
  *  INITIALIZATION FUNCTIONS
@@ -2007,6 +2008,14 @@  int xc_vm_event_get_version(xc_interface *xch);
 void *xc_monitor_enable(xc_interface *xch, uint32_t domain_id, uint32_t *port);
 int xc_monitor_disable(xc_interface *xch, uint32_t domain_id);
 int xc_monitor_resume(xc_interface *xch, uint32_t domain_id);
+
+/* Monitor NG interface */
+int xc_monitor_ng_enable(xc_interface *xch, uint32_t domain_id,
+                         struct xenforeignmemory_resource_handle **fres,
+                         int *num_channels, void **p_addr);
+int xc_monitor_ng_disable(xc_interface *xch, uint32_t domain_id,
+                          struct xenforeignmemory_resource_handle **fres);
+
 /*
  * Get a bitmap of supported monitor events in the form
  * (1 << XEN_DOMCTL_MONITOR_EVENT_*).
diff --git a/tools/libxc/xc_mem_paging.c b/tools/libxc/xc_mem_paging.c
index a88c0cc..978008a 100644
--- a/tools/libxc/xc_mem_paging.c
+++ b/tools/libxc/xc_mem_paging.c
@@ -49,7 +49,7 @@  int xc_mem_paging_enable(xc_interface *xch, uint32_t domain_id,
     return xc_vm_event_control(xch, domain_id,
                                XEN_VM_EVENT_ENABLE,
                                XEN_VM_EVENT_TYPE_PAGING,
-                               port);
+                               0, port);
 }
 
 int xc_mem_paging_disable(xc_interface *xch, uint32_t domain_id)
@@ -57,15 +57,12 @@  int xc_mem_paging_disable(xc_interface *xch, uint32_t domain_id)
     return xc_vm_event_control(xch, domain_id,
                                XEN_VM_EVENT_DISABLE,
                                XEN_VM_EVENT_TYPE_PAGING,
-                               NULL);
+                               0, NULL);
 }
 
 int xc_mem_paging_resume(xc_interface *xch, uint32_t domain_id)
 {
-    return xc_vm_event_control(xch, domain_id,
-                               XEN_VM_EVENT_RESUME,
-                               XEN_VM_EVENT_TYPE_PAGING,
-                               NULL);
+    return xc_vm_event_resume(xch, domain_id, XEN_VM_EVENT_TYPE_PAGING, 0);
 }
 
 int xc_mem_paging_nominate(xc_interface *xch, uint32_t domain_id, uint64_t gfn)
diff --git a/tools/libxc/xc_memshr.c b/tools/libxc/xc_memshr.c
index 1c4a706..44d4f23 100644
--- a/tools/libxc/xc_memshr.c
+++ b/tools/libxc/xc_memshr.c
@@ -54,7 +54,7 @@  int xc_memshr_ring_enable(xc_interface *xch,
     return xc_vm_event_control(xch, domid,
                                XEN_VM_EVENT_ENABLE,
                                XEN_VM_EVENT_TYPE_SHARING,
-                               port);
+                               0, port);
 }
 
 int xc_memshr_ring_disable(xc_interface *xch, 
@@ -63,7 +63,7 @@  int xc_memshr_ring_disable(xc_interface *xch,
     return xc_vm_event_control(xch, domid,
                                XEN_VM_EVENT_DISABLE,
                                XEN_VM_EVENT_TYPE_SHARING,
-                               NULL);
+                               0, NULL);
 }
 
 static int xc_memshr_memop(xc_interface *xch, uint32_t domid,
@@ -203,10 +203,7 @@  int xc_memshr_range_share(xc_interface *xch,
 int xc_memshr_domain_resume(xc_interface *xch,
                             uint32_t domid)
 {
-    return xc_vm_event_control(xch, domid,
-                               XEN_VM_EVENT_RESUME,
-                               XEN_VM_EVENT_TYPE_SHARING,
-                               NULL);
+    return xc_vm_event_resume(xch, domid, XEN_VM_EVENT_TYPE_SHARING, 0);
 }
 
 int xc_memshr_debug_gfn(xc_interface *xch,
diff --git a/tools/libxc/xc_monitor.c b/tools/libxc/xc_monitor.c
index f05b53d..d8d62c4 100644
--- a/tools/libxc/xc_monitor.c
+++ b/tools/libxc/xc_monitor.c
@@ -33,15 +33,12 @@  int xc_monitor_disable(xc_interface *xch, uint32_t domain_id)
     return xc_vm_event_control(xch, domain_id,
                                XEN_VM_EVENT_DISABLE,
                                XEN_VM_EVENT_TYPE_MONITOR,
-                               NULL);
+                               0, NULL);
 }
 
 int xc_monitor_resume(xc_interface *xch, uint32_t domain_id)
 {
-    return xc_vm_event_control(xch, domain_id,
-                               XEN_VM_EVENT_RESUME,
-                               XEN_VM_EVENT_TYPE_MONITOR,
-                               NULL);
+    return xc_vm_event_resume(xch, domain_id, XEN_VM_EVENT_TYPE_MONITOR, 0);
 }
 
 int xc_monitor_get_capabilities(xc_interface *xch, uint32_t domain_id,
@@ -246,6 +243,22 @@  int xc_monitor_emul_unimplemented(xc_interface *xch, uint32_t domain_id,
     return do_domctl(xch, &domctl);
 }
 
+int xc_monitor_ng_enable(xc_interface *xch, uint32_t domain_id,
+                         xenforeignmemory_resource_handle **fres,
+                         int *num_channels, void **p_addr)
+{
+    return xc_vm_event_ng_enable(xch, domain_id, XEN_VM_EVENT_TYPE_MONITOR,
+                                 fres, num_channels, p_addr);
+}
+
+
+int xc_monitor_ng_disable(xc_interface *xch, uint32_t domain_id,
+                          xenforeignmemory_resource_handle **fres)
+{
+    return xc_vm_event_ng_disable(xch, domain_id, XEN_VM_EVENT_TYPE_MONITOR,
+                                  fres);
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/tools/libxc/xc_private.h b/tools/libxc/xc_private.h
index e4f7c3a..9cd6069 100644
--- a/tools/libxc/xc_private.h
+++ b/tools/libxc/xc_private.h
@@ -412,13 +412,23 @@  int xc_ffs64(uint64_t x);
  * vm_event operations. Internal use only.
  */
 int xc_vm_event_control(xc_interface *xch, uint32_t domain_id, unsigned int op,
-                        unsigned int type, uint32_t *port);
+                        unsigned int type, unsigned int flags, uint32_t *port);
+int xc_vm_event_resume(xc_interface *xch, uint32_t domain_id, unsigned int type,
+                       unsigned int flags);
 /*
  * Enables vm_event and returns the mapped ring page indicated by type.
  * type can be XEN_VM_EVENT_TYPE_(PAGING/MONITOR/SHARING)
  */
 void *xc_vm_event_enable(xc_interface *xch, uint32_t domain_id, int type,
                          uint32_t *port);
+/*
+ * Enables/Disables vm_event using the new interface.
+ */
+int xc_vm_event_ng_enable(xc_interface *xch, uint32_t domain_id, int type,
+                          xenforeignmemory_resource_handle **fres,
+                          int *num_channels, void **p_addr);
+int xc_vm_event_ng_disable(xc_interface *xch, uint32_t domain_id, int type,
+                          xenforeignmemory_resource_handle **fres);
 
 int do_dm_op(xc_interface *xch, uint32_t domid, unsigned int nr_bufs, ...);
 
diff --git a/tools/libxc/xc_vm_event.c b/tools/libxc/xc_vm_event.c
index 044bf71..d070e64 100644
--- a/tools/libxc/xc_vm_event.c
+++ b/tools/libxc/xc_vm_event.c
@@ -22,8 +22,12 @@ 
 
 #include "xc_private.h"
 
+#ifndef PFN_UP
+#define PFN_UP(x)     (((x) + XC_PAGE_SIZE-1) >> XC_PAGE_SHIFT)
+#endif /* PFN_UP */
+
 int xc_vm_event_control(xc_interface *xch, uint32_t domain_id, unsigned int op,
-                        unsigned int type, uint32_t *port)
+                        unsigned int type, unsigned int flags, uint32_t *port)
 {
     DECLARE_DOMCTL;
     int rc;
@@ -32,6 +36,7 @@  int xc_vm_event_control(xc_interface *xch, uint32_t domain_id, unsigned int op,
     domctl.domain = domain_id;
     domctl.u.vm_event_op.op = op;
     domctl.u.vm_event_op.type = type;
+    domctl.u.vm_event_op.flags = flags;
 
     rc = do_domctl(xch, &domctl);
     if ( !rc && port )
@@ -113,7 +118,7 @@  void *xc_vm_event_enable(xc_interface *xch, uint32_t domain_id, int type,
         goto out;
     }
 
-    rc1 = xc_vm_event_control(xch, domain_id, XEN_VM_EVENT_ENABLE, type, port);
+    rc1 = xc_vm_event_control(xch, domain_id, XEN_VM_EVENT_ENABLE, type, 0, port);
     if ( rc1 != 0 )
     {
         PERROR("Failed to enable vm_event\n");
@@ -164,6 +169,97 @@  int xc_vm_event_get_version(xc_interface *xch)
     return rc;
 }
 
+int xc_vm_event_resume(xc_interface *xch, uint32_t domain_id,
+                       unsigned int type, unsigned int flags)
+{
+    DECLARE_DOMCTL;
+
+    domctl.cmd = XEN_DOMCTL_vm_event_op;
+    domctl.domain = domain_id;
+    domctl.u.vm_event_op.op = XEN_VM_EVENT_RESUME;
+    domctl.u.vm_event_op.type = type;
+    domctl.u.vm_event_op.flags = flags;
+    domctl.u.vm_event_op.u.resume.vcpu_id = 0;
+
+    return do_domctl(xch, &domctl);
+}
+
+int xc_vm_event_ng_enable(xc_interface *xch, uint32_t domain_id, int type,
+                          xenforeignmemory_resource_handle **fres,
+                          int *num_channels, void **p_addr)
+{
+    int rc1, rc2;
+    xc_dominfo_t info;
+    unsigned long nr_frames;
+
+    if ( !fres || !num_channels || !p_addr )
+        return -EINVAL;
+
+    /* Get the numbers of vcpus */
+    if ( xc_domain_getinfo(xch, domain_id, 1, &info) != 1 ||
+         info.domid != domain_id )
+    {
+        PERROR("xc_domain_getinfo failed.\n");
+        return -ESRCH;
+    }
+
+    *num_channels = info.max_vcpu_id + 1;
+
+    rc1 = xc_domain_pause(xch, domain_id);
+    if ( rc1 )
+    {
+        PERROR("Unable to pause domain\n");
+        return rc1;
+    }
+
+    rc1 = xc_vm_event_control(xch, domain_id, XEN_VM_EVENT_ENABLE,
+                              type, XEN_VM_EVENT_FLAGS_NG_OP, NULL);
+    if ( rc1 )
+    {
+        PERROR("Failed to enable vm_event\n");
+        goto out;
+    }
+
+    nr_frames = PFN_UP(*num_channels * sizeof(struct vm_event_slot));
+
+    *fres = xenforeignmemory_map_resource(xch->fmem, domain_id,
+                                          XENMEM_resource_vm_event,
+                                          XEN_VM_EVENT_TYPE_MONITOR, 0,
+                                          nr_frames, p_addr,
+                                          PROT_READ | PROT_WRITE, 0);
+    if ( !*fres )
+    {
+        xc_vm_event_control(xch, domain_id, XEN_VM_EVENT_DISABLE,
+                            type, XEN_VM_EVENT_FLAGS_NG_OP, NULL);
+        ERROR("Failed to map vm_event resource");
+        rc1 = -errno;
+        goto out;
+    }
+
+out:
+    rc2 = xc_domain_unpause(xch, domain_id);
+    if ( rc1 || rc2 )
+    {
+        if ( rc2 )
+            PERROR("Unable to unpause domain\n");
+
+        if ( rc1 == 0 )
+            rc1 = rc2;
+    }
+
+    return rc1;
+}
+
+int xc_vm_event_ng_disable(xc_interface *xch, uint32_t domain_id, int type,
+                           xenforeignmemory_resource_handle **fres)
+{
+    xenforeignmemory_unmap_resource(xch->fmem, *fres);
+    *fres = NULL;
+
+    return xc_vm_event_control(xch, domain_id, XEN_VM_EVENT_DISABLE,
+                              type, XEN_VM_EVENT_FLAGS_NG_OP, NULL);
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index df2c013..768df4f 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -119,6 +119,7 @@ 
 #include <xen/efi.h>
 #include <xen/grant_table.h>
 #include <xen/hypercall.h>
+#include <xen/vm_event.h>
 #include <asm/paging.h>
 #include <asm/shadow.h>
 #include <asm/page.h>
@@ -4555,6 +4556,12 @@  int arch_acquire_resource(struct domain *d, unsigned int type,
     }
 #endif
 
+    case XENMEM_resource_vm_event:
+        rc = vm_event_ng_get_frames(d, id, frame, nr_frames, mfn_list);
+        if ( !rc )
+            *flags |= XENMEM_rsrc_acq_caller_owned;
+        break;
+
     default:
         rc = -EOPNOTSUPP;
         break;
diff --git a/xen/common/vm_event.c b/xen/common/vm_event.c
index e6a7a29..3f9be97 100644
--- a/xen/common/vm_event.c
+++ b/xen/common/vm_event.c
@@ -25,6 +25,7 @@ 
 #include <xen/wait.h>
 #include <xen/vm_event.h>
 #include <xen/mem_access.h>
+#include <xen/vmap.h>
 #include <asm/p2m.h>
 #include <asm/monitor.h>
 #include <asm/vm_event.h>
@@ -35,6 +36,78 @@ 
 #define xen_rmb()  smp_rmb()
 #define xen_wmb()  smp_wmb()
 
+static int vm_event_ring_pfn_param(uint32_t type)
+{
+    switch ( type )
+    {
+#ifdef CONFIG_HAS_MEM_PAGING
+    case XEN_VM_EVENT_TYPE_PAGING:
+        return HVM_PARAM_PAGING_RING_PFN;
+#endif
+    case XEN_VM_EVENT_TYPE_MONITOR:
+        return HVM_PARAM_MONITOR_RING_PFN;
+#ifdef CONFIG_HAS_MEM_SHARING
+    case XEN_VM_EVENT_TYPE_SHARING:
+        return HVM_PARAM_SHARING_RING_PFN;
+#endif
+    }
+
+    ASSERT_UNREACHABLE();
+    return -1;
+}
+
+static int vm_event_pause_flag(uint32_t type)
+{
+    switch ( type )
+    {
+#ifdef CONFIG_HAS_MEM_PAGING
+    case XEN_VM_EVENT_TYPE_PAGING:
+        return _VPF_mem_paging;
+#endif
+    case XEN_VM_EVENT_TYPE_MONITOR:
+        return _VPF_mem_access;
+#ifdef CONFIG_HAS_MEM_SHARING
+    case XEN_VM_EVENT_TYPE_SHARING:
+        return _VPF_mem_sharing;
+#endif
+    }
+
+    ASSERT_UNREACHABLE();
+    return -1;
+}
+
+#ifdef CONFIG_HAS_MEM_PAGING
+static void mem_paging_notification(struct vcpu *v, unsigned int port);
+#endif
+static void monitor_notification(struct vcpu *v, unsigned int port);
+#ifdef CONFIG_HAS_MEM_SHARING
+static void mem_sharing_notification(struct vcpu *v, unsigned int port);
+#endif
+
+static xen_event_channel_notification_t vm_event_notification_fn(uint32_t type)
+{
+    switch ( type )
+    {
+#ifdef CONFIG_HAS_MEM_PAGING
+    case XEN_VM_EVENT_TYPE_PAGING:
+        return mem_paging_notification;
+#endif
+    case XEN_VM_EVENT_TYPE_MONITOR:
+        return monitor_notification;
+#ifdef CONFIG_HAS_MEM_SHARING
+    case XEN_VM_EVENT_TYPE_SHARING:
+        return mem_sharing_notification;
+#endif
+    }
+
+    ASSERT_UNREACHABLE();
+    return NULL;
+}
+
+/*
+ * VM event ring implementation
+ */
+
 #define to_ring(_ved) container_of((_ved), struct vm_event_ring_domain, ved)
 
 /* VM event ring implementation */
@@ -67,12 +140,12 @@  static const struct vm_event_ops vm_event_ring_ops;
 static int vm_event_ring_enable(
     struct domain *d,
     struct xen_domctl_vm_event_op *vec,
-    struct vm_event_domain **p_ved,
-    int pause_flag,
-    int param,
-    xen_event_channel_notification_t notification_fn)
+    struct vm_event_domain **p_ved)
 {
     int rc;
+    int param = vm_event_ring_pfn_param(vec->type);
+    int pause_flag = vm_event_pause_flag(vec->type);
+    xen_event_channel_notification_t fn = vm_event_notification_fn(vec->type);
     unsigned long ring_gfn = d->arch.hvm.params[param];
     struct vm_event_ring_domain *impl;
 
@@ -111,8 +184,7 @@  static int vm_event_ring_enable(
                     (vm_event_sring_t *)impl->ring_page,
                     PAGE_SIZE);
 
-    rc = alloc_unbound_xen_event_channel(d, 0, current->domain->domain_id,
-                                         notification_fn);
+    rc = alloc_unbound_xen_event_channel(d, 0, current->domain->domain_id, fn);
     if ( rc < 0 )
         goto err;
 
@@ -242,6 +314,7 @@  static int vm_event_ring_disable(struct vm_event_domain **p_ved)
 
     xfree(impl);
     *p_ved = NULL;
+
     return 0;
 }
 
@@ -365,6 +438,51 @@  static int vm_event_ring_get_response(struct vm_event_ring_domain *impl,
     return rc;
 }
 
+static void vm_event_handle_response(struct domain *d, struct vcpu *v,
+                                     vm_event_response_t *rsp)
+{
+    /* Check flags which apply only when the vCPU is paused */
+    if ( atomic_read(&v->vm_event_pause_count) )
+    {
+#ifdef CONFIG_HAS_MEM_PAGING
+        if ( rsp->reason == VM_EVENT_REASON_MEM_PAGING )
+            p2m_mem_paging_resume(d, rsp);
+#endif
+
+        /*
+         * Check emulation flags in the arch-specific handler only, as it
+         * has to set arch-specific flags when supported, and to avoid
+         * bitmask overhead when it isn't supported.
+         */
+        vm_event_emulate_check(v, rsp);
+
+        /*
+         * Check in arch-specific handler to avoid bitmask overhead when
+         * not supported.
+         */
+        vm_event_register_write_resume(v, rsp);
+
+        /*
+         * Check in arch-specific handler to avoid bitmask overhead when
+         * not supported.
+         */
+        vm_event_toggle_singlestep(d, v, rsp);
+
+        /* Check for altp2m switch */
+        if ( rsp->flags & VM_EVENT_FLAG_ALTERNATE_P2M )
+            p2m_altp2m_check(v, rsp->altp2m_idx);
+
+        if ( rsp->flags & VM_EVENT_FLAG_SET_REGISTERS )
+            vm_event_set_registers(v, rsp);
+
+        if ( rsp->flags & VM_EVENT_FLAG_GET_NEXT_INTERRUPT )
+            vm_event_monitor_next_interrupt(v);
+
+        if ( rsp->flags & VM_EVENT_FLAG_VCPU_PAUSED )
+            vm_event_vcpu_unpause(v);
+    }
+}
+
 /*
  * Pull all responses from the given ring and unpause the corresponding vCPU
  * if required. Based on the response type, here we can also call custom
@@ -373,22 +491,20 @@  static int vm_event_ring_get_response(struct vm_event_ring_domain *impl,
  * Note: responses are handled the same way regardless of which ring they
  * arrive on.
  */
-static int vm_event_ring_resume(struct vm_event_ring_domain *impl)
+static int vm_event_ring_resume(struct vm_event_domain *ved, struct vcpu *v)
 {
     vm_event_response_t rsp;
-
-    if ( unlikely(!impl || !vm_event_check(&impl->ved)) )
-         return -ENODEV;
+    struct vm_event_ring_domain *impl = to_ring(ved);
 
     /*
-     * vm_event_resume() runs in either XEN_VM_EVENT_* domctls, or
+     * vm_event_ring_resume() runs in either XEN_VM_EVENT_* domctls, or
      * EVTCHN_send context from the introspection consumer. Both contexts
      * are guaranteed not to be the subject of vm_event responses.
      * While we could ASSERT(v != current) for each VCPU in d in the loop
      * below, this covers the case where we would need to iterate over all
      * of them more succintly.
      */
-    ASSERT(impl->ved.d != current->domain);
+    ASSERT(ved->d != current->domain);
 
     /* Pull all responses off the ring. */
     while ( vm_event_ring_get_response(impl, &rsp) )
@@ -402,7 +518,7 @@  static int vm_event_ring_resume(struct vm_event_ring_domain *impl)
         }
 
         /* Validate the vcpu_id in the response. */
-        v = domain_vcpu(impl->ved.d, rsp.vcpu_id);
+        v = domain_vcpu(ved->d, rsp.vcpu_id);
         if ( !v )
             continue;
 
@@ -410,47 +526,7 @@  static int vm_event_ring_resume(struct vm_event_ring_domain *impl)
          * In some cases the response type needs extra handling, so here
          * we call the appropriate handlers.
          */
-
-        /* Check flags which apply only when the vCPU is paused */
-        if ( atomic_read(&v->vm_event_pause_count) )
-        {
-#ifdef CONFIG_HAS_MEM_PAGING
-            if ( rsp.reason == VM_EVENT_REASON_MEM_PAGING )
-                p2m_mem_paging_resume(impl->ved.d, &rsp);
-#endif
-
-            /*
-             * Check emulation flags in the arch-specific handler only, as it
-             * has to set arch-specific flags when supported, and to avoid
-             * bitmask overhead when it isn't supported.
-             */
-            vm_event_emulate_check(v, &rsp);
-
-            /*
-             * Check in arch-specific handler to avoid bitmask overhead when
-             * not supported.
-             */
-            vm_event_register_write_resume(v, &rsp);
-
-            /*
-             * Check in arch-specific handler to avoid bitmask overhead when
-             * not supported.
-             */
-            vm_event_toggle_singlestep(impl->ved.d, v, &rsp);
-
-            /* Check for altp2m switch */
-            if ( rsp.flags & VM_EVENT_FLAG_ALTERNATE_P2M )
-                p2m_altp2m_check(v, rsp.altp2m_idx);
-
-            if ( rsp.flags & VM_EVENT_FLAG_SET_REGISTERS )
-                vm_event_set_registers(v, &rsp);
-
-            if ( rsp.flags & VM_EVENT_FLAG_GET_NEXT_INTERRUPT )
-                vm_event_monitor_next_interrupt(v);
-
-            if ( rsp.flags & VM_EVENT_FLAG_VCPU_PAUSED )
-                vm_event_vcpu_unpause(v);
-        }
+        vm_event_handle_response(ved->d, v, &rsp);
     }
 
     return 0;
@@ -535,59 +611,361 @@  static int vm_event_ring_claim_slot(struct vm_event_domain *ved, bool allow_slee
         return vm_event_ring_grab_slot(to_ring(ved), current->domain != ved->d);
 }
 
-#ifdef CONFIG_HAS_MEM_PAGING
-/* Registered with Xen-bound event channel for incoming notifications. */
-static void mem_paging_notification(struct vcpu *v, unsigned int port)
+static void vm_event_ring_cleanup(struct vm_event_domain *ved)
 {
-    vm_event_ring_resume(to_ring(v->domain->vm_event_paging));
+    struct vm_event_ring_domain *impl = to_ring(ved);
+    /* Destroying the wait queue head means waking up all
+     * queued vcpus. This will drain the list, allowing
+     * the disable routine to complete. It will also drop
+     * all domain refs the wait-queued vcpus are holding.
+     * Finally, because this code path involves previously
+     * pausing the domain (domain_kill), unpausing the
+     * vcpus causes no harm. */
+    destroy_waitqueue_head(&impl->wq);
 }
-#endif
 
-/* Registered with Xen-bound event channel for incoming notifications. */
-static void monitor_notification(struct vcpu *v, unsigned int port)
+/*
+ * VM event NG (new generation)
+ */
+#define to_channels(_ved) container_of((_ved), \
+                                        struct vm_event_channels_domain, ved)
+
+struct vm_event_channels_domain
+{
+    /* VM event domain */
+    struct vm_event_domain ved;
+    /* shared channels buffer */
+    struct vm_event_slot *slots;
+    /* the buffer size (number of frames) */
+    unsigned int nr_frames;
+    /* buffer's mfn list */
+    mfn_t mfn[0];
+};
+
+static const struct vm_event_ops vm_event_channels_ops;
+
+static void vm_event_channels_free_buffer(struct vm_event_channels_domain *impl)
 {
-    vm_event_ring_resume(to_ring(v->domain->vm_event_monitor));
+    int i;
+
+    vunmap(impl->slots);
+    impl->slots = NULL;
+
+    for ( i = 0; i < impl->nr_frames; i++ )
+        free_domheap_page(mfn_to_page(impl->mfn[i]));
 }
 
-#ifdef CONFIG_HAS_MEM_SHARING
-/* Registered with Xen-bound event channel for incoming notifications. */
-static void mem_sharing_notification(struct vcpu *v, unsigned int port)
+static int vm_event_channels_alloc_buffer(struct vm_event_channels_domain *impl)
 {
-    vm_event_ring_resume(to_ring(v->domain->vm_event_share));
+    int i = 0;
+
+    impl->slots = vzalloc(impl->nr_frames * PAGE_SIZE);
+    if ( !impl->slots )
+        return -ENOMEM;
+
+    for ( i = 0; i < impl->nr_frames; i++ )
+        impl->mfn[i] = vmap_to_mfn((char *)impl->slots + i * PAGE_SIZE);
+
+    for ( i = 0; i < impl->nr_frames; i++ )
+        share_xen_page_with_guest(mfn_to_page(impl->mfn[i]), current->domain,
+                                  SHARE_rw);
+
+    return 0;
 }
+
+static int vm_event_channels_enable(
+    struct domain *d,
+    struct xen_domctl_vm_event_op *vec,
+    struct vm_event_domain **p_ved)
+{
+    int rc, i = 0;
+    xen_event_channel_notification_t fn = vm_event_notification_fn(vec->type);
+    unsigned int nr_frames = PFN_UP(d->max_vcpus * sizeof(struct vm_event_slot));
+    struct vm_event_channels_domain *impl;
+
+    if ( *p_ved )
+        return -EBUSY;
+
+    impl = _xzalloc(sizeof(struct vm_event_channels_domain) +
+                           nr_frames * sizeof(mfn_t),
+                    __alignof__(struct vm_event_channels_domain));
+    if ( unlikely(!impl) )
+        return -ENOMEM;
+
+    spin_lock_init(&impl->ved.lock);
+
+    impl->nr_frames = nr_frames;
+    impl->ved.d = d;
+    impl->ved.ops = &vm_event_channels_ops;
+
+    rc = vm_event_init_domain(d);
+    if ( rc < 0 )
+        goto err;
+
+    rc = vm_event_channels_alloc_buffer(impl);
+    if ( rc )
+        goto err;
+
+    for ( i = 0; i < d->max_vcpus; i++ )
+    {
+        rc = alloc_unbound_xen_event_channel(d, i, current->domain->domain_id, fn);
+        if ( rc < 0 )
+            goto err;
+
+        impl->slots[i].port = rc;
+        impl->slots[i].state = STATE_VM_EVENT_SLOT_IDLE;
+    }
+
+    *p_ved = &impl->ved;
+
+    return 0;
+
+err:
+    while ( --i >= 0 )
+        evtchn_close(d, impl->slots[i].port, 0);
+    xfree(impl);
+
+    return rc;
+}
+
+static int vm_event_channels_disable(struct vm_event_domain **p_ved)
+{
+    struct vcpu *v;
+    struct domain *d = (*p_ved)->d;
+    struct vm_event_channels_domain *impl = to_channels(*p_ved);
+    int i;
+
+    spin_lock(&impl->ved.lock);
+
+    for_each_vcpu( impl->ved.d, v )
+    {
+        if ( atomic_read(&v->vm_event_pause_count) )
+            vm_event_vcpu_unpause(v);
+    }
+
+    for ( i = 0; i < impl->ved.d->max_vcpus; i++ )
+        evtchn_close(impl->ved.d, impl->slots[i].port, 0);
+
+    vm_event_channels_free_buffer(impl);
+
+    vm_event_cleanup_domain(d);
+
+    spin_unlock(&impl->ved.lock);
+
+    xfree(impl);
+    *p_ved = NULL;
+
+    return 0;
+}
+
+static bool vm_event_channels_check(struct vm_event_domain *ved)
+{
+    return to_channels(ved)->slots != NULL;
+}
+
+static void vm_event_channels_cleanup(struct vm_event_domain *ved)
+{
+}
+
+static int vm_event_channels_claim_slot(struct vm_event_domain *ved,
+                                        bool allow_sleep)
+{
+    return 0;
+}
+
+static void vm_event_channels_cancel_slot(struct vm_event_domain *ved)
+{
+}
+
+static void vm_event_channels_put_request(struct vm_event_domain *ved,
+                                          vm_event_request_t *req)
+{
+    struct vm_event_channels_domain *impl = to_channels(ved);
+    struct vm_event_slot *slot;
+
+    ASSERT( req->vcpu_id >= 0 && req->vcpu_id < ved->d->max_vcpus );
+
+    slot = &impl->slots[req->vcpu_id];
+
+    if ( current->domain != ved->d )
+    {
+        req->flags |= VM_EVENT_FLAG_FOREIGN;
+#ifndef NDEBUG
+        if ( !(req->flags & VM_EVENT_FLAG_VCPU_PAUSED) )
+            gdprintk(XENLOG_G_WARNING, "d%dv%d was not paused.\n",
+                     ved->d->domain_id, req->vcpu_id);
 #endif
+    }
+
+    req->version = VM_EVENT_INTERFACE_VERSION;
+
+    spin_lock(&impl->ved.lock);
+    if ( slot->state != STATE_VM_EVENT_SLOT_IDLE )
+    {
+        gdprintk(XENLOG_G_WARNING, "The VM event slot for d%dv%d is not IDLE.\n",
+                 impl->ved.d->domain_id, req->vcpu_id);
+        spin_unlock(&impl->ved.lock);
+        return;
+    }
+
+    slot->u.req = *req;
+    slot->state = STATE_VM_EVENT_SLOT_SUBMIT;
+    spin_unlock(&impl->ved.lock);
+    notify_via_xen_event_channel(impl->ved.d, slot->port);
+}
+
+static int vm_event_channels_get_response(struct vm_event_channels_domain *impl,
+                                          struct vcpu *v, vm_event_response_t *rsp)
+{
+    struct vm_event_slot *slot = &impl->slots[v->vcpu_id];
+    int rc = 0;
+
+    ASSERT( slot != NULL );
+    spin_lock(&impl->ved.lock);
+
+    if ( slot->state != STATE_VM_EVENT_SLOT_FINISH )
+    {
+        gdprintk(XENLOG_G_WARNING, "The VM event slot state for d%dv%d is invalid.\n",
+                 impl->ved.d->domain_id, v->vcpu_id);
+        rc = -1;
+        goto out;
+    }
+
+    *rsp = slot->u.rsp;
+    slot->state = STATE_VM_EVENT_SLOT_IDLE;
+
+out:
+    spin_unlock(&impl->ved.lock);
+
+    return rc;
+}
+
+static int vm_event_channels_resume(struct vm_event_domain *ved, struct vcpu *v)
+{
+    vm_event_response_t rsp;
+    struct vm_event_channels_domain *impl = to_channels(ved);
+
+    ASSERT(ved->d != current->domain);
+
+    if ( vm_event_channels_get_response(impl, v, &rsp) ||
+         rsp.version != VM_EVENT_INTERFACE_VERSION ||
+         rsp.vcpu_id != v->vcpu_id )
+        return -1;
+
+    vm_event_handle_response(ved->d, v, &rsp);
+
+    return 0;
+}
+
+int vm_event_ng_get_frames(struct domain *d, unsigned int id,
+                           unsigned long frame, unsigned int nr_frames,
+                           xen_pfn_t mfn_list[])
+{
+    struct vm_event_domain *ved;
+    int i;
+
+    switch ( id )
+    {
+    case XEN_VM_EVENT_TYPE_MONITOR:
+        ved = d->vm_event_monitor;
+        break;
+
+    default:
+        return -ENOSYS;
+    }
+
+    if ( !vm_event_check(ved) )
+        return -EINVAL;
+
+    if ( frame != 0 || nr_frames != to_channels(ved)->nr_frames )
+        return -EINVAL;
+
+    spin_lock(&ved->lock);
+
+    for ( i = 0; i < to_channels(ved)->nr_frames; i++ )
+        mfn_list[i] = mfn_x(to_channels(ved)->mfn[i]);
+
+    spin_unlock(&ved->lock);
+
+    return 0;
+}
+
+/*
+ * vm_event implementation agnostic functions
+ */
 
 /* Clean up on domain destruction */
 void vm_event_cleanup(struct domain *d)
 {
 #ifdef CONFIG_HAS_MEM_PAGING
     if ( vm_event_check(d->vm_event_paging) )
-        d->vm_event_paging->ops->cleanup(&d->vm_event_paging);
+    {
+        d->vm_event_paging->ops->cleanup(d->vm_event_paging);
+        d->vm_event_paging->ops->disable(&d->vm_event_paging);
+    }
 #endif
 
     if ( vm_event_check(d->vm_event_monitor) )
-        d->vm_event_monitor->ops->cleanup(&d->vm_event_monitor);
+    {
+        d->vm_event_monitor->ops->cleanup(d->vm_event_monitor);
+        d->vm_event_monitor->ops->disable(&d->vm_event_monitor);
+    }
 
 #ifdef CONFIG_HAS_MEM_SHARING
     if ( vm_event_check(d->vm_event_share) )
-        d->vm_event_share->ops->cleanup(&d->vm_event_share);
+    {
+        d->vm_event_share->ops->cleanup(d->vm_event_share);
+        d->vm_event_share->ops->disable(&d->vm_event_share);
+    }
 #endif
 }
 
-static void vm_event_ring_cleanup(struct vm_event_domain **_ved)
+static int vm_event_enable(struct domain *d,
+                           struct xen_domctl_vm_event_op *vec,
+                           struct vm_event_domain **p_ved)
 {
-    struct vm_event_ring_domain *impl = to_ring(*_ved);
-    /* Destroying the wait queue head means waking up all
-     * queued vcpus. This will drain the list, allowing
-     * the disable routine to complete. It will also drop
-     * all domain refs the wait-queued vcpus are holding.
-     * Finally, because this code path involves previously
-     * pausing the domain (domain_kill), unpausing the
-     * vcpus causes no harm. */
-    destroy_waitqueue_head(&impl->wq);
-    (void)vm_event_ring_disable(_ved);
+    return ( vec->flags & XEN_VM_EVENT_FLAGS_NG_OP ) ?
+        vm_event_channels_enable(d, vec, p_ved) :
+        vm_event_ring_enable(d, vec, p_ved);
 }
 
+static int vm_event_resume(struct vm_event_domain *ved, struct vcpu *v)
+{
+    if ( !vm_event_check(ved) )
+         return -ENODEV;
+
+    if ( !v )
+        return -EINVAL;
+
+    return ved->ops->resume(ved, v);
+}
+
+#ifdef CONFIG_HAS_MEM_PAGING
+/* Registered with Xen-bound event channel for incoming notifications. */
+static void mem_paging_notification(struct vcpu *v, unsigned int port)
+{
+    vm_event_resume(v->domain->vm_event_paging, v);
+}
+#endif
+
+/* Registered with Xen-bound event channel for incoming notifications. */
+static void monitor_notification(struct vcpu *v, unsigned int port)
+{
+    vm_event_resume(v->domain->vm_event_monitor, v);
+}
+
+#ifdef CONFIG_HAS_MEM_SHARING
+/* Registered with Xen-bound event channel for incoming notifications. */
+static void mem_sharing_notification(struct vcpu *v, unsigned int port)
+{
+    vm_event_resume(v->domain->vm_event_share, v);
+}
+#endif
+
+/*
+ * vm_event domctl interface
+ */
+
 int vm_event_domctl(struct domain *d, struct xen_domctl_vm_event_op *vec)
 {
     int rc;
@@ -632,6 +1010,13 @@  int vm_event_domctl(struct domain *d, struct xen_domctl_vm_event_op *vec)
     {
         rc = -EINVAL;
 
+        /*
+         * The NG interface is only supported by XEN_VM_EVENT_TYPE_MONITOR
+         * for now.
+         */
+        if ( vec->flags & XEN_VM_EVENT_FLAGS_NG_OP )
+            break;
+
         switch( vec->op )
         {
         case XEN_VM_EVENT_ENABLE:
@@ -657,9 +1042,7 @@  int vm_event_domctl(struct domain *d, struct xen_domctl_vm_event_op *vec)
                 break;
 
             /* domain_pause() not required here, see XSA-99 */
-            rc = vm_event_ring_enable(d, vec, &d->vm_event_paging, _VPF_mem_paging,
-                                 HVM_PARAM_PAGING_RING_PFN,
-                                 mem_paging_notification);
+            rc = vm_event_enable(d, vec, &d->vm_event_paging);
         }
         break;
 
@@ -667,12 +1050,13 @@  int vm_event_domctl(struct domain *d, struct xen_domctl_vm_event_op *vec)
             if ( !vm_event_check(d->vm_event_paging) )
                 break;
             domain_pause(d);
-            rc = vm_event_ring_disable(&d->vm_event_paging);
+            rc = d->vm_event_paging->ops->disable(&d->vm_event_paging);
             domain_unpause(d);
             break;
 
         case XEN_VM_EVENT_RESUME:
-            rc = vm_event_ring_resume(to_ring(d->vm_event_paging));
+            rc = vm_event_resume(d->vm_event_paging,
+                                 domain_vcpu(d, vec->u.resume.vcpu_id));
             break;
 
         default:
@@ -694,22 +1078,23 @@  int vm_event_domctl(struct domain *d, struct xen_domctl_vm_event_op *vec)
             rc = arch_monitor_init_domain(d);
             if ( rc )
                 break;
-            rc = vm_event_ring_enable(d, vec, &d->vm_event_monitor, _VPF_mem_access,
-                                 HVM_PARAM_MONITOR_RING_PFN,
-                                 monitor_notification);
+
+            rc = vm_event_enable(d, vec, &d->vm_event_monitor);
+
             break;
 
         case XEN_VM_EVENT_DISABLE:
             if ( !vm_event_check(d->vm_event_monitor) )
                 break;
             domain_pause(d);
-            rc = vm_event_ring_disable(&d->vm_event_monitor);
+            rc = d->vm_event_monitor->ops->disable(&d->vm_event_monitor);
             arch_monitor_cleanup_domain(d);
             domain_unpause(d);
             break;
 
         case XEN_VM_EVENT_RESUME:
-            rc = vm_event_ring_resume(to_ring(d->vm_event_monitor));
+            rc = vm_event_resume(d->vm_event_monitor,
+                                 domain_vcpu(d, vec->u.resume.vcpu_id));
             break;
 
         default:
@@ -724,6 +1109,13 @@  int vm_event_domctl(struct domain *d, struct xen_domctl_vm_event_op *vec)
     {
         rc = -EINVAL;
 
+        /*
+         * The NG interface is only supported by XEN_VM_EVENT_TYPE_MONITOR
+         * for now.
+         */
+        if ( vec->flags & XEN_VM_EVENT_FLAGS_NG_OP )
+            break;
+
         switch( vec->op )
         {
         case XEN_VM_EVENT_ENABLE:
@@ -738,21 +1130,20 @@  int vm_event_domctl(struct domain *d, struct xen_domctl_vm_event_op *vec)
                 break;
 
             /* domain_pause() not required here, see XSA-99 */
-            rc = vm_event_ring_enable(d, vec, &d->vm_event_share, _VPF_mem_sharing,
-                                 HVM_PARAM_SHARING_RING_PFN,
-                                 mem_sharing_notification);
+            rc = vm_event_enable(d, vec, &d->vm_event_share);
             break;
 
         case XEN_VM_EVENT_DISABLE:
             if ( !vm_event_check(d->vm_event_share) )
                 break;
             domain_pause(d);
-            rc = vm_event_ring_disable(&d->vm_event_share);
+            rc = d->vm_event_share->ops->disable(&d->vm_event_share);
             domain_unpause(d);
             break;
 
         case XEN_VM_EVENT_RESUME:
-            rc = vm_event_ring_resume(to_ring(d->vm_event_share));
+            rc = vm_event_resume(d->vm_event_share,
+                                 domain_vcpu(d, vec->u.resume.vcpu_id));
             break;
 
         default:
@@ -809,7 +1200,19 @@  static const struct vm_event_ops vm_event_ring_ops = {
     .cleanup = vm_event_ring_cleanup,
     .claim_slot = vm_event_ring_claim_slot,
     .cancel_slot = vm_event_ring_cancel_slot,
-    .put_request = vm_event_ring_put_request
+    .disable = vm_event_ring_disable,
+    .put_request = vm_event_ring_put_request,
+    .resume = vm_event_ring_resume,
+};
+
+static const struct vm_event_ops vm_event_channels_ops = {
+    .check = vm_event_channels_check,
+    .cleanup = vm_event_channels_cleanup,
+    .claim_slot = vm_event_channels_claim_slot,
+    .cancel_slot = vm_event_channels_cancel_slot,
+    .disable = vm_event_channels_disable,
+    .put_request = vm_event_channels_put_request,
+    .resume = vm_event_channels_resume,
 };
 
 /*
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
index 234d8c5..fc7420c 100644
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -38,7 +38,7 @@ 
 #include "hvm/save.h"
 #include "memory.h"
 
-#define XEN_DOMCTL_INTERFACE_VERSION 0x00000011
+#define XEN_DOMCTL_INTERFACE_VERSION 0x00000012
 
 /*
  * NB. xen_domctl.domain is an IN/OUT parameter for this operation.
@@ -781,12 +781,20 @@  struct xen_domctl_gdbsx_domstatus {
 struct xen_domctl_vm_event_op {
     uint32_t       op;           /* XEN_VM_EVENT_* */
     uint32_t       type;         /* XEN_VM_EVENT_TYPE_* */
+ /* Use the NG interface */
+#define _XEN_VM_EVENT_FLAGS_NG_OP         0
+#define XEN_VM_EVENT_FLAGS_NG_OP          (1U << _XEN_VM_EVENT_FLAGS_NG_OP)
+    uint32_t       flags;
 
     union {
         struct {
             uint32_t port;       /* OUT: event channel for ring */
         } enable;
 
+        struct {
+            uint32_t vcpu_id;    /* IN: vcpu_id */
+        } resume;
+
         uint32_t version;
     } u;
 };
diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
index 68ddadb..2e8912e 100644
--- a/xen/include/public/memory.h
+++ b/xen/include/public/memory.h
@@ -612,6 +612,7 @@  struct xen_mem_acquire_resource {
 
 #define XENMEM_resource_ioreq_server 0
 #define XENMEM_resource_grant_table 1
+#define XENMEM_resource_vm_event 2
 
     /*
      * IN - a type-specific resource identifier, which must be zero
@@ -619,6 +620,7 @@  struct xen_mem_acquire_resource {
      *
      * type == XENMEM_resource_ioreq_server -> id == ioreq server id
      * type == XENMEM_resource_grant_table -> id defined below
+     * type == XENMEM_resource_vm_event -> id == vm_event type
      */
     uint32_t id;
 
diff --git a/xen/include/public/vm_event.h b/xen/include/public/vm_event.h
index c48bc21..2f2160b 100644
--- a/xen/include/public/vm_event.h
+++ b/xen/include/public/vm_event.h
@@ -421,6 +421,22 @@  typedef struct vm_event_st {
 
 DEFINE_RING_TYPES(vm_event, vm_event_request_t, vm_event_response_t);
 
+/* VM Event slot state */
+#define STATE_VM_EVENT_SLOT_IDLE     0 /* the slot data is invalid */
+#define STATE_VM_EVENT_SLOT_SUBMIT   1 /* a request was submitted */
+#define STATE_VM_EVENT_SLOT_FINISH   2 /* a response was issued */
+
+struct vm_event_slot
+{
+    uint32_t port;      /* evtchn for notifications to/from helper */
+    uint32_t state:4;
+    uint32_t pad:28;
+    union {
+        vm_event_request_t req;
+        vm_event_response_t rsp;
+    } u;
+};
+
 #endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */
 #endif /* _XEN_PUBLIC_VM_EVENT_H */
 
diff --git a/xen/include/xen/vm_event.h b/xen/include/xen/vm_event.h
index 21a3f50..0468269 100644
--- a/xen/include/xen/vm_event.h
+++ b/xen/include/xen/vm_event.h
@@ -30,14 +30,17 @@ 
 
 struct domain;
 struct vm_event_domain;
+struct xen_domctl_vm_event_op;
 
 struct vm_event_ops
 {
     bool (*check)(struct vm_event_domain *ved);
-    void (*cleanup)(struct vm_event_domain **_ved);
-    int (*claim_slot)(struct vm_event_domain *ved, bool allow_sleep);
+    void (*cleanup)(struct vm_event_domain *p_ved);
+    int  (*claim_slot)(struct vm_event_domain *ved, bool allow_sleep);
     void (*cancel_slot)(struct vm_event_domain *ved);
+    int  (*disable)(struct vm_event_domain **p_ved);
     void (*put_request)(struct vm_event_domain *ved, vm_event_request_t *req);
+    int  (*resume)(struct vm_event_domain *ved, struct vcpu *v);
 };
 
 struct vm_event_domain
@@ -111,6 +114,10 @@  static inline void vm_event_put_request(struct vm_event_domain *ved,
 
 int vm_event_domctl(struct domain *d, struct xen_domctl_vm_event_op *vec);
 
+int vm_event_ng_get_frames(struct domain *d, unsigned int id,
+                           unsigned long frame, unsigned int nr_frames,
+                           xen_pfn_t mfn_list[]);
+
 void vm_event_vcpu_pause(struct vcpu *v);
 void vm_event_vcpu_unpause(struct vcpu *v);
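
For completeness, here is a rough sketch of how a monitor application might
drive the per-vcpu channels introduced above, using the libxc calls this
patch adds. It is illustrative only, not part of the series: the
monitor_loop() wrapper is hypothetical, and error handling, memory barriers
and the actual event decoding are elided. The slot walks the
IDLE -> SUBMIT -> FINISH -> IDLE state machine from vm_event.h: Xen fills
u.req and sets SUBMIT, the consumer copies the request out (req and rsp
share a union), writes u.rsp, sets FINISH and kicks the channel, which ends
up in vm_event_channels_resume().

    #include <stdbool.h>
    #include <stdlib.h>
    #include <xenctrl.h>
    #include <xenevtchn.h>
    #include <xenforeignmemory.h>
    #include <xen/vm_event.h>

    static void monitor_loop(xc_interface *xch, xenevtchn_handle *xce,
                             uint32_t domid)
    {
        xenforeignmemory_resource_handle *fres;
        struct vm_event_slot *slots;
        xenevtchn_port_or_error_t *ports;
        int num_channels, i;
        bool stop = false;   /* a real monitor would set this from a handler */

        if ( xc_monitor_ng_enable(xch, domid, &fres, &num_channels,
                                  (void **)&slots) )
            return;

        /* Bind the per-vcpu ports Xen advertised in the slots. */
        ports = calloc(num_channels, sizeof(*ports));
        for ( i = 0; i < num_channels; i++ )
            ports[i] = xenevtchn_bind_interdomain(xce, domid, slots[i].port);

        while ( !stop )
        {
            xenevtchn_port_or_error_t port = xenevtchn_pending(xce);

            for ( i = 0; i < num_channels; i++ )
            {
                struct vm_event_slot *slot = &slots[i];
                vm_event_request_t req;

                if ( ports[i] != port ||
                     slot->state != STATE_VM_EVENT_SLOT_SUBMIT )
                    continue;

                /* req and rsp overlay each other: copy the request first. */
                req = slot->u.req;

                /* ... decode and handle req here ... */

                slot->u.rsp.version = VM_EVENT_INTERFACE_VERSION;
                slot->u.rsp.vcpu_id = req.vcpu_id;
                slot->u.rsp.reason = req.reason;
                slot->u.rsp.flags = req.flags & VM_EVENT_FLAG_VCPU_PAUSED;

                slot->state = STATE_VM_EVENT_SLOT_FINISH;

                /* The kick lands in vm_event_channels_resume() for vcpu i. */
                xenevtchn_notify(xce, ports[i]);
            }

            xenevtchn_unmask(xce, port);
        }

        free(ports);
        xc_monitor_ng_disable(xch, domid, &fres);
    }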