x86/hvm: Allow the guest to permit the use of userspace hypercalls
diff mbox

Message ID 1452520774-16794-1-git-send-email-andrew.cooper3@citrix.com
State New, archived
Headers show

Commit Message

Andrew Cooper Jan. 11, 2016, 1:59 p.m. UTC
Currently, hypercalls issued from HVM userspace will unconditionally fail with
-EPERM.

This is inflexible, and a guest may wish to allow userspace to make
hypercalls.

Introduce HVMOP_set_hypercall_dpl which allows the guest to alter the
permissions check for hypercalls.  It behaves exactly like the dpl field for
GDT/LDT/IDT entries.

As the dpl is initialised to 0, hypercalls are restricted to cpl0 code until
the OS explicitly chooses an alternative.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
--
CC: Jan Beulich <JBeulich@suse.com>
CC: Ian Campbell <ian.campbell@citrix.com>
CC: Stefano Stabellini <stefano.stabellini@citrix.com>

Arm folks: Is something like this sufficiently generic to be useful on Arm,
perhaps with more generic naming?

PV guest support for userspace hypercalls is substantially more involved, and
will take longer to complete.
---
 xen/arch/x86/hvm/hvm.c           | 25 ++++++++++++++++++++++++-
 xen/include/asm-x86/hvm/domain.h |  2 ++
 xen/include/public/hvm/hvm_op.h  |  8 ++++++++
 3 files changed, 34 insertions(+), 1 deletion(-)

Comments

Paul Durrant Jan. 11, 2016, 2:32 p.m. UTC | #1
> -----Original Message-----
> From: xen-devel-bounces@lists.xen.org [mailto:xen-devel-
> bounces@lists.xen.org] On Behalf Of Andrew Cooper
> Sent: 11 January 2016 14:00
> To: Xen-devel
> Cc: Andrew Cooper; Stefano Stabellini; Ian Campbell; Jan Beulich
> Subject: [Xen-devel] [PATCH] x86/hvm: Allow the guest to permit the use of
> userspace hypercalls
> 
> Currently, hypercalls issued from HVM userspace will unconditionally fail with
> -EPERM.
> 
> This is inflexible, and a guest may wish to allow userspace to make
> hypercalls.
> 
> Introduce HVMOP_set_hypercall_dpl which allows the guest to alter the
> permissions check for hypercalls.  It behaves exactly like the dpl field for
> GDT/LDT/IDT entries.
> 
> As the dpl is initialised to 0, hypercalls are restricted to cpl0 code until
> the OS explicitly chooses an alternative.
> 
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> --
> CC: Jan Beulich <JBeulich@suse.com>
> CC: Ian Campbell <ian.campbell@citrix.com>
> CC: Stefano Stabellini <stefano.stabellini@citrix.com>
> 
> Arm folks: Is something like this sufficiently generic to be useful on Arm,
> perhaps with more generic naming?
> 
> PV guest support for userspace hypercalls is substantially more involved, and
> will take longer to complete.
> ---
>  xen/arch/x86/hvm/hvm.c           | 25 ++++++++++++++++++++++++-
>  xen/include/asm-x86/hvm/domain.h |  2 ++
>  xen/include/public/hvm/hvm_op.h  |  8 ++++++++
>  3 files changed, 34 insertions(+), 1 deletion(-)
> 
> diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
> index 21470ec..e5a08db 100644
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -5228,7 +5228,8 @@ int hvm_do_hypercall(struct cpu_user_regs *regs)
>      case 4:
>      case 2:
>          hvm_get_segment_register(curr, x86_seg_ss, &sreg);
> -        if ( unlikely(sreg.attr.fields.dpl) )
> +        if ( unlikely(sreg.attr.fields.dpl <
> +                      currd->arch.hvm_domain.hypercall_dpl) )

Either I'm going mad, or that check should be '>' shouldn't it? (After all the test is currently effectively a test for > 0)

>          {
>      default:
>              regs->eax = -EPERM;
> @@ -6839,6 +6840,28 @@ long do_hvm_op(unsigned long op,
> XEN_GUEST_HANDLE_PARAM(void) arg)
>          rc = do_altp2m_op(arg);
>          break;
> 
> +    case HVMOP_set_hypercall_dpl:
> +    {
> +        xen_hvm_hypercall_dpl_t a;
> +        struct domain *d;
> +
> +        if ( copy_from_guest(&a, arg, 1 ) )
> +            return -EFAULT;
> +
> +        rc = rcu_lock_remote_domain_by_id(a.domid, &d);
> +        if ( rc != 0 )
> +            return rc;
> +
> +        if ( current->domain != d )
> +            return -EPERM;
> +
> +        if ( !is_hvm_domain(d) || a.dpl > 3 )
> +            return -EINVAL;
> +
> +        d->arch.hvm_domain.hypercall_dpl = a.dpl;
> +        break;
> +    }
> +
>      default:
>      {
>          gdprintk(XENLOG_DEBUG, "Bad HVM op %ld.\n", op);
> diff --git a/xen/include/asm-x86/hvm/domain.h b/xen/include/asm-
> x86/hvm/domain.h
> index a8cc2ad..006a142 100644
> --- a/xen/include/asm-x86/hvm/domain.h
> +++ b/xen/include/asm-x86/hvm/domain.h
> @@ -137,6 +137,8 @@ struct hvm_domain {
>      bool_t                 qemu_mapcache_invalidate;
>      bool_t                 is_s3_suspended;
> 
> +    uint32_t               hypercall_dpl;
> +

Why burn 32-bits for a value that's only 2 bits wide?

  Paul

>      /*
>       * TSC value that VCPUs use to calculate their tsc_offset value.
>       * Used during initialization and save/restore.
> diff --git a/xen/include/public/hvm/hvm_op.h
> b/xen/include/public/hvm/hvm_op.h
> index 1606185..f8247db 100644
> --- a/xen/include/public/hvm/hvm_op.h
> +++ b/xen/include/public/hvm/hvm_op.h
> @@ -489,6 +489,14 @@ struct xen_hvm_altp2m_op {
>  typedef struct xen_hvm_altp2m_op xen_hvm_altp2m_op_t;
>  DEFINE_XEN_GUEST_HANDLE(xen_hvm_altp2m_op_t);
> 
> +#define HVMOP_set_hypercall_dpl 26
> +struct xen_hvm_hypercall_dpl {
> +    domid_t domid;
> +    uint16_t dpl;  /* IN[1:0] cpl required to make hypercalls. */
> +};
> +typedef struct xen_hvm_hypercall_dpl xen_hvm_hypercall_dpl_t;
> +DEFINE_XEN_GUEST_HANDLE(xen_hvm_hypercall_dpl_t);
> +
>  #endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */
> 
>  /*
> --
> 2.1.4
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel
Jan Beulich Jan. 11, 2016, 2:44 p.m. UTC | #2
>>> On 11.01.16 at 14:59, <andrew.cooper3@citrix.com> wrote:
> Currently, hypercalls issued from HVM userspace will unconditionally fail
> with -EPERM.
> 
> This is inflexible, and a guest may wish to allow userspace to make
> hypercalls.

I thought previous discussion had made clear that routing these
through ioctls or alike is the right approach, and hence the patch
isn't needed. The more that an all-or-nothing approach seems
pretty bold.

> @@ -6839,6 +6840,28 @@ long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) arg)
>          rc = do_altp2m_op(arg);
>          break;
>  
> +    case HVMOP_set_hypercall_dpl:
> +    {
> +        xen_hvm_hypercall_dpl_t a;
> +        struct domain *d;
> +
> +        if ( copy_from_guest(&a, arg, 1 ) )
> +            return -EFAULT;
> +
> +        rc = rcu_lock_remote_domain_by_id(a.domid, &d);
> +        if ( rc != 0 )
> +            return rc;
> +
> +        if ( current->domain != d )
> +            return -EPERM;
> +
> +        if ( !is_hvm_domain(d) || a.dpl > 3 )
> +            return -EINVAL;

-EDOM perhaps for the right side?

> diff --git a/xen/include/asm-x86/hvm/domain.h b/xen/include/asm-x86/hvm/domain.h
> index a8cc2ad..006a142 100644
> --- a/xen/include/asm-x86/hvm/domain.h
> +++ b/xen/include/asm-x86/hvm/domain.h
> @@ -137,6 +137,8 @@ struct hvm_domain {
>      bool_t                 qemu_mapcache_invalidate;
>      bool_t                 is_s3_suspended;
>  
> +    uint32_t               hypercall_dpl;

uint8_t ?

Jan
Andrew Cooper Jan. 11, 2016, 5:17 p.m. UTC | #3
On 11/01/16 14:44, Jan Beulich wrote:
>>>> On 11.01.16 at 14:59, <andrew.cooper3@citrix.com> wrote:
>> Currently, hypercalls issued from HVM userspace will unconditionally fail
>> with -EPERM.
>>
>> This is inflexible, and a guest may wish to allow userspace to make
>> hypercalls.
> I thought previous discussion had made clear that routing these
> through ioctls or alike is the right approach, and hence the patch
> isn't needed. The more that an all-or-nothing approach seems
> pretty bold.

All other issues fixed in v2, but to answer this one specifically.

It is inappropriate for Xen to presume that all guests want Linux-like
handing of situations like this.  It is simply not true.

As part of getting my test framework ready to publish, I attempted to
port my XSA-106 unit tests to PV guests.  I have shelved that work as I
don't have sufficient time to fix PV trap handling in Xen at this present
time, but do plan to fix them in due course.

The bugs I have identified so far are:
* "INT n" handling assumes the instruction was 2 bytes long
* In some circumstances, Xen crashes the domain rather than injecting
#NP[sel]
* In most circumstances, Xen delivers #GP[sel] where #NP[sel] would be
correct
* Not possible to have non-dpl3 descriptors for #BP and #OF
* Not possible to mark an existing descriptor as not-present

All of these bugs exist because Xen PV was co-developed against Linux
without considerations towards a clean API/ABI.  It also means that any
PV guest wanting to have a non-Linux setup won't function in expected ways.

So from one point of view, sufficient justification for this change is
"because the Linux way isn't the only valid way to do this".

~Andrew
David Vrabel Jan. 11, 2016, 6:26 p.m. UTC | #4
On 11/01/16 17:17, Andrew Cooper wrote:
> So from one point of view, sufficient justification for this change is
> "because the Linux way isn't the only valid way to do this".

"Because we can" isn't a good justification for adding something new.
Particularly something that is trivially easy to (accidentally) misuse
and open a big security hole between userspace and kernel.

The vague idea for a userspace netfront that's floating around
internally is also not a good reason for pushing this feature at this time.

David
Andrew Cooper Jan. 11, 2016, 6:32 p.m. UTC | #5
On 11/01/16 18:26, David Vrabel wrote:
> On 11/01/16 17:17, Andrew Cooper wrote:
>> So from one point of view, sufficient justification for this change is
>> "because the Linux way isn't the only valid way to do this".
> "Because we can" isn't a good justification for adding something new.

"Because I need this to sensibly regression test bits of the hypervisor" is.

> Particularly something that is trivially easy to (accidentally) misuse
> and open a big security hole between userspace and kernel.

This is no conceptual difference to incorrectly updating a pagetable, or
having wrong dpl checks in the IDT.

An OS which doesn't use the hypercall can't shoot itself.  An OS which
does use it has plenty of other ways to accidentally compromise itself.

> The vague idea for a userspace netfront that's floating around
> internally is also not a good reason for pushing this feature at this time.

It is a valid (albeit unwise) alternate usecase, showing that the
concept isn't unique to my situation.

~Andrew
David Vrabel Jan. 11, 2016, 6:40 p.m. UTC | #6
On 11/01/16 18:32, Andrew Cooper wrote:
> On 11/01/16 18:26, David Vrabel wrote:
>> On 11/01/16 17:17, Andrew Cooper wrote:
>>> So from one point of view, sufficient justification for this change is
>>> "because the Linux way isn't the only valid way to do this".
>> "Because we can" isn't a good justification for adding something new.
> 
> "Because I need this to sensibly regression test bits of the hypervisor" is.

No.  Tests should not require a magic mode -- they should test the
existing ABIs guests actually use.

>> Particularly something that is trivially easy to (accidentally) misuse
>> and open a big security hole between userspace and kernel.
> 
> This is no conceptual difference to incorrectly updating a pagetable, or
> having wrong dpl checks in the IDT.

Yes there is.  This proposed ABI addition is impossible to use safely.

> An OS which doesn't use the hypercall can't shoot itself.  An OS which
> does use it has plenty of other ways to accidentally compromise itself.

This ABI allows /untrusted userspace/ to shoot the whole OS in the foot.
 It's quite different.

David
Andrew Cooper Jan. 11, 2016, 6:50 p.m. UTC | #7
On 11/01/16 18:40, David Vrabel wrote:
> On 11/01/16 18:32, Andrew Cooper wrote:
>> On 11/01/16 18:26, David Vrabel wrote:
>>> On 11/01/16 17:17, Andrew Cooper wrote:
>>>> So from one point of view, sufficient justification for this change is
>>>> "because the Linux way isn't the only valid way to do this".
>>> "Because we can" isn't a good justification for adding something new.
>> "Because I need this to sensibly regression test bits of the hypervisor" is.
> No.  Tests should not require a magic mode -- they should test the
> existing ABIs guests actually use.

It isn't a magic mode.

>
>>> Particularly something that is trivially easy to (accidentally) misuse
>>> and open a big security hole between userspace and kernel.
>> This is no conceptual difference to incorrectly updating a pagetable, or
>> having wrong dpl checks in the IDT.
> Yes there is.  This proposed ABI addition is impossible to use safely.

It is perfectly possible to use safely.  "Safe" is a matter of
perspective, and depends on the usecase.  My entire argument here is
that "The Linux way" isn’t the only way, and it is wrong of Xen to
enforce "the Linux way" as the only way.

>
>> An OS which doesn't use the hypercall can't shoot itself.  An OS which
>> does use it has plenty of other ways to accidentally compromise itself.
> This ABI allows /untrusted userspace/ to shoot the whole OS in the foot.
>  It's quite different.

Only if the OS chooses to permit this.  If an OS doesn't want itself to
be shot, it doesn't use this hypercall and everything works as before.

~Andrew
Jan Beulich Jan. 12, 2016, 7:33 a.m. UTC | #8
>>> On 11.01.16 at 18:17, <andrew.cooper3@citrix.com> wrote:
> On 11/01/16 14:44, Jan Beulich wrote:
>>>>> On 11.01.16 at 14:59, <andrew.cooper3@citrix.com> wrote:
>>> Currently, hypercalls issued from HVM userspace will unconditionally fail
>>> with -EPERM.
>>>
>>> This is inflexible, and a guest may wish to allow userspace to make
>>> hypercalls.
>> I thought previous discussion had made clear that routing these
>> through ioctls or alike is the right approach, and hence the patch
>> isn't needed. The more that an all-or-nothing approach seems
>> pretty bold.
> 
> All other issues fixed in v2, but to answer this one specifically.
> 
> In it inappropriate for Xen to presume that all guests want Linux-like
> handing of situations like this.  It is simply not true.
> 
> As part of getting my test framework ready to publish, I attempted to
> port my XSA-106 unit tests to PV guests.  I have shelved that work as I
> don't have sufficient time to fix PV trap handing in Xen at this present
> time, but do plan to fix them in due course.
> 
> The bugs I have identified so far are:
> * "INT n" handling assumes the instruction was 2 bytes long
> * In some circumstances, Xen crashes the domain rather than injecting
> #NP[sel]
> * In most circumstances, Xen delivers #GP[sel] where #NP[sel] would be
> correct

All of these could be considered part of the awareness
requirements towards guests.

> * Not possible to have non-dpl3 descriptors for #BP and #OF

Actually the issue is broader I think (I've stumbled across this
limitation before, namely in the context of the #AC issue having
been the subject of a recent XSA) - you can't associate a DPL
with any exception vector.

> * Not possible to mark an existing descriptor as not-present

"Not easily possible" I suppose you mean, since one can by re-
writing the entire table.

> So from one point of view, sufficient justification for this change is
> "because the Linux way isn't the only valid way to do this".

But otoh bypassing a layer is generally a bad thing. Any "normal"
OS wouldn't want to allow user mode to do arbitrary things under
its feet. Any OS fully trusting what would be user mode could as
well run applications in ring 0.

Plus - see privcmd - any OS wanting to expose hypercalls can do
so by whatever mechanism is appropriate in that OS.

Jan
Andrew Cooper Jan. 12, 2016, 10:57 a.m. UTC | #9
On 12/01/16 07:33, Jan Beulich wrote:
>>>> On 11.01.16 at 18:17, <andrew.cooper3@citrix.com> wrote:
>> On 11/01/16 14:44, Jan Beulich wrote:
>>>>>> On 11.01.16 at 14:59, <andrew.cooper3@citrix.com> wrote:
>>>> Currently, hypercalls issued from HVM userspace will unconditionally fail
>>>> with -EPERM.
>>>>
>>>> This is inflexible, and a guest may wish to allow userspace to make
>>>> hypercalls.
>>> I thought previous discussion had made clear that routing these
>>> through ioctls or alike is the right approach, and hence the patch
>>> isn't needed. The more that an all-or-nothing approach seems
>>> pretty bold.
>> All other issues fixed in v2, but to answer this one specifically.
>>
>> In it inappropriate for Xen to presume that all guests want Linux-like
>> handing of situations like this.  It is simply not true.
>>
>> As part of getting my test framework ready to publish, I attempted to
>> port my XSA-106 unit tests to PV guests.  I have shelved that work as I
>> don't have sufficient time to fix PV trap handing in Xen at this present
>> time, but do plan to fix them in due course.
>>
>> The bugs I have identified so far are:
>> * "INT n" handling assumes the instruction was 2 bytes long
>> * In some circumstances, Xen crashes the domain rather than injecting
>> #NP[sel]
>> * In most circumstances, Xen delivers #GP[sel] where #NP[sel] would be
>> correct
> All of these could be considered part of the awareness
> requirements towards guests.

The first causes corruption of process state in circumstances which
wouldn't under native, including userspace state.

You could make that argument for the final two.  I reckon it is an
unreasonable requirement.

>
>> * Not possible to have non-dpl3 descriptors for #BP and #OF
> Actually the issue is broader I think (I've stumbled across this
> limitation before, namely in the context of the #AC issue having
> been the subject of a recent XSA) - you can't associate a DPL
> with any exception vector.

Ah - I had not patched Xen up sufficiently for the unit test to notice
this (the domain crashes rather than #NP exceptions were prohibitive).

>
>> * Not possible to mark an existing descriptor as not-present
> "Not easily possible" I suppose you mean, since one can by re-
> writing the entire table.

Can't be done atomically.  Even if interrupts are disabled, there is a
window where NMIs and MCEs would be lost.


Writing a PV guest from scratch has been very enlightening to
demonstrate how much of a trainwreck the ABI is.  Almost nothing is
documented.  Some bits which are documented are misleading.  Several
areas needlessly deviate from x86 architectural behaviour.  Almost any
deviation from the way Linux does things ends up in situations which
don't function.

~Andrew
George Dunlap Jan. 12, 2016, 11:03 a.m. UTC | #10
On Tue, Jan 12, 2016 at 10:57 AM, Andrew Cooper
<andrew.cooper3@citrix.com> wrote:
> Writing a PV guest from scratch has been very enlightening to
> demonstrate how much of a trainwreck the ABI is.  Almost nothing is
> documented.  Some bits which are documented are misleading.  Several
> areas needlessly deviate from x86 architectural behaviour.  Almost any
> deviation from the way Linux does things ends up in situations which
> don't function.

Something HPA has been complaining about for nearly a decade
now, I think...

 -George
Stefano Stabellini Jan. 12, 2016, 12:07 p.m. UTC | #11
On Mon, 11 Jan 2016, David Vrabel wrote:
> On 11/01/16 17:17, Andrew Cooper wrote:
> > So from one point of view, sufficient justification for this change is
> > "because the Linux way isn't the only valid way to do this".
> 
> "Because we can" isn't a good justification for adding something new.
> Particularly something that is trivially easy to (accidentally) misuse
> and open a big security hole between userspace and kernel.
> 
> The vague idea for a userspace netfront that's floating around
> internally is also not a good reason for pushing this feature at this time.

I agree with David, but I might have another good use case for this.

Consider the following scenario: we have a Xen HVM guest, with Xen
installed inside of it (nested virtualization). I'll refer to Xen
running on the host as L0 Xen and Xen running inside the VM as L1 Xen.
Similarly we have two dom0 running, the one with access to the physical
hardware, L0 Dom0, and the one running inside the VM, L1 Dom0.

Let's suppose that we want to lay the groundwork for L1 Dom0 to use PV
frontend drivers, netfront and blkfront to speed up execution. In order
to do that, the first thing it needs to do is making an hypercall to L0
Xen. That's because netfront and blkfront needs to communicate with
netback and blkback in L0 Dom0: event channels and grant tables are the
ones provided by L0 Xen.

However L0 Xen refuses hypercalls from ring 3, and L1 Dom0 is running in
ring 3 so unfortunately it cannot actually do it.

With this hypercall in place, it is conceivable for L1 Dom0 to instruct
L1 Xen to make a HVMOP_set_hypercall_dpl call and allow L1 Dom0 to setup
netfront and blkfront with L0 Xen.

Fun, isn't it.
Jan Beulich Jan. 12, 2016, 3:06 p.m. UTC | #12
>>> On 12.01.16 at 13:07, <stefano.stabellini@eu.citrix.com> wrote:
> On Mon, 11 Jan 2016, David Vrabel wrote:
>> On 11/01/16 17:17, Andrew Cooper wrote:
>> > So from one point of view, sufficient justification for this change is
>> > "because the Linux way isn't the only valid way to do this".
>> 
>> "Because we can" isn't a good justification for adding something new.
>> Particularly something that is trivially easy to (accidentally) misuse
>> and open a big security hole between userspace and kernel.
>> 
>> The vague idea for a userspace netfront that's floating around
>> internally is also not a good reason for pushing this feature at this time.
> 
> I agree with David, but I might have another good use case for this.
> 
> Consider the following scenario: we have a Xen HVM guest, with Xen
> installed inside of it (nested virtualization). I'll refer to Xen
> running on the host as L0 Xen and Xen running inside the VM as L1 Xen.
> Similarly we have two dom0 running, the one with access to the physical
> hardware, L0 Dom0, and the one running inside the VM, L1 Dom0.
> 
> Let's suppose that we want to lay the groundwork for L1 Dom0 to use PV
> frontend drivers, netfront and blkfront to speed up execution. In order
> to do that, the first thing it needs to do is making an hypercall to L0
> Xen. That's because netfront and blkfront needs to communicate with
> netback and blkback in L0 Dom0: event channels and grant tables are the
> ones provided by L0 Xen.

That's again a layering violation (bypassing the L1 hypervisor).

Jan
Stefano Stabellini Jan. 12, 2016, 5:05 p.m. UTC | #13
On Tue, 12 Jan 2016, Jan Beulich wrote:
> >>> On 12.01.16 at 13:07, <stefano.stabellini@eu.citrix.com> wrote:
> > On Mon, 11 Jan 2016, David Vrabel wrote:
> >> On 11/01/16 17:17, Andrew Cooper wrote:
> >> > So from one point of view, sufficient justification for this change is
> >> > "because the Linux way isn't the only valid way to do this".
> >> 
> >> "Because we can" isn't a good justification for adding something new.
> >> Particularly something that is trivially easy to (accidentally) misuse
> >> and open a big security hole between userspace and kernel.
> >> 
> >> The vague idea for a userspace netfront that's floating around
> >> internally is also not a good reason for pushing this feature at this time.
> > 
> > I agree with David, but I might have another good use case for this.
> > 
> > Consider the following scenario: we have a Xen HVM guest, with Xen
> > installed inside of it (nested virtualization). I'll refer to Xen
> > running on the host as L0 Xen and Xen running inside the VM as L1 Xen.
> > Similarly we have two dom0 running, the one with access to the physical
> > hardware, L0 Dom0, and the one running inside the VM, L1 Dom0.
> > 
> > Let's suppose that we want to lay the groundwork for L1 Dom0 to use PV
> > frontend drivers, netfront and blkfront to speed up execution. In order
> > to do that, the first thing it needs to do is making an hypercall to L0
> > Xen. That's because netfront and blkfront needs to communicate with
> > netback and blkback in L0 Dom0: event channels and grant tables are the
> > ones provided by L0 Xen.
> 
> That's again a layering violation (bypassing the L1 hypervisor).

True, but in this scenario it might be necessary for performance
reasons: otherwise every hypercall would need to bounce off L1 Xen,
possibly cancelling the benefits of running netfront and blkfront in the
first place. I don't have numbers though.
Jürgen Groß Jan. 12, 2016, 5:10 p.m. UTC | #14
On 12/01/16 18:05, Stefano Stabellini wrote:
> On Tue, 12 Jan 2016, Jan Beulich wrote:
>>>>> On 12.01.16 at 13:07, <stefano.stabellini@eu.citrix.com> wrote:
>>> On Mon, 11 Jan 2016, David Vrabel wrote:
>>>> On 11/01/16 17:17, Andrew Cooper wrote:
>>>>> So from one point of view, sufficient justification for this change is
>>>>> "because the Linux way isn't the only valid way to do this".
>>>>
>>>> "Because we can" isn't a good justification for adding something new.
>>>> Particularly something that is trivially easy to (accidentally) misuse
>>>> and open a big security hole between userspace and kernel.
>>>>
>>>> The vague idea for a userspace netfront that's floating around
>>>> internally is also not a good reason for pushing this feature at this time.
>>>
>>> I agree with David, but I might have another good use case for this.
>>>
>>> Consider the following scenario: we have a Xen HVM guest, with Xen
>>> installed inside of it (nested virtualization). I'll refer to Xen
>>> running on the host as L0 Xen and Xen running inside the VM as L1 Xen.
>>> Similarly we have two dom0 running, the one with access to the physical
>>> hardware, L0 Dom0, and the one running inside the VM, L1 Dom0.
>>>
>>> Let's suppose that we want to lay the groundwork for L1 Dom0 to use PV
>>> frontend drivers, netfront and blkfront to speed up execution. In order
>>> to do that, the first thing it needs to do is making an hypercall to L0
>>> Xen. That's because netfront and blkfront needs to communicate with
>>> netback and blkback in L0 Dom0: event channels and grant tables are the
>>> ones provided by L0 Xen.
>>
>> That's again a layering violation (bypassing the L1 hypervisor).
> 
> True, but in this scenario it might be necessary for performance
> reasons: otherwise every hypercall would need to bounce off L1 Xen,
> possibly cancelling the benefits of running netfront and blkfront in the
> first place. I don't have numbers though.

How is this supposed to work? How can dom0 make hypercalls to L1 _or_ L0
hypervisor? How can it select the hypervisor it is talking to?

Juergen
Stefano Stabellini Jan. 12, 2016, 5:23 p.m. UTC | #15
On Tue, 12 Jan 2016, Juergen Gross wrote:
> On 12/01/16 18:05, Stefano Stabellini wrote:
> > On Tue, 12 Jan 2016, Jan Beulich wrote:
> >>>>> On 12.01.16 at 13:07, <stefano.stabellini@eu.citrix.com> wrote:
> >>> On Mon, 11 Jan 2016, David Vrabel wrote:
> >>>> On 11/01/16 17:17, Andrew Cooper wrote:
> >>>>> So from one point of view, sufficient justification for this change is
> >>>>> "because the Linux way isn't the only valid way to do this".
> >>>>
> >>>> "Because we can" isn't a good justification for adding something new.
> >>>> Particularly something that is trivially easy to (accidentally) misuse
> >>>> and open a big security hole between userspace and kernel.
> >>>>
> >>>> The vague idea for a userspace netfront that's floating around
> >>>> internally is also not a good reason for pushing this feature at this time.
> >>>
> >>> I agree with David, but I might have another good use case for this.
> >>>
> >>> Consider the following scenario: we have a Xen HVM guest, with Xen
> >>> installed inside of it (nested virtualization). I'll refer to Xen
> >>> running on the host as L0 Xen and Xen running inside the VM as L1 Xen.
> >>> Similarly we have two dom0 running, the one with access to the physical
> >>> hardware, L0 Dom0, and the one running inside the VM, L1 Dom0.
> >>>
> >>> Let's suppose that we want to lay the groundwork for L1 Dom0 to use PV
> >>> frontend drivers, netfront and blkfront to speed up execution. In order
> >>> to do that, the first thing it needs to do is making an hypercall to L0
> >>> Xen. That's because netfront and blkfront needs to communicate with
> >>> netback and blkback in L0 Dom0: event channels and grant tables are the
> >>> ones provided by L0 Xen.
> >>
> >> That's again a layering violation (bypassing the L1 hypervisor).
> > 
> > True, but in this scenario it might be necessary for performance
> > reasons: otherwise every hypercall would need to bounce off L1 Xen,
> > possibly cancelling the benefits of running netfront and blkfront in the
> > first place. I don't have numbers though.
> 
> How is this supposed to work? How can dom0 make hypercalls to L1 _or_ L0
> hypervisor? How can it select the hypervisor it is talking to?

From L0 Xen point of view, the guest is just a normal PV on HVM guest,
it doesn't matter what's inside, so L1 Dom0 is going to make hypercalls
to L0 Xen like any other PV on HVM guests: mapping the hypercall page by
writing to the right MSR, retrieved via cpuid, then calling into the
hypercall page. L0 Xen would populate the hypercall page as usual for PV
on HVM guests, using vmcall instructions. However once that is done, L0
Xen would still refuse the hypercalls because they are issued from ring
3.
Jürgen Groß Jan. 13, 2016, 5:12 a.m. UTC | #16
On 12/01/16 18:23, Stefano Stabellini wrote:
> On Tue, 12 Jan 2016, Juergen Gross wrote:
>> On 12/01/16 18:05, Stefano Stabellini wrote:
>>> On Tue, 12 Jan 2016, Jan Beulich wrote:
>>>>>>> On 12.01.16 at 13:07, <stefano.stabellini@eu.citrix.com> wrote:
>>>>> On Mon, 11 Jan 2016, David Vrabel wrote:
>>>>>> On 11/01/16 17:17, Andrew Cooper wrote:
>>>>>>> So from one point of view, sufficient justification for this change is
>>>>>>> "because the Linux way isn't the only valid way to do this".
>>>>>>
>>>>>> "Because we can" isn't a good justification for adding something new.
>>>>>> Particularly something that is trivially easy to (accidentally) misuse
>>>>>> and open a big security hole between userspace and kernel.
>>>>>>
>>>>>> The vague idea for a userspace netfront that's floating around
>>>>>> internally is also not a good reason for pushing this feature at this time.
>>>>>
>>>>> I agree with David, but I might have another good use case for this.
>>>>>
>>>>> Consider the following scenario: we have a Xen HVM guest, with Xen
>>>>> installed inside of it (nested virtualization). I'll refer to Xen
>>>>> running on the host as L0 Xen and Xen running inside the VM as L1 Xen.
>>>>> Similarly we have two dom0 running, the one with access to the physical
>>>>> hardware, L0 Dom0, and the one running inside the VM, L1 Dom0.
>>>>>
>>>>> Let's suppose that we want to lay the groundwork for L1 Dom0 to use PV
>>>>> frontend drivers, netfront and blkfront to speed up execution. In order
>>>>> to do that, the first thing it needs to do is making an hypercall to L0
>>>>> Xen. That's because netfront and blkfront needs to communicate with
>>>>> netback and blkback in L0 Dom0: event channels and grant tables are the
>>>>> ones provided by L0 Xen.
>>>>
>>>> That's again a layering violation (bypassing the L1 hypervisor).
>>>
>>> True, but in this scenario it might be necessary for performance
>>> reasons: otherwise every hypercall would need to bounce off L1 Xen,
>>> possibly cancelling the benefits of running netfront and blkfront in the
>>> first place. I don't have numbers though.
>>
>> How is this supposed to work? How can dom0 make hypercalls to L1 _or_ L0
>> hypervisor? How can it select the hypervisor it is talking to?
> 
> From L0 Xen point of view, the guest is just a normal PV on HVM guest,
> it doesn't matter what's inside, so L1 Dom0 is going to make hypercalls
> to L0 Xen like any other PV on HVM guests: mapping the hypercall page by
> writing to the right MSR, retrieved via cpuid, then calling into the

But how to specify that cpuid/MSR should target the L0 hypervisor
instead of L1? And even if this would be working, just by mapping
the correct page the instructions doing the transition to the
hypervisor would still result in entering the L1 hypervisor, as
those instructions must be handled by L1 first in order to make
nested virtualization work.


Juergen
Stefano Stabellini Jan. 13, 2016, 10:41 a.m. UTC | #17
On Wed, 13 Jan 2016, Juergen Gross wrote:
> On 12/01/16 18:23, Stefano Stabellini wrote:
> > On Tue, 12 Jan 2016, Juergen Gross wrote:
> >> On 12/01/16 18:05, Stefano Stabellini wrote:
> >>> On Tue, 12 Jan 2016, Jan Beulich wrote:
> >>>>>>> On 12.01.16 at 13:07, <stefano.stabellini@eu.citrix.com> wrote:
> >>>>> On Mon, 11 Jan 2016, David Vrabel wrote:
> >>>>>> On 11/01/16 17:17, Andrew Cooper wrote:
> >>>>>>> So from one point of view, sufficient justification for this change is
> >>>>>>> "because the Linux way isn't the only valid way to do this".
> >>>>>>
> >>>>>> "Because we can" isn't a good justification for adding something new.
> >>>>>> Particularly something that is trivially easy to (accidentally) misuse
> >>>>>> and open a big security hole between userspace and kernel.
> >>>>>>
> >>>>>> The vague idea for a userspace netfront that's floating around
> >>>>>> internally is also not a good reason for pushing this feature at this time.
> >>>>>
> >>>>> I agree with David, but I might have another good use case for this.
> >>>>>
> >>>>> Consider the following scenario: we have a Xen HVM guest, with Xen
> >>>>> installed inside of it (nested virtualization). I'll refer to Xen
> >>>>> running on the host as L0 Xen and Xen running inside the VM as L1 Xen.
> >>>>> Similarly we have two dom0 running, the one with access to the physical
> >>>>> hardware, L0 Dom0, and the one running inside the VM, L1 Dom0.
> >>>>>
> >>>>> Let's suppose that we want to lay the groundwork for L1 Dom0 to use PV
> >>>>> frontend drivers, netfront and blkfront to speed up execution. In order
> >>>>> to do that, the first thing it needs to do is making an hypercall to L0
> >>>>> Xen. That's because netfront and blkfront needs to communicate with
> >>>>> netback and blkback in L0 Dom0: event channels and grant tables are the
> >>>>> ones provided by L0 Xen.
> >>>>
> >>>> That's again a layering violation (bypassing the L1 hypervisor).
> >>>
> >>> True, but in this scenario it might be necessary for performance
> >>> reasons: otherwise every hypercall would need to bounce off L1 Xen,
> >>> possibly cancelling the benefits of running netfront and blkfront in the
> >>> first place. I don't have numbers though.
> >>
> >> How is this supposed to work? How can dom0 make hypercalls to L1 _or_ L0
> >> hypervisor? How can it select the hypervisor it is talking to?
> > 
> > From L0 Xen point of view,
> > it doesn't matter what's inside, so L1 Dom0 is going to make hypercalls
> > to L0 Xen like any other PV on HVM guests: mapping the hypercall page by
> > writing to the right MSR, retrieved via cpuid, then calling into the
> 
> But how to specify that cpuid/MSR should target the L0 hypervisor
> instead of L1?

Keeping in mind that L1 Dom0 is a PV guest from L1 Xen point of view,
but a PV on HVM guest from L0 Xen point of view, it is true that the
cpuid could be an issue because the cpuid would be generated by L0 Xen,
but then would get filtered by L1 Xen. However the MSR should be OK,
assuming that L1 Xen allows access to it: from inside the VM it would
look like a regular machine MSR, it couldn't get confused with anything
causing hypercalls to L1 Xen.


> And even if this would be working, just by mapping
> the correct page the instructions doing the transition to the
> hypervisor would still result in entering the L1 hypervisor, as
> those instructions must be handled by L1 first in order to make
> nested virtualization work.

This is wrong. The hypercall page populated by L0 Xen would contain
vmcall instructions. When L1 Dom0 calls into the hypercall page, it
would end up making a vmcall, which brings it directly to L0 Xen,
skipping L1 Xen.
Jürgen Groß Jan. 13, 2016, 11:14 a.m. UTC | #18
On 13/01/16 11:41, Stefano Stabellini wrote:
> On Wed, 13 Jan 2016, Juergen Gross wrote:
>> On 12/01/16 18:23, Stefano Stabellini wrote:
>>> On Tue, 12 Jan 2016, Juergen Gross wrote:
>>>> On 12/01/16 18:05, Stefano Stabellini wrote:
>>>>> On Tue, 12 Jan 2016, Jan Beulich wrote:
>>>>>>>>> On 12.01.16 at 13:07, <stefano.stabellini@eu.citrix.com> wrote:
>>>>>>> On Mon, 11 Jan 2016, David Vrabel wrote:
>>>>>>>> On 11/01/16 17:17, Andrew Cooper wrote:
>>>>>>>>> So from one point of view, sufficient justification for this change is
>>>>>>>>> "because the Linux way isn't the only valid way to do this".
>>>>>>>>
>>>>>>>> "Because we can" isn't a good justification for adding something new.
>>>>>>>> Particularly something that is trivially easy to (accidentally) misuse
>>>>>>>> and open a big security hole between userspace and kernel.
>>>>>>>>
>>>>>>>> The vague idea for a userspace netfront that's floating around
>>>>>>>> internally is also not a good reason for pushing this feature at this time.
>>>>>>>
>>>>>>> I agree with David, but I might have another good use case for this.
>>>>>>>
>>>>>>> Consider the following scenario: we have a Xen HVM guest, with Xen
>>>>>>> installed inside of it (nested virtualization). I'll refer to Xen
>>>>>>> running on the host as L0 Xen and Xen running inside the VM as L1 Xen.
>>>>>>> Similarly we have two dom0 running, the one with access to the physical
>>>>>>> hardware, L0 Dom0, and the one running inside the VM, L1 Dom0.
>>>>>>>
>>>>>>> Let's suppose that we want to lay the groundwork for L1 Dom0 to use PV
>>>>>>> frontend drivers, netfront and blkfront to speed up execution. In order
>>>>>>> to do that, the first thing it needs to do is making an hypercall to L0
>>>>>>> Xen. That's because netfront and blkfront needs to communicate with
>>>>>>> netback and blkback in L0 Dom0: event channels and grant tables are the
>>>>>>> ones provided by L0 Xen.
>>>>>>
>>>>>> That's again a layering violation (bypassing the L1 hypervisor).
>>>>>
>>>>> True, but in this scenario it might be necessary for performance
>>>>> reasons: otherwise every hypercall would need to bounce off L1 Xen,
>>>>> possibly cancelling the benefits of running netfront and blkfront in the
>>>>> first place. I don't have numbers though.
>>>>
>>>> How is this supposed to work? How can dom0 make hypercalls to L1 _or_ L0
>>>> hypervisor? How can it select the hypervisor it is talking to?
>>>
>>> From L0 Xen point of view,
>>> it doesn't matter what's inside, so L1 Dom0 is going to make hypercalls
>>> to L0 Xen like any other PV on HVM guests: mapping the hypercall page by
>>> writing to the right MSR, retrieved via cpuid, then calling into the
>>
>> But how to specify that cpuid/MSR should target the L0 hypervisor
>> instead of L1?
> 
> Keeping in mind that L1 Dom0 is a PV guest from L1 Xen point of view,
> but a PV on HVM guest from L0 Xen point of view, it is true that the
> cpuid could be an issue because the cpuid would be generated by L0 Xen,
> but then would get filtered by L1 Xen. However the MSR should be OK,
> assuming that L1 Xen allows access to it: from inside the VM it would
> look like a regular machine MSR, it couldn't get confused with anything
> causing hypercalls to L1 Xen.

L1 Xen wouldn't allow access to it. Otherwise it couldn't ever set up
a hypercall page for one of its guests.

>> And even if this would be working, just by mapping
>> the correct page the instructions doing the transition to the
>> hypervisor would still result in entering the L1 hypervisor, as
>> those instructions must be handled by L1 first in order to make
>> nested virtualization work.
> 
> This is wrong. The hypercall page populated by L0 Xen would contain
> vmcall instructions. When L1 Dom0 calls into the hypercall page, it
> would end up making a vmcall, which brings it directly to L0 Xen,
> skipping L1 Xen.

Sure. And L0 Xen will see that this guest is subject to nested
virtualization and is reflecting the vmcall to L1 Xen (see e.g.
xen/arch/x86/hvm/svm/nestedsvm.c, nestedsvm_check_intercepts() ).
How else would L1 Xen ever get a vmcall of one of its guests?


Juergen
Stefano Stabellini Jan. 13, 2016, 11:26 a.m. UTC | #19
On Wed, 13 Jan 2016, Juergen Gross wrote:
> On 13/01/16 11:41, Stefano Stabellini wrote:
> > On Wed, 13 Jan 2016, Juergen Gross wrote:
> >> On 12/01/16 18:23, Stefano Stabellini wrote:
> >>> On Tue, 12 Jan 2016, Juergen Gross wrote:
> >>>> On 12/01/16 18:05, Stefano Stabellini wrote:
> >>>>> On Tue, 12 Jan 2016, Jan Beulich wrote:
> >>>>>>>>> On 12.01.16 at 13:07, <stefano.stabellini@eu.citrix.com> wrote:
> >>>>>>> On Mon, 11 Jan 2016, David Vrabel wrote:
> >>>>>>>> On 11/01/16 17:17, Andrew Cooper wrote:
> >>>>>>>>> So from one point of view, sufficient justification for this change is
> >>>>>>>>> "because the Linux way isn't the only valid way to do this".
> >>>>>>>>
> >>>>>>>> "Because we can" isn't a good justification for adding something new.
> >>>>>>>> Particularly something that is trivially easy to (accidentally) misuse
> >>>>>>>> and open a big security hole between userspace and kernel.
> >>>>>>>>
> >>>>>>>> The vague idea for a userspace netfront that's floating around
> >>>>>>>> internally is also not a good reason for pushing this feature at this time.
> >>>>>>>
> >>>>>>> I agree with David, but I might have another good use case for this.
> >>>>>>>
> >>>>>>> Consider the following scenario: we have a Xen HVM guest, with Xen
> >>>>>>> installed inside of it (nested virtualization). I'll refer to Xen
> >>>>>>> running on the host as L0 Xen and Xen running inside the VM as L1 Xen.
> >>>>>>> Similarly we have two dom0 running, the one with access to the physical
> >>>>>>> hardware, L0 Dom0, and the one running inside the VM, L1 Dom0.
> >>>>>>>
> >>>>>>> Let's suppose that we want to lay the groundwork for L1 Dom0 to use PV
> >>>>>>> frontend drivers, netfront and blkfront to speed up execution. In order
> >>>>>>> to do that, the first thing it needs to do is making an hypercall to L0
> >>>>>>> Xen. That's because netfront and blkfront needs to communicate with
> >>>>>>> netback and blkback in L0 Dom0: event channels and grant tables are the
> >>>>>>> ones provided by L0 Xen.
> >>>>>>
> >>>>>> That's again a layering violation (bypassing the L1 hypervisor).
> >>>>>
> >>>>> True, but in this scenario it might be necessary for performance
> >>>>> reasons: otherwise every hypercall would need to bounce off L1 Xen,
> >>>>> possibly cancelling the benefits of running netfront and blkfront in the
> >>>>> first place. I don't have numbers though.
> >>>>
> >>>> How is this supposed to work? How can dom0 make hypercalls to L1 _or_ L0
> >>>> hypervisor? How can it select the hypervisor it is talking to?
> >>>
> >>> From L0 Xen point of view,
> >>> it doesn't matter what's inside, so L1 Dom0 is going to make hypercalls
> >>> to L0 Xen like any other PV on HVM guests: mapping the hypercall page by
> >>> writing to the right MSR, retrieved via cpuid, then calling into the
> >>
> >> But how to specify that cpuid/MSR should target the L0 hypervisor
> >> instead of L1?
> > 
> > Keeping in mind that L1 Dom0 is a PV guest from L1 Xen point of view,
> > but a PV on HVM guest from L0 Xen point of view, it is true that the
> > cpuid could be an issue because the cpuid would be generated by L0 Xen,
> > but then would get filtered by L1 Xen. However the MSR should be OK,
> > assuming that L1 Xen allows access to it: from inside the VM it would
> > look like a regular machine MSR, it couldn't get confused with anything
> > causing hypercalls to L1 Xen.
> 
> L1 Xen wouldn't allow access to it. Otherwise it couldn't ever set up
> a hypercall page for one of its guests.

If they are PV guests, the hypercall page is not mapped via MSRs.


> >> And even if this would be working, just by mapping
> >> the correct page the instructions doing the transition to the
> >> hypervisor would still result in entering the L1 hypervisor, as
> >> those instructions must be handled by L1 first in order to make
> >> nested virtualization work.
> > 
> > This is wrong. The hypercall page populated by L0 Xen would contain
> > vmcall instructions. When L1 Dom0 calls into the hypercall page, it
> > would end up making a vmcall, which brings it directly to L0 Xen,
> > skipping L1 Xen.
> 
> Sure. And L0 Xen will see that this guest is subject to nested
> virtualization and is reflecting the vmcall to L1 Xen (see e.g.
> xen/arch/x86/hvm/svm/nestedsvm.c, nestedsvm_check_intercepts() ).
> How else would L1 Xen ever get a vmcall of one of its guests?

Only if nested_hvm is enabled, I believe. Sorry for not being clearer
earlier: I am talking about nested virtualization without nested
vmx/svm, which is the default.

But you have a good point there: if nested_hvm is enabled, there should
still be a way for L1 Dom0 to issue an hypercall to L0 Xen, otherwise
how could L1 Dom0 ever set up netfront and blkfront?
Jürgen Groß Jan. 13, 2016, 11:32 a.m. UTC | #20
On 13/01/16 12:26, Stefano Stabellini wrote:
> On Wed, 13 Jan 2016, Juergen Gross wrote:
>> On 13/01/16 11:41, Stefano Stabellini wrote:
>>> On Wed, 13 Jan 2016, Juergen Gross wrote:
>>>> On 12/01/16 18:23, Stefano Stabellini wrote:
>>>>> On Tue, 12 Jan 2016, Juergen Gross wrote:
>>>>>> On 12/01/16 18:05, Stefano Stabellini wrote:
>>>>>>> On Tue, 12 Jan 2016, Jan Beulich wrote:
>>>>>>>>>>> On 12.01.16 at 13:07, <stefano.stabellini@eu.citrix.com> wrote:
>>>>>>>>> On Mon, 11 Jan 2016, David Vrabel wrote:
>>>>>>>>>> On 11/01/16 17:17, Andrew Cooper wrote:
>>>>>>>>>>> So from one point of view, sufficient justification for this change is
>>>>>>>>>>> "because the Linux way isn't the only valid way to do this".
>>>>>>>>>>
>>>>>>>>>> "Because we can" isn't a good justification for adding something new.
>>>>>>>>>> Particularly something that is trivially easy to (accidentally) misuse
>>>>>>>>>> and open a big security hole between userspace and kernel.
>>>>>>>>>>
>>>>>>>>>> The vague idea for a userspace netfront that's floating around
>>>>>>>>>> internally is also not a good reason for pushing this feature at this time.
>>>>>>>>>
>>>>>>>>> I agree with David, but I might have another good use case for this.
>>>>>>>>>
>>>>>>>>> Consider the following scenario: we have a Xen HVM guest, with Xen
>>>>>>>>> installed inside of it (nested virtualization). I'll refer to Xen
>>>>>>>>> running on the host as L0 Xen and Xen running inside the VM as L1 Xen.
>>>>>>>>> Similarly we have two dom0 running, the one with access to the physical
>>>>>>>>> hardware, L0 Dom0, and the one running inside the VM, L1 Dom0.
>>>>>>>>>
>>>>>>>>> Let's suppose that we want to lay the groundwork for L1 Dom0 to use PV
>>>>>>>>> frontend drivers, netfront and blkfront to speed up execution. In order
>>>>>>>>> to do that, the first thing it needs to do is making an hypercall to L0
>>>>>>>>> Xen. That's because netfront and blkfront needs to communicate with
>>>>>>>>> netback and blkback in L0 Dom0: event channels and grant tables are the
>>>>>>>>> ones provided by L0 Xen.
>>>>>>>>
>>>>>>>> That's again a layering violation (bypassing the L1 hypervisor).
>>>>>>>
>>>>>>> True, but in this scenario it might be necessary for performance
>>>>>>> reasons: otherwise every hypercall would need to bounce off L1 Xen,
>>>>>>> possibly cancelling the benefits of running netfront and blkfront in the
>>>>>>> first place. I don't have numbers though.
>>>>>>
>>>>>> How is this supposed to work? How can dom0 make hypercalls to L1 _or_ L0
>>>>>> hypervisor? How can it select the hypervisor it is talking to?
>>>>>
>>>>> From L0 Xen point of view,
>>>>> it doesn't matter what's inside, so L1 Dom0 is going to make hypercalls
>>>>> to L0 Xen like any other PV on HVM guests: mapping the hypercall page by
>>>>> writing to the right MSR, retrieved via cpuid, then calling into the
>>>>
>>>> But how to specify that cpuid/MSR should target the L0 hypervisor
>>>> instead of L1?
>>>
>>> Keeping in mind that L1 Dom0 is a PV guest from L1 Xen point of view,
>>> but a PV on HVM guest from L0 Xen point of view, it is true that the
>>> cpuid could be an issue because the cpuid would be generated by L0 Xen,
>>> but then would get filtered by L1 Xen. However the MSR should be OK,
>>> assuming that L1 Xen allows access to it: from inside the VM it would
>>> look like a regular machine MSR, it couldn't get confused with anything
>>> causing hypercalls to L1 Xen.
>>
>> L1 Xen wouldn't allow access to it. Otherwise it couldn't ever setup
>> a hypercall page for one of it's guests.
> 
> If they are PV guests, the hypercall page is not mapped via MSRs.
> 
> 
>>>> And even if this would be working, just by mapping
>>>> the correct page the instructions doing the transition to the
>>>> hypervisor would still result in entering the L1 hypervisor, as
>>>> those instructions must be handled by L1 first in order to make
>>>> nested virtualization work.
>>>
>>> This is wrong. The hypercall page populated by L0 Xen would contain
>>> vmcall instructions. When L1 Dom0 calls into the hypercall page, it
>>> would end up making a vmcall, which brings it directly to L0 Xen,
>>> skipping L1 Xen.
>>
>> Sure. And L0 Xen will see that this guest is subject to nested
>> virtualization and is reflecting the vmcall to L1 Xen (see e.g.
>> xen/arch/x86/hvm/svm/nestedsvm.c, nestedsvm_check_intercepts() ).
>> How else would L1 Xen ever get a vmcall of one of it's guests?
> 
> Only if nested_hvm is enabled, I believe. Sorry for not being clearer
> earlier: I am talking about nested virtualization without nested
> vmx/svm, which is the default.

Same applies to the MSR topic above, I guess.

> But you have a good point there: if nested_hvm is enabled, there should
> still be a way for L1 Dom0 to issue an hypercall to L0 Xen, otherwise
> how could L1 Dom0 ever set up netfront and blkfront?

You would have to add a L1 hypercall to do this. And that was Jan's
point, I suppose.

Juergen
David Vrabel Jan. 13, 2016, 11:42 a.m. UTC | #21
On 12/01/16 12:07, Stefano Stabellini wrote:
> On Mon, 11 Jan 2016, David Vrabel wrote:
>> On 11/01/16 17:17, Andrew Cooper wrote:
>>> So from one point of view, sufficient justification for this change is
>>> "because the Linux way isn't the only valid way to do this".
>>
>> "Because we can" isn't a good justification for adding something new.
>> Particularly something that is trivially easy to (accidentally) misuse
>> and open a big security hole between userspace and kernel.
>>
>> The vague idea for a userspace netfront that's floating around
>> internally is also not a good reason for pushing this feature at this time.
> 
> I agree with David, but I might have another good use case for this.
> 
> Consider the following scenario: we have a Xen HVM guest, with Xen
> installed inside of it (nested virtualization). I'll refer to Xen
> running on the host as L0 Xen and Xen running inside the VM as L1 Xen.
> Similarly we have two dom0 running, the one with access to the physical
> hardware, L0 Dom0, and the one running inside the VM, L1 Dom0.
> 
> Let's suppose that we want to lay the groundwork for L1 Dom0 to use PV
> frontend drivers, netfront and blkfront to speed up execution.

This needs much more than just the ability for the L1 dom0 to make
hypercalls to the L0 hypervisor.

There's event channels, xenstore, grant tables etc. etc.

The dom0 kernel would have to gain an additional set of xenbus frontend
drivers and duplicates of much of drivers/xen/ in Linux that know to
connect to the L0 instead of L1.

Instead, use virtio which will (almost) just work right now.

David
Stefano Stabellini Jan. 13, 2016, 12:51 p.m. UTC | #22
On Wed, 13 Jan 2016, David Vrabel wrote:
> On 12/01/16 12:07, Stefano Stabellini wrote:
> > On Mon, 11 Jan 2016, David Vrabel wrote:
> >> On 11/01/16 17:17, Andrew Cooper wrote:
> >>> So from one point of view, sufficient justification for this change is
> >>> "because the Linux way isn't the only valid way to do this".
> >>
> >> "Because we can" isn't a good justification for adding something new.
> >> Particularly something that is trivially easy to (accidentally) misuse
> >> and open a big security hole between userspace and kernel.
> >>
> >> The vague idea for a userspace netfront that's floating around
> >> internally is also not a good reason for pushing this feature at this time.
> > 
> > I agree with David, but I might have another good use case for this.
> > 
> > Consider the following scenario: we have a Xen HVM guest, with Xen
> > installed inside of it (nested virtualization). I'll refer to Xen
> > running on the host as L0 Xen and Xen running inside the VM as L1 Xen.
> > Similarly we have two dom0 running, the one with access to the physical
> > hardware, L0 Dom0, and the one running inside the VM, L1 Dom0.
> > 
> > Let's suppose that we want to lay the groundwork for L1 Dom0 to use PV
> > frontend drivers, netfront and blkfront to speed up execution.
> 
> This needs much more than just the ability for the L1 dom0 to make
> hypercalls to the L0 hypervisor.
> 
> There's event channels, xenstore, grant tables etc. etc.
> 
> The dom0 kernel would have to gain an additional set of xenbus frontend
> drivers and duplicates of much of drivers/xen/ in Linux that know to
> connect to the L0 instead of L1.

Yes, that's correct.


> Instead, use virtio which will (almost) just work right now.

True, I thought about that too, but one doesn't always get to choose the
set of emulated hardware it gets. On most clouds or virtualization
products, virtio for Xen VMs is not an option, and it doesn't look like
it is going to be an option in the near future either.

That said, even if we add a HVMOP_set_hypercall_dpl hypercall, I agree
that the complexity of duplicating drivers/xen would probably not make
the whole thing worthwhile, at least in the Linux kernel.
Ian Campbell Jan. 14, 2016, 10:50 a.m. UTC | #23
On Mon, 2016-01-11 at 13:59 +0000, Andrew Cooper wrote:
> Arm folks: Is something like this sufficiently generic to be useful on
> Arm, perhaps with more generic naming?

ARM's HVC instruction is always invalid from userspace, i.e. it would
result in #UNDEF and there is no control bit to make it do otherwise, so I
think there isn't much point.

Ian.

Patch
diff mbox

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 21470ec..e5a08db 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -5228,7 +5228,8 @@  int hvm_do_hypercall(struct cpu_user_regs *regs)
     case 4:
     case 2:
         hvm_get_segment_register(curr, x86_seg_ss, &sreg);
-        if ( unlikely(sreg.attr.fields.dpl) )
+        if ( unlikely(sreg.attr.fields.dpl <
+                      currd->arch.hvm_domain.hypercall_dpl) )
         {
     default:
             regs->eax = -EPERM;
@@ -6839,6 +6840,28 @@  long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) arg)
         rc = do_altp2m_op(arg);
         break;
 
+    case HVMOP_set_hypercall_dpl:
+    {
+        xen_hvm_hypercall_dpl_t a;
+        struct domain *d;
+
+        if ( copy_from_guest(&a, arg, 1 ) )
+            return -EFAULT;
+
+        rc = rcu_lock_remote_domain_by_id(a.domid, &d);
+        if ( rc != 0 )
+            return rc;
+
+        if ( current->domain != d )
+            return -EPERM;
+
+        if ( !is_hvm_domain(d) || a.dpl > 3 )
+            return -EINVAL;
+
+        d->arch.hvm_domain.hypercall_dpl = a.dpl;
+        break;
+    }
+
     default:
     {
         gdprintk(XENLOG_DEBUG, "Bad HVM op %ld.\n", op);
diff --git a/xen/include/asm-x86/hvm/domain.h b/xen/include/asm-x86/hvm/domain.h
index a8cc2ad..006a142 100644
--- a/xen/include/asm-x86/hvm/domain.h
+++ b/xen/include/asm-x86/hvm/domain.h
@@ -137,6 +137,8 @@  struct hvm_domain {
     bool_t                 qemu_mapcache_invalidate;
     bool_t                 is_s3_suspended;
 
+    uint32_t               hypercall_dpl;
+
     /*
      * TSC value that VCPUs use to calculate their tsc_offset value.
      * Used during initialization and save/restore.
diff --git a/xen/include/public/hvm/hvm_op.h b/xen/include/public/hvm/hvm_op.h
index 1606185..f8247db 100644
--- a/xen/include/public/hvm/hvm_op.h
+++ b/xen/include/public/hvm/hvm_op.h
@@ -489,6 +489,14 @@  struct xen_hvm_altp2m_op {
 typedef struct xen_hvm_altp2m_op xen_hvm_altp2m_op_t;
 DEFINE_XEN_GUEST_HANDLE(xen_hvm_altp2m_op_t);
 
+#define HVMOP_set_hypercall_dpl 26
+struct xen_hvm_hypercall_dpl {
+    domid_t domid;
+    uint16_t dpl;  /* IN[1:0] cpl required to make hypercalls. */
+};
+typedef struct xen_hvm_hypercall_dpl xen_hvm_hypercall_dpl_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_hypercall_dpl_t);
+
 #endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */
 
 /*