
[1/6] x86emul: generalize wbinvd() hook

Message ID 3f30c73d-94a7-f9ca-5914-0400f1f98cc3@suse.com (mailing list archive)
State Superseded
Series x86emul: further work

Commit Message

Jan Beulich July 1, 2019, 11:56 a.m. UTC
The hook is already in use for other purposes, and emulating e.g.
CLFLUSH by issuing WBINVD is, well, not very nice. Rename the hook and
add parameters. Use lighter weight flushing insns when possible in
hvmemul_cache_op().

hvmemul_cache_op() treating x86emul_invd the same as x86emul_wbinvd is
to retain original behavior, but I'm not sure this is what we want in
the long run.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Use cache_op() as hook name. Convert macros to inline functions in
     system.h. Re-base.
---
I was unsure about PREFETCH* and CLDEMOTE - both are cache management
insns too, but the emulator currently treats them as a NOP without
invoking any hooks.
I was also uncertain about the new cache_flush_permitted() instance -
generally I think it wouldn't be too bad if we allowed line flushes in
all cases, in which case the checks in the ->wbinvd_intercept() handlers
would suffice (as they did until now).
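
For reference, the instruction selection hvmemul_cache_op() ends up performing can be summarised by the stand-alone sketch below. This is only an illustration: the has_* booleans are placeholders for Xen's cpu_has_* predicates, and the real code (see the full diff at the bottom of the page) calls the flushing insns rather than naming them.

/* Stand-alone sketch of the flushing-insn selection added by this patch. */
#include <stdbool.h>
#include <stdio.h>

enum x86emul_cache_op {
    x86emul_clflush,
    x86emul_clflushopt,
    x86emul_clwb,
    x86emul_invd,
    x86emul_wbinvd,
};

static bool has_clflush = true, has_clflushopt = false, has_clwb = false;

static const char *flush_insn(enum x86emul_cache_op op)
{
    if ( !has_clflush )
        return "wbinvd";      /* no line flush available: fall back to WBINVD */
    if ( op == x86emul_clwb && has_clwb )
        return "clwb";        /* write back without evicting the line */
    if ( op == x86emul_clflushopt && has_clflushopt )
        return "clflushopt";  /* weakly ordered line flush */
    return "clflush";         /* always-available line flush */
}

int main(void)
{
    printf("CLWB emulated via %s\n", flush_insn(x86emul_clwb));
    printf("CLFLUSHOPT emulated via %s\n", flush_insn(x86emul_clflushopt));
    printf("CLFLUSH emulated via %s\n", flush_insn(x86emul_clflush));
    return 0;
}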

Comments

Paul Durrant July 2, 2019, 10:22 a.m. UTC | #1
> -----Original Message-----
> From: Jan Beulich <JBeulich@suse.com>
> Sent: 01 July 2019 12:56
> To: xen-devel@lists.xenproject.org
> Cc: Andrew Cooper <Andrew.Cooper3@citrix.com>; Paul Durrant <Paul.Durrant@citrix.com>; Roger Pau Monne
> <roger.pau@citrix.com>; Wei Liu <wl@xen.org>
> Subject: [PATCH 1/6] x86emul: generalize wbinvd() hook
> 
> The hook is already in use for other purposes, and emulating e.g.
> CLFLUSH by issuing WBINVD is, well, not very nice. Rename the hook and
> add parameters. Use lighter weight flushing insns when possible in
> hvmemul_cache_op().
> 
> hvmemul_cache_op() treating x86emul_invd the same as x86emul_wbinvd is
> to retain original behavior, but I'm not sure this is what we want in
> the long run.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v2: Use cache_op() as hook name. Convert macros to inline functions in
>      system.h. Re-base.
> ---
> I was unsure about PREFETCH* and CLDEMOTE - both are cache management
> insns too, but the emulator currently treats them as a NOP without
> invoking any hooks.
> I was also uncertain about the new cache_flush_permitted() instance -
> generally I think it wouldn't be too bad if we allowed line flushes in
> all cases, in which case the checks in the ->wbinvd_intercept() handlers
> would suffice (as they did until now).
>
[snip]
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -25,6 +25,7 @@
>   #include <asm/hvm/trace.h>
>   #include <asm/hvm/support.h>
>   #include <asm/hvm/svm/svm.h>
> +#include <asm/iocap.h>
>   #include <asm/vm_event.h>
> 
>   static void hvmtrace_io_assist(const ioreq_t *p)
> @@ -555,16 +556,12 @@ static void *hvmemul_map_linear_addr(
>       mfn_t *mfn = &hvmemul_ctxt->mfn[0];
> 
>       /*
> -     * The caller has no legitimate reason for trying a zero-byte write, but
> -     * all other code here is written to work if the check below was dropped.
> -     *
> -     * The maximum write size depends on the number of adjacent mfns[] which
> +     * The maximum access size depends on the number of adjacent mfns[] which
>        * can be vmap()'d, accouting for possible misalignment within the region.
>        * The higher level emulation callers are responsible for ensuring that
> -     * mfns[] is large enough for the requested write size.
> +     * mfns[] is large enough for the requested access size.
>        */
> -    if ( bytes == 0 ||
> -         nr_frames > ARRAY_SIZE(hvmemul_ctxt->mfn) )
> +    if ( nr_frames > ARRAY_SIZE(hvmemul_ctxt->mfn) )
>       {
>           ASSERT_UNREACHABLE();
>           goto unhandleable;
> @@ -669,8 +666,6 @@ static void hvmemul_unmap_linear_addr(
>       unsigned int i;
>       mfn_t *mfn = &hvmemul_ctxt->mfn[0];
> 
> -    ASSERT(bytes > 0);
> -
>       if ( nr_frames == 1 )
>           unmap_domain_page(mapping);
>       else
> @@ -1473,7 +1468,10 @@ static int hvmemul_write_msr_discard(
>       return X86EMUL_OKAY;
>   }
> 
> -static int hvmemul_wbinvd_discard(
> +static int hvmemul_cache_op_discard(
> +    enum x86emul_cache_op op,
> +    enum x86_segment seg,
> +    unsigned long offset,
>       struct x86_emulate_ctxt *ctxt)
>   {
>       return X86EMUL_OKAY;
> @@ -2149,10 +2147,65 @@ static int hvmemul_write_msr(
>       return rc;
>   }
> 
> -static int hvmemul_wbinvd(
> +static int hvmemul_cache_op(
> +    enum x86emul_cache_op op,
> +    enum x86_segment seg,
> +    unsigned long offset,
>       struct x86_emulate_ctxt *ctxt)
>   {
> -    alternative_vcall(hvm_funcs.wbinvd_intercept);
> +    struct hvm_emulate_ctxt *hvmemul_ctxt =
> +        container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
> +    unsigned long addr, reps = 1;
> +    uint32_t pfec = PFEC_page_present;
> +    int rc;
> +    void *mapping;
> +
> +    if ( !cache_flush_permitted(current->domain) )
> +        return X86EMUL_OKAY;
> +
> +    switch ( op )
> +    {
> +    case x86emul_clflush:
> +    case x86emul_clflushopt:
> +    case x86emul_clwb:
> +        ASSERT(!is_x86_system_segment(seg));
> +
> +        rc = hvmemul_virtual_to_linear(seg, offset, 0, &reps,
> +                                       hvm_access_read, hvmemul_ctxt, &addr);
> +        if ( rc != X86EMUL_OKAY )
> +            break;
> +
> +        if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
> +            pfec |= PFEC_user_mode;
> +
> +        mapping = hvmemul_map_linear_addr(addr, 0, pfec, hvmemul_ctxt,
> +                                          current->arch.hvm.data_cache);
> +        if ( mapping == ERR_PTR(~X86EMUL_EXCEPTION) )
> +            return X86EMUL_EXCEPTION;
> +        if ( IS_ERR_OR_NULL(mapping) )
> +            break;
> +
> +        if ( cpu_has_clflush )
> +        {
> +            if ( op == x86emul_clwb && cpu_has_clwb )
> +                clwb(mapping);
> +            else if ( op == x86emul_clflushopt && cpu_has_clflushopt )
> +                clflushopt(mapping);
> +            else
> +                clflush(mapping);
> +
> +            hvmemul_unmap_linear_addr(mapping, addr, 0, hvmemul_ctxt);
> +            break;
> +        }
> +
> +        hvmemul_unmap_linear_addr(mapping, addr, 0, hvmemul_ctxt);

Since the mapping is ditched here, why bother getting one at all in the !cpu_has_clflush case? Are you trying to flush out an error condition that was previously missed?

  Paul
Jan Beulich July 2, 2019, 10:50 a.m. UTC | #2
On 02.07.2019 12:22, Paul Durrant wrote:
>> From: Jan Beulich <JBeulich@suse.com>
>> Sent: 01 July 2019 12:56
>>
>> @@ -2149,10 +2147,65 @@ static int hvmemul_write_msr(
>>        return rc;
>>    }
>>
>> -static int hvmemul_wbinvd(
>> +static int hvmemul_cache_op(
>> +    enum x86emul_cache_op op,
>> +    enum x86_segment seg,
>> +    unsigned long offset,
>>        struct x86_emulate_ctxt *ctxt)
>>    {
>> -    alternative_vcall(hvm_funcs.wbinvd_intercept);
>> +    struct hvm_emulate_ctxt *hvmemul_ctxt =
>> +        container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
>> +    unsigned long addr, reps = 1;
>> +    uint32_t pfec = PFEC_page_present;
>> +    int rc;
>> +    void *mapping;
>> +
>> +    if ( !cache_flush_permitted(current->domain) )
>> +        return X86EMUL_OKAY;
>> +
>> +    switch ( op )
>> +    {
>> +    case x86emul_clflush:
>> +    case x86emul_clflushopt:
>> +    case x86emul_clwb:
>> +        ASSERT(!is_x86_system_segment(seg));
>> +
>> +        rc = hvmemul_virtual_to_linear(seg, offset, 0, &reps,
>> +                                       hvm_access_read, hvmemul_ctxt, &addr);
>> +        if ( rc != X86EMUL_OKAY )
>> +            break;
>> +
>> +        if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
>> +            pfec |= PFEC_user_mode;
>> +
>> +        mapping = hvmemul_map_linear_addr(addr, 0, pfec, hvmemul_ctxt,
>> +                                          current->arch.hvm.data_cache);
>> +        if ( mapping == ERR_PTR(~X86EMUL_EXCEPTION) )
>> +            return X86EMUL_EXCEPTION;

This return ...

>> +        if ( IS_ERR_OR_NULL(mapping) )
>> +            break;
>> +
>> +        if ( cpu_has_clflush )
>> +        {
>> +            if ( op == x86emul_clwb && cpu_has_clwb )
>> +                clwb(mapping);
>> +            else if ( op == x86emul_clflushopt && cpu_has_clflushopt )
>> +                clflushopt(mapping);
>> +            else
>> +                clflush(mapping);
>> +
>> +            hvmemul_unmap_linear_addr(mapping, addr, 0, hvmemul_ctxt);
>> +            break;
>> +        }
>> +
>> +        hvmemul_unmap_linear_addr(mapping, addr, 0, hvmemul_ctxt);
> 
> Since the mapping is ditched here, why bother getting one at all in the
> !cpu_has_clflush case? Are you trying to flush out an error condition
> that was previously missed?

... is what I'm after: We want exception behavior to be correct.

Jan
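
For reference, a minimal stand-alone sketch of the point being made. The helpers below are hypothetical, not Xen APIs; only the control flow mirrors hvmemul_cache_op(): the linear-address walk is what surfaces the architectural fault, even when the eventual flush is a coarse WBINVD.

#include <stdio.h>

enum rc { RC_OKAY, RC_EXCEPTION };

/* Stand-in for the guest linear-address walk; address 0 "faults" here. */
static enum rc map_linear(unsigned long addr, void **mapping)
{
    static char line[64];

    if ( !addr )
        return RC_EXCEPTION;
    *mapping = line;
    return RC_OKAY;
}

static enum rc emulate_clflush(unsigned long addr)
{
    void *mapping;

    if ( map_linear(addr, &mapping) != RC_OKAY )
        return RC_EXCEPTION;    /* guest sees the fault, as on bare metal */

    /*
     * With CLFLUSH available, the line behind 'mapping' would be flushed
     * here; without it, the mapping is dropped again and a WBINVD
     * intercept does the flushing.  Either way the walk above has already
     * raised any exception the guest is owed.
     */
    return RC_OKAY;
}

int main(void)
{
    printf("faulting address -> %d\n", emulate_clflush(0));      /* 1 */
    printf("mappable address -> %d\n", emulate_clflush(0x1000)); /* 0 */
    return 0;
}
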
Paul Durrant July 2, 2019, 10:53 a.m. UTC | #3
> -----Original Message-----
> From: Jan Beulich <JBeulich@suse.com>
> Sent: 02 July 2019 11:50
> To: Paul Durrant <Paul.Durrant@citrix.com>; xen-devel@lists.xenproject.org
> Cc: Andrew Cooper <Andrew.Cooper3@citrix.com>; Roger Pau Monne <roger.pau@citrix.com>; Wei Liu
> <wl@xen.org>
> Subject: Re: [PATCH 1/6] x86emul: generalize wbinvd() hook
> 
> On 02.07.2019 12:22, Paul Durrant wrote:
> >> From: Jan Beulich <JBeulich@suse.com>
> >> Sent: 01 July 2019 12:56
> >>
> >> @@ -2149,10 +2147,65 @@ static int hvmemul_write_msr(
> >>        return rc;
> >>    }
> >>
> >> -static int hvmemul_wbinvd(
> >> +static int hvmemul_cache_op(
> >> +    enum x86emul_cache_op op,
> >> +    enum x86_segment seg,
> >> +    unsigned long offset,
> >>        struct x86_emulate_ctxt *ctxt)
> >>    {
> >> -    alternative_vcall(hvm_funcs.wbinvd_intercept);
> >> +    struct hvm_emulate_ctxt *hvmemul_ctxt =
> >> +        container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
> >> +    unsigned long addr, reps = 1;
> >> +    uint32_t pfec = PFEC_page_present;
> >> +    int rc;
> >> +    void *mapping;
> >> +
> >> +    if ( !cache_flush_permitted(current->domain) )
> >> +        return X86EMUL_OKAY;
> >> +
> >> +    switch ( op )
> >> +    {
> >> +    case x86emul_clflush:
> >> +    case x86emul_clflushopt:
> >> +    case x86emul_clwb:
> >> +        ASSERT(!is_x86_system_segment(seg));
> >> +
> >> +        rc = hvmemul_virtual_to_linear(seg, offset, 0, &reps,
> >> +                                       hvm_access_read, hvmemul_ctxt, &addr);
> >> +        if ( rc != X86EMUL_OKAY )
> >> +            break;
> >> +
> >> +        if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
> >> +            pfec |= PFEC_user_mode;
> >> +
> >> +        mapping = hvmemul_map_linear_addr(addr, 0, pfec, hvmemul_ctxt,
> >> +                                          current->arch.hvm.data_cache);
> >> +        if ( mapping == ERR_PTR(~X86EMUL_EXCEPTION) )
> >> +            return X86EMUL_EXCEPTION;
> 
> This return ...
> 
> >> +        if ( IS_ERR_OR_NULL(mapping) )
> >> +            break;
> >> +
> >> +        if ( cpu_has_clflush )
> >> +        {
> >> +            if ( op == x86emul_clwb && cpu_has_clwb )
> >> +                clwb(mapping);
> >> +            else if ( op == x86emul_clflushopt && cpu_has_clflushopt )
> >> +                clflushopt(mapping);
> >> +            else
> >> +                clflush(mapping);
> >> +
> >> +            hvmemul_unmap_linear_addr(mapping, addr, 0, hvmemul_ctxt);
> >> +            break;
> >> +        }
> >> +
> >> +        hvmemul_unmap_linear_addr(mapping, addr, 0, hvmemul_ctxt);
> >
> > Since the mapping is ditched here, why bother getting one at all in the
> > !cpu_has_clflush case? Are you trying to flush out an error condition
> > that was previously missed?
> 
> ... is what I'm after: We want exception behavior to be correct.
> 

Ok, fair enough. Just wasn't obvious to me from the commit comment.

  Paul
Andrew Cooper Aug. 27, 2019, 10:44 a.m. UTC | #4
On 01/07/2019 12:56, Jan Beulich wrote:
> The hook is already in use for other purposes, and emulating e.g.
> CLFLUSH by issuing WBINVD is, well, not very nice. Rename the hook and
> add parameters. Use lighter weight flushing insns when possible in
> hvmemul_cache_op().
>
> hvmemul_cache_op() treating x86emul_invd the same as x86emul_wbinvd is
> to retain original behavior, but I'm not sure this is what we want in
> the long run.

There is no use for INVD in a VM, as a VM never runs before the memory
controllers have been configured.  The only place it may be found is at
the reset vector of firmware which doesn't start in a
virtualisation-aware way.

Given how big a hammer WBINVD is, I reckon we should just nop it out.

>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

> ---
> v2: Use cache_op() as hook name. Convert macros to inline functions in
>      system.h. Re-base.
> ---
> I was unsure about PREFETCH* and CLDEMOTE - both are cache management
> insns too, but the emulator currently treats them as a NOP without
> invoking any hooks.

They are just hints.  Nothing architecturally may depend on them having
any effect.  CLDEMOTE in particular is for reducing cache coherency
overhead on a producer/consumer setup, but any potential optimisation is
dwarfed by the VMExit.

> I was also uncertain about the new cache_flush_permitted() instance -
> generally I think it wouldn't be too bad if we allowed line flushes in
> all cases, in which case the checks in the ->wbinvd_intercept() handlers
> would suffice (as they did until now).

This is a more general issue which we need to address.  To support
encrypted memory in VMs, we must guarantee that WC mappings which the
guest creates are really WC, which means we must not use IPAT or play
any "fall back to WB" games.

Furthermore, AMD's encrypt-in-place algorithm requires the guest to be
able to use WBINVD.

Fixing this properly will be a good thing, and will also fix the fact
that, at the moment, any time you give a PCI device to a guest its
blk/net performance becomes glacial, due to the grant table being
uncached.

> +
> +        if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
> +            pfec |= PFEC_user_mode;

As a note, this is fine here, but this whole system of choosing pfec
needs to be adjusted when we add support for WRUSS, which is a CPL0
instruction that executes as a user-mode write, for adjusting the user
shadow stack.

~Andrew
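
For reference, a minimal sketch of the kind of adjustment being hinted at. The helper and its extra parameter are hypothetical; today the user-mode bit is inferred purely from SS.DPL, while a WRUSS-style access would need the instruction itself to force it.

#include <stdbool.h>
#include <stdio.h>

/* Architectural page-fault error-code bits. */
#define PFEC_page_present  (1u << 0)
#define PFEC_user_mode     (1u << 2)

/* 'forced_user' is the hypothetical per-instruction override. */
static unsigned int choose_pfec(unsigned int ss_dpl, bool forced_user)
{
    unsigned int pfec = PFEC_page_present;

    /* Today: the user-mode bit is inferred from the guest's SS.DPL. */
    if ( ss_dpl == 3 )
        pfec |= PFEC_user_mode;

    /* A WRUSS-style insn runs at CPL0 but performs a user-mode access,
     * so it would have to set the bit regardless of SS.DPL. */
    if ( forced_user )
        pfec |= PFEC_user_mode;

    return pfec;
}

int main(void)
{
    printf("ring3 CLFLUSH: pfec=%#x\n", choose_pfec(3, false));
    printf("ring0 WRUSS:   pfec=%#x\n", choose_pfec(0, true));
    return 0;
}
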
Andrew Cooper Aug. 27, 2019, 12:51 p.m. UTC | #5
On 27/08/2019 11:44, Andrew Cooper wrote:
>> I was also uncertain about the new cache_flush_permitted() instance -
>> generally I think it wouldn't be too bad if we allowed line flushes in
>> all cases, in which case the checks in the ->wbinvd_intercept() handlers
>> would suffice (as they did until now).
> This is a more general issue which we need to address.  To support
> encrypted memory in VMs, we must guarantee that WC mappings which the
> guest creates are really WC, which means we must not use IPAT or play
> any "fall back to WB" games.
>
> Furthermore, AMD's encrypt-in-place algorithm requires the guest to be
> able to use WBINVD.

Apologies.  AMD's algorithm requires aliased WP and WB mappings, not WC.

~Andrew
Andrew Cooper Aug. 27, 2019, 6:47 p.m. UTC | #6
On 27/08/2019 11:44, Andrew Cooper wrote:
>> I was also uncertain about the new cache_flush_permitted() instance -
>> generally I think it wouldn't be too bad if we allowed line flushes in
>> all cases, in which case the checks in the ->wbinvd_intercept() handlers
>> would suffice (as they did until now).
> This is a more general issue which we need to address.  To support
> encrypted memory in VMs, we must guarantee that WC mappings which the
> guest creates are really WC, which means we must not use IPAT or play
> any "fall back to WB" games.
>
> Furthermore, AMD's encrypt-in-place algorithm requires the guest to be
> able to use WBINVD.

Based on some feedback, the encrypt-in-place algorithm is only for
native situations trying to use SME, where the loading kernel/hypervisor
needs to encrypt itself.

SEV guests run encrypted from the start, and have no need to
encrypt/decrypt in place.  The expected way is to copy between an
encrypted and non-encrypted buffer in separate GFNs.

The WBINVD is to ensure that dirty cache lines aren't corrupted by the
WP mapping, and CLFLUSH (or any equivalent full eviction) can be
substituted, but for the intended use case, a single WBINVD is the
optimum strategy.

~Andrew
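
For reference, the line-by-line alternative mentioned above looks roughly like the sketch below (a 64-byte line size is assumed; real code would read it from CPUID). Having to walk a large region this way is exactly why a single WBINVD can be the cheaper option.

#include <stddef.h>
#include <stdint.h>

#define CACHE_LINE 64  /* assumed; real code reads the line size from CPUID */

static void flush_region(const void *buf, size_t len)
{
    uintptr_t p = (uintptr_t)buf & ~(uintptr_t)(CACHE_LINE - 1);
    uintptr_t end = (uintptr_t)buf + len;

    /* Evict the region from the cache hierarchy one line at a time. */
    for ( ; p < end; p += CACHE_LINE )
        asm volatile ( "clflush (%0)" :: "r" (p) : "memory" );

    /* Order the flushes against whatever the caller does next. */
    asm volatile ( "mfence" ::: "memory" );
}

int main(void)
{
    static char buf[4096];

    flush_region(buf, sizeof(buf));
    return 0;
}
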
Jan Beulich Sept. 2, 2019, 11:10 a.m. UTC | #7
On 01.07.2019 13:55, Jan Beulich wrote:
> The hook is already in use for other purposes, and emulating e.g.
> CLFLUSH by issuing WBINVD is, well, not very nice. Rename the hook and
> add parameters. Use lighter weight flushing insns when possible in
> hvmemul_cache_op().
> 
> hvmemul_cache_op() treating x86emul_invd the same as x86emul_wbinvd is
> to retain original behavior, but I'm not sure this is what we want in
> the long run.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Paul,

any chance I could get your ack (or otherwise) here? I thought I did
answer the one question you had raised to your satisfaction.

Thanks, Jan

> ---
> v2: Use cache_op() as hook name. Convert macros to inline functions in
>      system.h. Re-base.
> ---
> I was unsure about PREFETCH* and CLDEMOTE - both are cache management
> insns too, but the emulator currently treats them as a NOP without
> invoking any hooks.
> I was also uncertain about the new cache_flush_permitted() instance -
> generally I think it wouldn't be too bad if we allowed line flushes in
> all cases, in which case the checks in the ->wbinvd_intercept() handlers
> would suffice (as they did until now).
> 
> --- a/tools/fuzz/x86_instruction_emulator/fuzz-emul.c
> +++ b/tools/fuzz/x86_instruction_emulator/fuzz-emul.c
> @@ -382,10 +382,13 @@ static int fuzz_invlpg(
>       return maybe_fail(ctxt, "invlpg", false);
>   }
>   
> -static int fuzz_wbinvd(
> +static int fuzz_cache_op(
> +    enum x86emul_cache_op op,
> +    enum x86_segment seg,
> +    unsigned long offset,
>       struct x86_emulate_ctxt *ctxt)
>   {
> -    return maybe_fail(ctxt, "wbinvd", true);
> +    return maybe_fail(ctxt, "cache-management", true);
>   }
>   
>   static int fuzz_write_io(
> @@ -620,7 +623,7 @@ static const struct x86_emulate_ops all_
>       SET(read_xcr),
>       SET(read_msr),
>       SET(write_msr),
> -    SET(wbinvd),
> +    SET(cache_op),
>       SET(invlpg),
>       .get_fpu    = emul_test_get_fpu,
>       .put_fpu    = emul_test_put_fpu,
> @@ -729,7 +732,7 @@ enum {
>       HOOK_read_xcr,
>       HOOK_read_msr,
>       HOOK_write_msr,
> -    HOOK_wbinvd,
> +    HOOK_cache_op,
>       HOOK_cpuid,
>       HOOK_inject_hw_exception,
>       HOOK_inject_sw_interrupt,
> @@ -773,7 +776,7 @@ static void disable_hooks(struct x86_emu
>       MAYBE_DISABLE_HOOK(read_xcr);
>       MAYBE_DISABLE_HOOK(read_msr);
>       MAYBE_DISABLE_HOOK(write_msr);
> -    MAYBE_DISABLE_HOOK(wbinvd);
> +    MAYBE_DISABLE_HOOK(cache_op);
>       MAYBE_DISABLE_HOOK(cpuid);
>       MAYBE_DISABLE_HOOK(get_fpu);
>       MAYBE_DISABLE_HOOK(invlpg);
> --- a/xen/arch/x86/Rules.mk
> +++ b/xen/arch/x86/Rules.mk
> @@ -19,7 +19,9 @@ $(call as-option-add,CFLAGS,CC,"crc32 %e
>   $(call as-option-add,CFLAGS,CC,"invept (%rax)$$(comma)%rax",-DHAVE_AS_EPT)
>   $(call as-option-add,CFLAGS,CC,"rdrand %eax",-DHAVE_AS_RDRAND)
>   $(call as-option-add,CFLAGS,CC,"rdfsbase %rax",-DHAVE_AS_FSGSBASE)
> +$(call as-option-add,CFLAGS,CC,"xsaveopt (%rax)",-DHAVE_AS_XSAVEOPT)
>   $(call as-option-add,CFLAGS,CC,"rdseed %eax",-DHAVE_AS_RDSEED)
> +$(call as-option-add,CFLAGS,CC,"clwb (%rax)",-DHAVE_AS_CLWB)
>   $(call as-option-add,CFLAGS,CC,".equ \"x\"$$(comma)1", \
>                        -U__OBJECT_LABEL__ -DHAVE_AS_QUOTED_SYM \
>                        '-D__OBJECT_LABEL__=$(subst $(BASEDIR)/,,$(CURDIR))/$$@')
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -25,6 +25,7 @@
>   #include <asm/hvm/trace.h>
>   #include <asm/hvm/support.h>
>   #include <asm/hvm/svm/svm.h>
> +#include <asm/iocap.h>
>   #include <asm/vm_event.h>
>   
>   static void hvmtrace_io_assist(const ioreq_t *p)
> @@ -555,16 +556,12 @@ static void *hvmemul_map_linear_addr(
>       mfn_t *mfn = &hvmemul_ctxt->mfn[0];
>   
>       /*
> -     * The caller has no legitimate reason for trying a zero-byte write, but
> -     * all other code here is written to work if the check below was dropped.
> -     *
> -     * The maximum write size depends on the number of adjacent mfns[] which
> +     * The maximum access size depends on the number of adjacent mfns[] which
>        * can be vmap()'d, accouting for possible misalignment within the region.
>        * The higher level emulation callers are responsible for ensuring that
> -     * mfns[] is large enough for the requested write size.
> +     * mfns[] is large enough for the requested access size.
>        */
> -    if ( bytes == 0 ||
> -         nr_frames > ARRAY_SIZE(hvmemul_ctxt->mfn) )
> +    if ( nr_frames > ARRAY_SIZE(hvmemul_ctxt->mfn) )
>       {
>           ASSERT_UNREACHABLE();
>           goto unhandleable;
> @@ -669,8 +666,6 @@ static void hvmemul_unmap_linear_addr(
>       unsigned int i;
>       mfn_t *mfn = &hvmemul_ctxt->mfn[0];
>   
> -    ASSERT(bytes > 0);
> -
>       if ( nr_frames == 1 )
>           unmap_domain_page(mapping);
>       else
> @@ -1473,7 +1468,10 @@ static int hvmemul_write_msr_discard(
>       return X86EMUL_OKAY;
>   }
>   
> -static int hvmemul_wbinvd_discard(
> +static int hvmemul_cache_op_discard(
> +    enum x86emul_cache_op op,
> +    enum x86_segment seg,
> +    unsigned long offset,
>       struct x86_emulate_ctxt *ctxt)
>   {
>       return X86EMUL_OKAY;
> @@ -2149,10 +2147,65 @@ static int hvmemul_write_msr(
>       return rc;
>   }
>   
> -static int hvmemul_wbinvd(
> +static int hvmemul_cache_op(
> +    enum x86emul_cache_op op,
> +    enum x86_segment seg,
> +    unsigned long offset,
>       struct x86_emulate_ctxt *ctxt)
>   {
> -    alternative_vcall(hvm_funcs.wbinvd_intercept);
> +    struct hvm_emulate_ctxt *hvmemul_ctxt =
> +        container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
> +    unsigned long addr, reps = 1;
> +    uint32_t pfec = PFEC_page_present;
> +    int rc;
> +    void *mapping;
> +
> +    if ( !cache_flush_permitted(current->domain) )
> +        return X86EMUL_OKAY;
> +
> +    switch ( op )
> +    {
> +    case x86emul_clflush:
> +    case x86emul_clflushopt:
> +    case x86emul_clwb:
> +        ASSERT(!is_x86_system_segment(seg));
> +
> +        rc = hvmemul_virtual_to_linear(seg, offset, 0, &reps,
> +                                       hvm_access_read, hvmemul_ctxt, &addr);
> +        if ( rc != X86EMUL_OKAY )
> +            break;
> +
> +        if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
> +            pfec |= PFEC_user_mode;
> +
> +        mapping = hvmemul_map_linear_addr(addr, 0, pfec, hvmemul_ctxt,
> +                                          current->arch.hvm.data_cache);
> +        if ( mapping == ERR_PTR(~X86EMUL_EXCEPTION) )
> +            return X86EMUL_EXCEPTION;
> +        if ( IS_ERR_OR_NULL(mapping) )
> +            break;
> +
> +        if ( cpu_has_clflush )
> +        {
> +            if ( op == x86emul_clwb && cpu_has_clwb )
> +                clwb(mapping);
> +            else if ( op == x86emul_clflushopt && cpu_has_clflushopt )
> +                clflushopt(mapping);
> +            else
> +                clflush(mapping);
> +
> +            hvmemul_unmap_linear_addr(mapping, addr, 0, hvmemul_ctxt);
> +            break;
> +        }
> +
> +        hvmemul_unmap_linear_addr(mapping, addr, 0, hvmemul_ctxt);
> +        /* fall through */
> +    case x86emul_invd:
> +    case x86emul_wbinvd:
> +        alternative_vcall(hvm_funcs.wbinvd_intercept);
> +        break;
> +    }
> +
>       return X86EMUL_OKAY;
>   }
>   
> @@ -2353,7 +2406,7 @@ static const struct x86_emulate_ops hvm_
>       .write_xcr     = hvmemul_write_xcr,
>       .read_msr      = hvmemul_read_msr,
>       .write_msr     = hvmemul_write_msr,
> -    .wbinvd        = hvmemul_wbinvd,
> +    .cache_op      = hvmemul_cache_op,
>       .cpuid         = x86emul_cpuid,
>       .get_fpu       = hvmemul_get_fpu,
>       .put_fpu       = hvmemul_put_fpu,
> @@ -2380,7 +2433,7 @@ static const struct x86_emulate_ops hvm_
>       .write_xcr     = hvmemul_write_xcr,
>       .read_msr      = hvmemul_read_msr,
>       .write_msr     = hvmemul_write_msr_discard,
> -    .wbinvd        = hvmemul_wbinvd_discard,
> +    .cache_op      = hvmemul_cache_op_discard,
>       .cpuid         = x86emul_cpuid,
>       .get_fpu       = hvmemul_get_fpu,
>       .put_fpu       = hvmemul_put_fpu,
> --- a/xen/arch/x86/pv/emul-priv-op.c
> +++ b/xen/arch/x86/pv/emul-priv-op.c
> @@ -1118,9 +1118,11 @@ static int write_msr(unsigned int reg, u
>       return X86EMUL_UNHANDLEABLE;
>   }
>   
> -/* Name it differently to avoid clashing with wbinvd() */
> -static int _wbinvd(struct x86_emulate_ctxt *ctxt)
> +static int cache_op(enum x86emul_cache_op op, enum x86_segment seg,
> +                    unsigned long offset, struct x86_emulate_ctxt *ctxt)
>   {
> +    ASSERT(op == x86emul_wbinvd);
> +
>       /* Ignore the instruction if unprivileged. */
>       if ( !cache_flush_permitted(current->domain) )
>           /*
> @@ -1238,7 +1240,7 @@ static const struct x86_emulate_ops priv
>       .read_msr            = read_msr,
>       .write_msr           = write_msr,
>       .cpuid               = x86emul_cpuid,
> -    .wbinvd              = _wbinvd,
> +    .cache_op            = cache_op,
>   };
>   
>   int pv_emulate_privileged_op(struct cpu_user_regs *regs)
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -5933,8 +5933,11 @@ x86_emulate(
>       case X86EMUL_OPC(0x0f, 0x08): /* invd */
>       case X86EMUL_OPC(0x0f, 0x09): /* wbinvd */
>           generate_exception_if(!mode_ring0(), EXC_GP, 0);
> -        fail_if(ops->wbinvd == NULL);
> -        if ( (rc = ops->wbinvd(ctxt)) != 0 )
> +        fail_if(!ops->cache_op);
> +        if ( (rc = ops->cache_op(b == 0x09 ? x86emul_wbinvd
> +                                           : x86emul_invd,
> +                                 x86_seg_none, 0,
> +                                 ctxt)) != X86EMUL_OKAY )
>               goto done;
>           break;
>   
> @@ -7801,8 +7804,9 @@ x86_emulate(
>               /* else clwb */
>               fail_if(!vex.pfx);
>               vcpu_must_have(clwb);
> -            fail_if(!ops->wbinvd);
> -            if ( (rc = ops->wbinvd(ctxt)) != X86EMUL_OKAY )
> +            fail_if(!ops->cache_op);
> +            if ( (rc = ops->cache_op(x86emul_clwb, ea.mem.seg, ea.mem.off,
> +                                     ctxt)) != X86EMUL_OKAY )
>                   goto done;
>               break;
>           case 7:
> @@ -7818,8 +7822,11 @@ x86_emulate(
>                   vcpu_must_have(clflush);
>               else
>                   vcpu_must_have(clflushopt);
> -            fail_if(ops->wbinvd == NULL);
> -            if ( (rc = ops->wbinvd(ctxt)) != 0 )
> +            fail_if(!ops->cache_op);
> +            if ( (rc = ops->cache_op(vex.pfx ? x86emul_clflushopt
> +                                             : x86emul_clflush,
> +                                     ea.mem.seg, ea.mem.off,
> +                                     ctxt)) != X86EMUL_OKAY )
>                   goto done;
>               break;
>           default:
> --- a/xen/arch/x86/x86_emulate/x86_emulate.h
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.h
> @@ -176,6 +176,14 @@ enum x86_emulate_fpu_type {
>       X86EMUL_FPU_none
>   };
>   
> +enum x86emul_cache_op {
> +    x86emul_clflush,
> +    x86emul_clflushopt,
> +    x86emul_clwb,
> +    x86emul_invd,
> +    x86emul_wbinvd,
> +};
> +
>   struct x86_emulate_state;
>   
>   /*
> @@ -452,8 +460,15 @@ struct x86_emulate_ops
>           uint64_t val,
>           struct x86_emulate_ctxt *ctxt);
>   
> -    /* wbinvd: Write-back and invalidate cache contents. */
> -    int (*wbinvd)(
> +    /*
> +     * cache_op: Write-back and/or invalidate cache contents.
> +     *
> +     * @seg:@offset applicable only to some of enum x86emul_cache_op.
> +     */
> +    int (*cache_op)(
> +        enum x86emul_cache_op op,
> +        enum x86_segment seg,
> +        unsigned long offset,
>           struct x86_emulate_ctxt *ctxt);
>   
>       /* cpuid: Emulate CPUID via given set of EAX-EDX inputs/outputs. */
> --- a/xen/include/asm-x86/cpufeature.h
> +++ b/xen/include/asm-x86/cpufeature.h
> @@ -102,6 +102,8 @@
>   #define cpu_has_rdseed          boot_cpu_has(X86_FEATURE_RDSEED)
>   #define cpu_has_smap            boot_cpu_has(X86_FEATURE_SMAP)
>   #define cpu_has_avx512_ifma     boot_cpu_has(X86_FEATURE_AVX512_IFMA)
> +#define cpu_has_clflushopt      boot_cpu_has(X86_FEATURE_CLFLUSHOPT)
> +#define cpu_has_clwb            boot_cpu_has(X86_FEATURE_CLWB)
>   #define cpu_has_avx512er        boot_cpu_has(X86_FEATURE_AVX512ER)
>   #define cpu_has_avx512cd        boot_cpu_has(X86_FEATURE_AVX512CD)
>   #define cpu_has_sha             boot_cpu_has(X86_FEATURE_SHA)
> --- a/xen/include/asm-x86/system.h
> +++ b/xen/include/asm-x86/system.h
> @@ -21,6 +21,23 @@ static inline void clflush(const void *p
>       asm volatile ( "clflush %0" :: "m" (*(const char *)p) );
>   }
>   
> +static inline void clflushopt(const void *p)
> +{
> +    asm volatile ( "data16 clflush %0" :: "m" (*(const char *)p) );
> +}
> +
> +static inline void clwb(const void *p)
> +{
> +#if defined(HAVE_AS_CLWB)
> +    asm volatile ( "clwb %0" :: "m" (*(const char *)p) );
> +#elif defined(HAVE_AS_XSAVEOPT)
> +    asm volatile ( "data16 xsaveopt %0" :: "m" (*(const char *)p) );
> +#else
> +    asm volatile ( ".byte 0x66, 0x0f, 0xae, 0x32"
> +                   :: "d" (p), "m" (*(const char *)p) );
> +#endif
> +}
> +
>   #define xchg(ptr,v) \
>       ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
>   
>
Paul Durrant Sept. 2, 2019, 12:04 p.m. UTC | #8
> -----Original Message-----
> From: Jan Beulich <jbeulich@suse.com>
> Sent: 02 September 2019 12:10
> To: Paul Durrant <Paul.Durrant@citrix.com>
> Cc: xen-devel@lists.xenproject.org; Andrew Cooper <Andrew.Cooper3@citrix.com>; Roger Pau Monne
> <roger.pau@citrix.com>; Wei Liu <wl@xen.org>
> Subject: Ping: [PATCH 1/6] x86emul: generalize wbinvd() hook
> 
> On 01.07.2019 13:55, Jan Beulich wrote:
> > The hook is already in use for other purposes, and emulating e.g.
> > CLFLUSH by issuing WBINVD is, well, not very nice. Rename the hook and
> > add parameters. Use lighter weight flushing insns when possible in
> > hvmemul_cache_op().
> >
> > hvmemul_cache_op() treating x86emul_invd the same as x86emul_wbinvd is
> > to retain original behavior, but I'm not sure this is what we want in
> > the long run.
> >
> > Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> Paul,
> 
> any chance I could get your ack (or otherwise) here? I thought I did
> answer the one question you had raised to your satisfaction.
> 

Yes, you did. Sorry for not acking earlier.

Acked-by: Paul Durrant <paul.durrant@citrix.com>

Patch

--- a/tools/fuzz/x86_instruction_emulator/fuzz-emul.c
+++ b/tools/fuzz/x86_instruction_emulator/fuzz-emul.c
@@ -382,10 +382,13 @@  static int fuzz_invlpg(
      return maybe_fail(ctxt, "invlpg", false);
  }
  
-static int fuzz_wbinvd(
+static int fuzz_cache_op(
+    enum x86emul_cache_op op,
+    enum x86_segment seg,
+    unsigned long offset,
      struct x86_emulate_ctxt *ctxt)
  {
-    return maybe_fail(ctxt, "wbinvd", true);
+    return maybe_fail(ctxt, "cache-management", true);
  }
  
  static int fuzz_write_io(
@@ -620,7 +623,7 @@  static const struct x86_emulate_ops all_
      SET(read_xcr),
      SET(read_msr),
      SET(write_msr),
-    SET(wbinvd),
+    SET(cache_op),
      SET(invlpg),
      .get_fpu    = emul_test_get_fpu,
      .put_fpu    = emul_test_put_fpu,
@@ -729,7 +732,7 @@  enum {
      HOOK_read_xcr,
      HOOK_read_msr,
      HOOK_write_msr,
-    HOOK_wbinvd,
+    HOOK_cache_op,
      HOOK_cpuid,
      HOOK_inject_hw_exception,
      HOOK_inject_sw_interrupt,
@@ -773,7 +776,7 @@  static void disable_hooks(struct x86_emu
      MAYBE_DISABLE_HOOK(read_xcr);
      MAYBE_DISABLE_HOOK(read_msr);
      MAYBE_DISABLE_HOOK(write_msr);
-    MAYBE_DISABLE_HOOK(wbinvd);
+    MAYBE_DISABLE_HOOK(cache_op);
      MAYBE_DISABLE_HOOK(cpuid);
      MAYBE_DISABLE_HOOK(get_fpu);
      MAYBE_DISABLE_HOOK(invlpg);
--- a/xen/arch/x86/Rules.mk
+++ b/xen/arch/x86/Rules.mk
@@ -19,7 +19,9 @@  $(call as-option-add,CFLAGS,CC,"crc32 %e
  $(call as-option-add,CFLAGS,CC,"invept (%rax)$$(comma)%rax",-DHAVE_AS_EPT)
  $(call as-option-add,CFLAGS,CC,"rdrand %eax",-DHAVE_AS_RDRAND)
  $(call as-option-add,CFLAGS,CC,"rdfsbase %rax",-DHAVE_AS_FSGSBASE)
+$(call as-option-add,CFLAGS,CC,"xsaveopt (%rax)",-DHAVE_AS_XSAVEOPT)
  $(call as-option-add,CFLAGS,CC,"rdseed %eax",-DHAVE_AS_RDSEED)
+$(call as-option-add,CFLAGS,CC,"clwb (%rax)",-DHAVE_AS_CLWB)
  $(call as-option-add,CFLAGS,CC,".equ \"x\"$$(comma)1", \
                       -U__OBJECT_LABEL__ -DHAVE_AS_QUOTED_SYM \
                       '-D__OBJECT_LABEL__=$(subst $(BASEDIR)/,,$(CURDIR))/$$@')
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -25,6 +25,7 @@ 
  #include <asm/hvm/trace.h>
  #include <asm/hvm/support.h>
  #include <asm/hvm/svm/svm.h>
+#include <asm/iocap.h>
  #include <asm/vm_event.h>
  
  static void hvmtrace_io_assist(const ioreq_t *p)
@@ -555,16 +556,12 @@  static void *hvmemul_map_linear_addr(
      mfn_t *mfn = &hvmemul_ctxt->mfn[0];
  
      /*
-     * The caller has no legitimate reason for trying a zero-byte write, but
-     * all other code here is written to work if the check below was dropped.
-     *
-     * The maximum write size depends on the number of adjacent mfns[] which
+     * The maximum access size depends on the number of adjacent mfns[] which
       * can be vmap()'d, accouting for possible misalignment within the region.
       * The higher level emulation callers are responsible for ensuring that
-     * mfns[] is large enough for the requested write size.
+     * mfns[] is large enough for the requested access size.
       */
-    if ( bytes == 0 ||
-         nr_frames > ARRAY_SIZE(hvmemul_ctxt->mfn) )
+    if ( nr_frames > ARRAY_SIZE(hvmemul_ctxt->mfn) )
      {
          ASSERT_UNREACHABLE();
          goto unhandleable;
@@ -669,8 +666,6 @@  static void hvmemul_unmap_linear_addr(
      unsigned int i;
      mfn_t *mfn = &hvmemul_ctxt->mfn[0];
  
-    ASSERT(bytes > 0);
-
      if ( nr_frames == 1 )
          unmap_domain_page(mapping);
      else
@@ -1473,7 +1468,10 @@  static int hvmemul_write_msr_discard(
      return X86EMUL_OKAY;
  }
  
-static int hvmemul_wbinvd_discard(
+static int hvmemul_cache_op_discard(
+    enum x86emul_cache_op op,
+    enum x86_segment seg,
+    unsigned long offset,
      struct x86_emulate_ctxt *ctxt)
  {
      return X86EMUL_OKAY;
@@ -2149,10 +2147,65 @@  static int hvmemul_write_msr(
      return rc;
  }
  
-static int hvmemul_wbinvd(
+static int hvmemul_cache_op(
+    enum x86emul_cache_op op,
+    enum x86_segment seg,
+    unsigned long offset,
      struct x86_emulate_ctxt *ctxt)
  {
-    alternative_vcall(hvm_funcs.wbinvd_intercept);
+    struct hvm_emulate_ctxt *hvmemul_ctxt =
+        container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
+    unsigned long addr, reps = 1;
+    uint32_t pfec = PFEC_page_present;
+    int rc;
+    void *mapping;
+
+    if ( !cache_flush_permitted(current->domain) )
+        return X86EMUL_OKAY;
+
+    switch ( op )
+    {
+    case x86emul_clflush:
+    case x86emul_clflushopt:
+    case x86emul_clwb:
+        ASSERT(!is_x86_system_segment(seg));
+
+        rc = hvmemul_virtual_to_linear(seg, offset, 0, &reps,
+                                       hvm_access_read, hvmemul_ctxt, &addr);
+        if ( rc != X86EMUL_OKAY )
+            break;
+
+        if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
+            pfec |= PFEC_user_mode;
+
+        mapping = hvmemul_map_linear_addr(addr, 0, pfec, hvmemul_ctxt,
+                                          current->arch.hvm.data_cache);
+        if ( mapping == ERR_PTR(~X86EMUL_EXCEPTION) )
+            return X86EMUL_EXCEPTION;
+        if ( IS_ERR_OR_NULL(mapping) )
+            break;
+
+        if ( cpu_has_clflush )
+        {
+            if ( op == x86emul_clwb && cpu_has_clwb )
+                clwb(mapping);
+            else if ( op == x86emul_clflushopt && cpu_has_clflushopt )
+                clflushopt(mapping);
+            else
+                clflush(mapping);
+
+            hvmemul_unmap_linear_addr(mapping, addr, 0, hvmemul_ctxt);
+            break;
+        }
+
+        hvmemul_unmap_linear_addr(mapping, addr, 0, hvmemul_ctxt);
+        /* fall through */
+    case x86emul_invd:
+    case x86emul_wbinvd:
+        alternative_vcall(hvm_funcs.wbinvd_intercept);
+        break;
+    }
+
      return X86EMUL_OKAY;
  }
  
@@ -2353,7 +2406,7 @@  static const struct x86_emulate_ops hvm_
      .write_xcr     = hvmemul_write_xcr,
      .read_msr      = hvmemul_read_msr,
      .write_msr     = hvmemul_write_msr,
-    .wbinvd        = hvmemul_wbinvd,
+    .cache_op      = hvmemul_cache_op,
      .cpuid         = x86emul_cpuid,
      .get_fpu       = hvmemul_get_fpu,
      .put_fpu       = hvmemul_put_fpu,
@@ -2380,7 +2433,7 @@  static const struct x86_emulate_ops hvm_
      .write_xcr     = hvmemul_write_xcr,
      .read_msr      = hvmemul_read_msr,
      .write_msr     = hvmemul_write_msr_discard,
-    .wbinvd        = hvmemul_wbinvd_discard,
+    .cache_op      = hvmemul_cache_op_discard,
      .cpuid         = x86emul_cpuid,
      .get_fpu       = hvmemul_get_fpu,
      .put_fpu       = hvmemul_put_fpu,
--- a/xen/arch/x86/pv/emul-priv-op.c
+++ b/xen/arch/x86/pv/emul-priv-op.c
@@ -1118,9 +1118,11 @@  static int write_msr(unsigned int reg, u
      return X86EMUL_UNHANDLEABLE;
  }
  
-/* Name it differently to avoid clashing with wbinvd() */
-static int _wbinvd(struct x86_emulate_ctxt *ctxt)
+static int cache_op(enum x86emul_cache_op op, enum x86_segment seg,
+                    unsigned long offset, struct x86_emulate_ctxt *ctxt)
  {
+    ASSERT(op == x86emul_wbinvd);
+
      /* Ignore the instruction if unprivileged. */
      if ( !cache_flush_permitted(current->domain) )
          /*
@@ -1238,7 +1240,7 @@  static const struct x86_emulate_ops priv
      .read_msr            = read_msr,
      .write_msr           = write_msr,
      .cpuid               = x86emul_cpuid,
-    .wbinvd              = _wbinvd,
+    .cache_op            = cache_op,
  };
  
  int pv_emulate_privileged_op(struct cpu_user_regs *regs)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -5933,8 +5933,11 @@  x86_emulate(
      case X86EMUL_OPC(0x0f, 0x08): /* invd */
      case X86EMUL_OPC(0x0f, 0x09): /* wbinvd */
          generate_exception_if(!mode_ring0(), EXC_GP, 0);
-        fail_if(ops->wbinvd == NULL);
-        if ( (rc = ops->wbinvd(ctxt)) != 0 )
+        fail_if(!ops->cache_op);
+        if ( (rc = ops->cache_op(b == 0x09 ? x86emul_wbinvd
+                                           : x86emul_invd,
+                                 x86_seg_none, 0,
+                                 ctxt)) != X86EMUL_OKAY )
              goto done;
          break;
  
@@ -7801,8 +7804,9 @@  x86_emulate(
              /* else clwb */
              fail_if(!vex.pfx);
              vcpu_must_have(clwb);
-            fail_if(!ops->wbinvd);
-            if ( (rc = ops->wbinvd(ctxt)) != X86EMUL_OKAY )
+            fail_if(!ops->cache_op);
+            if ( (rc = ops->cache_op(x86emul_clwb, ea.mem.seg, ea.mem.off,
+                                     ctxt)) != X86EMUL_OKAY )
                  goto done;
              break;
          case 7:
@@ -7818,8 +7822,11 @@  x86_emulate(
                  vcpu_must_have(clflush);
              else
                  vcpu_must_have(clflushopt);
-            fail_if(ops->wbinvd == NULL);
-            if ( (rc = ops->wbinvd(ctxt)) != 0 )
+            fail_if(!ops->cache_op);
+            if ( (rc = ops->cache_op(vex.pfx ? x86emul_clflushopt
+                                             : x86emul_clflush,
+                                     ea.mem.seg, ea.mem.off,
+                                     ctxt)) != X86EMUL_OKAY )
                  goto done;
              break;
          default:
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -176,6 +176,14 @@  enum x86_emulate_fpu_type {
      X86EMUL_FPU_none
  };
  
+enum x86emul_cache_op {
+    x86emul_clflush,
+    x86emul_clflushopt,
+    x86emul_clwb,
+    x86emul_invd,
+    x86emul_wbinvd,
+};
+
  struct x86_emulate_state;
  
  /*
@@ -452,8 +460,15 @@  struct x86_emulate_ops
          uint64_t val,
          struct x86_emulate_ctxt *ctxt);
  
-    /* wbinvd: Write-back and invalidate cache contents. */
-    int (*wbinvd)(
+    /*
+     * cache_op: Write-back and/or invalidate cache contents.
+     *
+     * @seg:@offset applicable only to some of enum x86emul_cache_op.
+     */
+    int (*cache_op)(
+        enum x86emul_cache_op op,
+        enum x86_segment seg,
+        unsigned long offset,
          struct x86_emulate_ctxt *ctxt);
  
      /* cpuid: Emulate CPUID via given set of EAX-EDX inputs/outputs. */
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -102,6 +102,8 @@ 
  #define cpu_has_rdseed          boot_cpu_has(X86_FEATURE_RDSEED)
  #define cpu_has_smap            boot_cpu_has(X86_FEATURE_SMAP)
  #define cpu_has_avx512_ifma     boot_cpu_has(X86_FEATURE_AVX512_IFMA)
+#define cpu_has_clflushopt      boot_cpu_has(X86_FEATURE_CLFLUSHOPT)
+#define cpu_has_clwb            boot_cpu_has(X86_FEATURE_CLWB)
  #define cpu_has_avx512er        boot_cpu_has(X86_FEATURE_AVX512ER)
  #define cpu_has_avx512cd        boot_cpu_has(X86_FEATURE_AVX512CD)
  #define cpu_has_sha             boot_cpu_has(X86_FEATURE_SHA)
--- a/xen/include/asm-x86/system.h
+++ b/xen/include/asm-x86/system.h
@@ -21,6 +21,23 @@  static inline void clflush(const void *p
      asm volatile ( "clflush %0" :: "m" (*(const char *)p) );
  }
  
+static inline void clflushopt(const void *p)
+{
+    asm volatile ( "data16 clflush %0" :: "m" (*(const char *)p) );
+}
+
+static inline void clwb(const void *p)
+{
+#if defined(HAVE_AS_CLWB)
+    asm volatile ( "clwb %0" :: "m" (*(const char *)p) );
+#elif defined(HAVE_AS_XSAVEOPT)
+    asm volatile ( "data16 xsaveopt %0" :: "m" (*(const char *)p) );
+#else
+    asm volatile ( ".byte 0x66, 0x0f, 0xae, 0x32"
+                   :: "d" (p), "m" (*(const char *)p) );
+#endif
+}
+
  #define xchg(ptr,v) \
      ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))