
add sysenter/syscall emulation for 32bit compat mode

Message ID 1245158713-15054-1-git-send-email-andre.przywara@amd.com (mailing list archive)
State: New, archived

Commit Message

Andre Przywara June 16, 2009, 1:25 p.m. UTC
sysenter/sysexit are not supported on AMD's 32bit compat mode, whereas
syscall is not supported on Intel's 32bit compat mode. To allow cross
vendor migration we emulate the missing instructions by setting up the
processor state accordingly.
The sysenter code was originally sketched by Amit Shah; it was completed,
debugged, extended with syscall support and made to work by Christoph Egger,
and polished up by Andre Przywara.
Please note that sysret does not need to be emulated, because it will be
executed in 64bit mode and returning to 32bit compat mode works on Intel.

Signed-off-by: Amit Shah <amit.shah@redhat.com>
Signed-off-by: Christoph Egger <christoph.egger@amd.com>
Signed-off-by: Andre Przywara <andre.przywara@amd.com>
---
 arch/x86/kvm/x86.c         |   37 ++++++-
 arch/x86/kvm/x86_emulate.c |  228 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 259 insertions(+), 6 deletions(-)

This is a reworked version of my earlier patch. I moved the code from the
former three case statements into one separate function for better
readability and understanding.
Please apply!

Thanks,
Andre.

Comments

Amit Shah June 17, 2009, 5:02 a.m. UTC | #1
Hi Andre,

On (Tue) Jun 16 2009 [15:25:13], Andre Przywara wrote:
> sysenter/sysexit are not supported on AMD's 32bit compat mode, whereas
> syscall is not supported on Intel's 32bit compat mode. To allow cross
> vendor migration we emulate the missing instructions by setting up the
> processor state accordingly.
> The sysenter code was originally sketched by Amit Shah; it was completed,
> debugged, extended with syscall support and made to work by Christoph Egger,
> and polished up by Andre Przywara.
> Please note that sysret does not need to be emulated, because it will be
> executed in 64bit mode and returning to 32bit compat mode works on Intel.

Thanks for picking this up. You also had a testcase for this, right?

		Amit
Avi Kivity June 17, 2009, 7:24 a.m. UTC | #2
On 06/16/2009 04:25 PM, Andre Przywara wrote:
> sysenter/sysexit are not supported on AMD's 32bit compat mode, whereas
> syscall is not supported on Intel's 32bit compat mode. To allow cross
> vendor migration we emulate the missing instructions by setting up the
> processor state accordingly.
> The sysenter code was originally sketched by Amit Shah; it was completed,
> debugged, extended with syscall support and made to work by Christoph Egger,
> and polished up by Andre Przywara.
> Please note that sysret does not need to be emulated, because it will be
> executed in 64bit mode and returning to 32bit compat mode works on Intel.
>
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 6025e5b..9066fb3 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2635,11 +2635,38 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
>   		/* Reject the instructions other than VMCALL/VMMCALL when
>   		 * try to emulate invalid opcode */
> 		c = &vcpu->arch.emulate_ctxt.decode;
> -		if ((emulation_type & EMULTYPE_TRAP_UD) &&
> -		    (!(c->twobyte && c->b == 0x01 &&
> -		      (c->modrm_reg == 0 || c->modrm_reg == 3) &&
> -		       c->modrm_mod == 3 && c->modrm_rm == 1)))
> -			return EMULATE_FAIL;
> +
> +		if (emulation_type & EMULTYPE_TRAP_UD) {
> +			if (!c->twobyte)
> +				return EMULATE_FAIL;
> +			switch (c->b) {
> +			case 0x01: /* VMMCALL */
> +				if (c->modrm_mod != 3)
> +					return EMULATE_FAIL;
> +				if (c->modrm_rm != 1)
> +					return EMULATE_FAIL;
> +				break;
> +			case 0x34: /* sysenter */
> +			case 0x35: /* sysexit */
> +				if (c->modrm_mod != 0)
> +					return EMULATE_FAIL;
> +				if (c->modrm_rm != 0)
> +					return EMULATE_FAIL;
> +				break;
> +			case 0x05: /* syscall */
> +				r = 0;
> +				if (c->modrm_mod != 0)
> +					return EMULATE_FAIL;
> +				if (c->modrm_rm != 0)
> +					return EMULATE_FAIL;
> +				break;
> +			default:
> +				return EMULATE_FAIL;
> +			}
> +
> +			if (!(c->modrm_reg == 0 || c->modrm_reg == 3))
> +				return EMULATE_FAIL;
> +		}
>
>   		++vcpu->stat.insn_emulation;
>   		if (r)  {
>    

This would be better encoded in the opcode table.  No need to do it for 
this patch though.
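
A hedged sketch of what that could look like — EmulateOnUD is a hypothetical flag
name and bit, not something that exists in this tree, and the check assumes the
decoded flags are reachable via c->d like the other table-driven properties:

/* Sketch only: a hypothetical decode-table flag marking instructions
 * that may be emulated when trapped via #UD.  The bit value is arbitrary
 * and must not clash with the flags the decoder already uses. */
#define EmulateOnUD (1<<20)

/* The idea: OR EmulateOnUD into the twobyte_table[] entries for
 * 0f 01 /3 (vmmcall), 0f 05 (syscall), 0f 34 (sysenter) and
 * 0f 35 (sysexit); emulate_instruction() could then test the decoded
 * flags instead of open-coding an opcode switch: */
		if ((emulation_type & EMULTYPE_TRAP_UD) &&
		    !(c->d & EmulateOnUD))
			return EMULATE_FAIL;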

> diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
> index 22c765d..6c65462 100644
> --- a/arch/x86/kvm/x86_emulate.c
> +++ b/arch/x86/kvm/x86_emulate.c
> @@ -32,6 +32,8 @@
>   #include <linux/module.h>
>   #include <asm/kvm_x86_emulate.h>
>
> +#include "mmu.h"
> +
>    

Still unneeded I presume.

> @@ -320,8 +324,11 @@ static u32 group2_table[] = {
>   };
>
>   /* EFLAGS bit definitions. */
> +#define EFLG_VM (1<<17)
> +#define EFLG_RF (1<<16)
>   #define EFLG_OF (1<<11)
>   #define EFLG_DF (1<<10)
> +#define EFLG_IF (1<<9)
>   #define EFLG_SF (1<<7)
>   #define EFLG_ZF (1<<6)
>   #define EFLG_AF (1<<4)
> @@ -1390,6 +1397,207 @@ void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
>   		ctxt->interruptibility = mask;
>   }
>
> +#define EMUL_SYSENTER 1
> +#define EMUL_SYSEXIT 2
> +#define EMUL_SYSCALL 3
> +
> +static int
> +emulate_syscalls(struct x86_emulate_ctxt *ctxt, int ins)
> +{
> +	struct decode_cache *c = &ctxt->decode;
> +	struct kvm_segment cs, ss;
> +	unsigned long cr0;
> +	u64 msr_data;
> +	int usermode;
> +
> +	/* inject #UD if LOCK prefix is used */
> +	if (c->lock_prefix)
> +		return -1;
>    

We should encode legality of the lock prefix in the opcode tables.  
Again unrelated to this patch.

> +
> +	/* inject #GP if in real mode or paging is disabled */
> +	cr0 = ctxt->vcpu->arch.cr0;
> +	if (ctxt->mode == X86EMUL_MODE_REAL || !(cr0 & X86_CR0_PE)) {
> +		if (ins == EMUL_SYSENTER || ins == EMUL_SYSEXIT)
> +			kvm_inject_gp(ctxt->vcpu, 0);
> +		return -1;
> +	}
> +
> +	/* XXX sysenter/sysexit have not been tested in 64bit mode.
> +	 * Therefore, we inject an #UD.
> +	 */
> +	if (ins == EMUL_SYSENTER && ctxt->mode == X86EMUL_MODE_PROT64)
> +		return -1;
>    

We should return EMULATE_FAIL rather than confusing the poor guest.

> +
> +	/* sysexit must be called from CPL 0 */
> +	if (ins == EMUL_SYSEXIT && kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) {
> +		kvm_inject_gp(ctxt->vcpu, 0);
> +		return -1;
> +	}
> +
> +	/* setup common descriptor content */
> +	memset(&cs, 0, sizeof(struct kvm_segment));
> +	memset(&ss, 0, sizeof(struct kvm_segment));
> +	kvm_x86_ops->get_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
> +
> +	cs.l = 0;       /* will be adjusted later */
> +	cs.base = 0;        /* flat segment */
> +	cs.g = 1;       /* 4kb granularity */
> +	cs.limit = 0xfffff; /* 4GB limit */
> +	cs.type = 0x0b;     /* Read, Execute, Accessed */
> +	cs.s = 1;
> +	cs.dpl = 0;     /* will be adjusted later */
> +	cs.present = 1;
> +	cs.db = 1;
> +
> +	ss.unusable = 0;
> +	ss.base = 0;        /* flat segment */
> +	ss.limit = 0xfffff; /* 4GB limit */
> +	ss.g = 1;       /* 4kb granularity */
> +	ss.s = 1;
> +	ss.type = 0x03;     /* Read/Write, Accessed */
> +	ss.db = 1;      /* 32bit stack segment */
> +	ss.dpl = 0;
> +	ss.present = 1;
> +
> +	if (ins == EMUL_SYSEXIT) {
> +		if ((c->rex_prefix & 0x8) != 0x0)
> +			usermode = X86EMUL_MODE_PROT64;
> +		else
> +			usermode = X86EMUL_MODE_PROT32;
> +	} else {
> +		usermode = ctxt->mode;
> +	}
> +
> +	/* now fixup the segment descriptor for each specific instruction */
> +	switch (ins) {
> +	case EMUL_SYSCALL:
> +		kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
> +		msr_data >>= 32;
> +		cs.selector = (u16)(msr_data & 0xfffc);
> +		ss.selector = (u16)(msr_data + 8);
> +		if (is_long_mode(ctxt->vcpu)) {
> +			cs.db = 0;
> +			cs.l = 1;
> +			if (usermode == X86EMUL_MODE_PROT64) {
> +				/* Intel cares about granularity (g bit),
> +				 * so we don't set the effective limit.
> +				 */
> +				cs.g = 1;
> +				cs.limit = 0xffffffff;
> +			}
> +		}
> +		break;
> +	case EMUL_SYSENTER:
> +		kvm_x86_ops->get_msr(ctxt->vcpu,
> +			MSR_IA32_SYSENTER_CS, &msr_data);
> +		switch (usermode) {
> +		case X86EMUL_MODE_PROT32:
> +			if ((msr_data & 0xfffc) == 0x0) {
> +				kvm_inject_gp(ctxt->vcpu, 0);
> +				return -1;
> +			}
> +			break;
> +		case X86EMUL_MODE_PROT64:
> +			if (msr_data == 0x0) {
> +				kvm_inject_gp(ctxt->vcpu, 0);
> +				return -1;
> +			}
> +			break;
> +		}
> +		ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
> +		cs.selector = (u16)msr_data;
> +		cs.selector &= ~SELECTOR_RPL_MASK;
> +		ss.selector = cs.selector + 8;
> +		ss.selector &= ~SELECTOR_RPL_MASK;
> +		if (usermode == X86EMUL_MODE_PROT64
> +			|| is_long_mode(ctxt->vcpu)) {
> +			cs.db = 0;
> +			cs.l = 1;
> +			cs.limit = 0xffffffff;
> +			ss.limit = 0xffffffff;
> +		}
> +		break;
> +	case EMUL_SYSEXIT:
> +		/* We don't care about cs.g/ss.g bits
> +		 * (= 4kb granularity) so we have to set the effective
> +		 * limit here or we get a #GP in the guest, otherwise.
> +		 */
> +		cs.limit = 0xffffffff;
> +		ss.limit = 0xffffffff;
> +		cs.dpl = 3;
> +		ss.dpl = 3;
> +		kvm_x86_ops->get_msr(ctxt->vcpu,
> +			MSR_IA32_SYSENTER_CS, &msr_data);
> +		switch (usermode) {
> +		case X86EMUL_MODE_PROT32:
> +			cs.selector = (u16)(msr_data + 16);
> +			if ((msr_data & 0xfffc) == 0x0) {
> +				kvm_inject_gp(ctxt->vcpu, 0);
> +				return -1;
> +			}
> +			ss.selector = (u16)(msr_data + 24);
> +			break;
> +		case X86EMUL_MODE_PROT64:
> +			cs.selector = (u16)(msr_data + 32);
> +			if (msr_data == 0x0) {
> +				kvm_inject_gp(ctxt->vcpu, 0);
> +				return -1;
> +			}
> +			ss.selector = cs.selector + 8;
> +			cs.db = 0;
> +			cs.l = 1;
> +			break;
> +		}
> +		cs.selector |= SELECTOR_RPL_MASK;
> +		ss.selector |= SELECTOR_RPL_MASK;
> +		break;
> +	}
> +
> +	kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
> +	kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
> +
> +	switch (ins) {
> +	case EMUL_SYSCALL:
> +		c->regs[VCPU_REGS_RCX] = c->eip;
> +		if (is_long_mode(ctxt->vcpu)) {
> +			c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
> +
> +			kvm_x86_ops->get_msr(ctxt->vcpu,
> +				usermode == X86EMUL_MODE_PROT64 ?
> +				MSR_LSTAR : MSR_CSTAR,
> +				&msr_data);
> +			c->eip = msr_data;
> +
> +			kvm_x86_ops->get_msr(ctxt->vcpu,
> +				MSR_SYSCALL_MASK, &msr_data);
> +			ctxt->eflags &= ~(msr_data | EFLG_RF);
> +		} else {
> +			/* legacy mode */
> +			kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
> +			c->eip = (u32)msr_data;
> +
> +			ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
> +		}
> +		break;
> +	case EMUL_SYSENTER:
> +		kvm_x86_ops->get_msr(ctxt->vcpu,
> +			MSR_IA32_SYSENTER_EIP, &msr_data);
> +		c->eip = msr_data;
> +
> +		kvm_x86_ops->get_msr(ctxt->vcpu,
> +			MSR_IA32_SYSENTER_ESP, &msr_data);
> +		c->regs[VCPU_REGS_RSP] = msr_data;
> +		break;
> +
> +	case EMUL_SYSEXIT:
> +		c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX];
> +		c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX];
> +		break;
> +	}
> +
> +	return 0;
> +}
>    

This function is way too big.  Please split it up into one function per
instruction, and use a helper function for the common code.
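
For reference, a sketch of one way the split could look; the helper body only
collects the common descriptor setup from above, and the function names are
assumptions rather than code from this patch:

static void setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
				    struct kvm_segment *cs,
				    struct kvm_segment *ss)
{
	/* common flat-segment setup, moved out of emulate_syscalls() */
	memset(cs, 0, sizeof(struct kvm_segment));
	memset(ss, 0, sizeof(struct kvm_segment));
	kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS);

	cs->l = 0;		/* will be adjusted later */
	cs->base = 0;		/* flat segment */
	cs->g = 1;		/* 4kb granularity */
	cs->limit = 0xfffff;	/* 4GB limit */
	cs->type = 0x0b;	/* Read, Execute, Accessed */
	cs->s = 1;
	cs->dpl = 0;		/* will be adjusted later */
	cs->present = 1;
	cs->db = 1;

	ss->unusable = 0;
	ss->base = 0;		/* flat segment */
	ss->limit = 0xfffff;	/* 4GB limit */
	ss->g = 1;		/* 4kb granularity */
	ss->s = 1;
	ss->type = 0x03;	/* Read/Write, Accessed */
	ss->db = 1;		/* 32bit stack segment */
	ss->dpl = 0;
	ss->present = 1;
}

/* one function per instruction, each calling the helper first and then
 * applying only its own selector/eflags/rip fixups */
static int emulate_syscall(struct x86_emulate_ctxt *ctxt);
static int emulate_sysenter(struct x86_emulate_ctxt *ctxt);
static int emulate_sysexit(struct x86_emulate_ctxt *ctxt);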
Andre Przywara June 17, 2009, 7:51 a.m. UTC | #3
Amit Shah wrote:
> Hi Andre,
> 
> On (Tue) Jun 16 2009 [15:25:13], Andre Przywara wrote:
>> sysenter/sysexit are not supported on AMD's 32bit compat mode, whereas
>> syscall is not supported on Intel's 32bit compat mode. To allow cross
>> vendor migration we emulate the missing instructions by setting up the
>> processor state accordingly.
>> The sysenter code was originally sketched by Amit Shah; it was completed,
>> debugged, extended with syscall support and made to work by Christoph Egger,
>> and polished up by Andre Przywara.
>> Please note that sysret does not need to be emulated, because it will be
>> executed in 64bit mode and returning to 32bit compat mode works on Intel.
> 
> Thanks for picking this up. You also had a testcase for this, right?
Yes. We started a 64bit Linux guest on an AMD host and migrated it to an 
Intel box. On both boxes we ran a test benchmark which measures the time 
needed for a few million syscalls (GETPID via syscall()), both as a 64bit 
and as a 32bit binary. Then we repeated the test the other way round 
(started on Intel, migrated to AMD).
Without the patch the 32bit test was aborted with SIGILL. With the patch 
everything went well, except that the execution time was much longer.
Earlier we observed that this shouldn't be much of an issue in real 
life, as a tight-loop execution of GETPID is a real corner case.

Regards,
Andre.
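
For reference, a minimal sketch of the kind of benchmark described above (the
shape of the test is assumed here, it is not the original program): it times a
few million GETPID calls issued through syscall() and would be built once as a
64bit and once as a 32bit binary (e.g. with gcc -m32).

#include <stdio.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <unistd.h>

int main(void)
{
	const long iterations = 10 * 1000 * 1000;	/* "a few million" syscalls */
	struct timeval start, end;
	long i;

	gettimeofday(&start, NULL);
	for (i = 0; i < iterations; i++)
		syscall(SYS_getpid);			/* GETPID via syscall() */
	gettimeofday(&end, NULL);

	printf("%ld getpid syscalls took %.3f s\n", iterations,
	       (end.tv_sec - start.tv_sec) +
	       (end.tv_usec - start.tv_usec) / 1e6);
	return 0;
}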

Patch

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6025e5b..9066fb3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2635,11 +2635,38 @@  int emulate_instruction(struct kvm_vcpu *vcpu,
 		/* Reject the instructions other than VMCALL/VMMCALL when
 		 * try to emulate invalid opcode */
 		c = &vcpu->arch.emulate_ctxt.decode;
-		if ((emulation_type & EMULTYPE_TRAP_UD) &&
-		    (!(c->twobyte && c->b == 0x01 &&
-		      (c->modrm_reg == 0 || c->modrm_reg == 3) &&
-		       c->modrm_mod == 3 && c->modrm_rm == 1)))
-			return EMULATE_FAIL;
+
+		if (emulation_type & EMULTYPE_TRAP_UD) {
+			if (!c->twobyte)
+				return EMULATE_FAIL;
+			switch (c->b) {
+			case 0x01: /* VMMCALL */
+				if (c->modrm_mod != 3)
+					return EMULATE_FAIL;
+				if (c->modrm_rm != 1)
+					return EMULATE_FAIL;
+				break;
+			case 0x34: /* sysenter */
+			case 0x35: /* sysexit */
+				if (c->modrm_mod != 0)
+					return EMULATE_FAIL;
+				if (c->modrm_rm != 0)
+					return EMULATE_FAIL;
+				break;
+			case 0x05: /* syscall */
+				r = 0;
+				if (c->modrm_mod != 0)
+					return EMULATE_FAIL;
+				if (c->modrm_rm != 0)
+					return EMULATE_FAIL;
+				break;
+			default:
+				return EMULATE_FAIL;
+			}
+
+			if (!(c->modrm_reg == 0 || c->modrm_reg == 3))
+				return EMULATE_FAIL;
+		}
 
 		++vcpu->stat.insn_emulation;
 		if (r)  {
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 22c765d..6c65462 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -32,6 +32,8 @@ 
 #include <linux/module.h>
 #include <asm/kvm_x86_emulate.h>
 
+#include "mmu.h"
+
 /*
  * Opcode effective-address decode tables.
  * Note that we only emulate instructions that have at least one memory
@@ -217,7 +219,9 @@  static u32 twobyte_table[256] = {
 	ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0,
 	/* 0x30 - 0x3F */
-	ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	ImplicitOps, 0, ImplicitOps, 0,
+	ImplicitOps, ImplicitOps, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0,
 	/* 0x40 - 0x47 */
 	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
 	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
@@ -320,8 +324,11 @@  static u32 group2_table[] = {
 };
 
 /* EFLAGS bit definitions. */
+#define EFLG_VM (1<<17)
+#define EFLG_RF (1<<16)
 #define EFLG_OF (1<<11)
 #define EFLG_DF (1<<10)
+#define EFLG_IF (1<<9)
 #define EFLG_SF (1<<7)
 #define EFLG_ZF (1<<6)
 #define EFLG_AF (1<<4)
@@ -1390,6 +1397,207 @@  void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
 		ctxt->interruptibility = mask;
 }
 
+#define EMUL_SYSENTER 1
+#define EMUL_SYSEXIT 2
+#define EMUL_SYSCALL 3
+
+static int
+emulate_syscalls(struct x86_emulate_ctxt *ctxt, int ins)
+{
+	struct decode_cache *c = &ctxt->decode;
+	struct kvm_segment cs, ss;
+	unsigned long cr0;
+	u64 msr_data;
+	int usermode;
+
+	/* inject #UD if LOCK prefix is used */
+	if (c->lock_prefix)
+		return -1;
+
+	/* inject #GP if in real mode or paging is disabled */
+	cr0 = ctxt->vcpu->arch.cr0;
+	if (ctxt->mode == X86EMUL_MODE_REAL || !(cr0 & X86_CR0_PE)) {
+		if (ins == EMUL_SYSENTER || ins == EMUL_SYSEXIT)
+			kvm_inject_gp(ctxt->vcpu, 0);
+		return -1;
+	}
+
+	/* XXX sysenter/sysexit have not been tested in 64bit mode.
+	 * Therefore, we inject an #UD.
+	 */
+	if (ins == EMUL_SYSENTER && ctxt->mode == X86EMUL_MODE_PROT64)
+		return -1;
+
+	/* sysexit must be called from CPL 0 */
+	if (ins == EMUL_SYSEXIT && kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) {
+		kvm_inject_gp(ctxt->vcpu, 0);
+		return -1;
+	}
+
+	/* setup common descriptor content */
+	memset(&cs, 0, sizeof(struct kvm_segment));
+	memset(&ss, 0, sizeof(struct kvm_segment));
+	kvm_x86_ops->get_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
+
+	cs.l = 0;       /* will be adjusted later */
+	cs.base = 0;        /* flat segment */
+	cs.g = 1;       /* 4kb granularity */
+	cs.limit = 0xfffff; /* 4GB limit */
+	cs.type = 0x0b;     /* Read, Execute, Accessed */
+	cs.s = 1;
+	cs.dpl = 0;     /* will be adjusted later */
+	cs.present = 1;
+	cs.db = 1;
+
+	ss.unusable = 0;
+	ss.base = 0;        /* flat segment */
+	ss.limit = 0xfffff; /* 4GB limit */
+	ss.g = 1;       /* 4kb granularity */
+	ss.s = 1;
+	ss.type = 0x03;     /* Read/Write, Accessed */
+	ss.db = 1;      /* 32bit stack segment */
+	ss.dpl = 0;
+	ss.present = 1;
+
+	if (ins == EMUL_SYSEXIT) {
+		if ((c->rex_prefix & 0x8) != 0x0)
+			usermode = X86EMUL_MODE_PROT64;
+		else
+			usermode = X86EMUL_MODE_PROT32;
+	} else {
+		usermode = ctxt->mode;
+	}
+
+	/* now fixup the segment descriptor for each specific instruction */
+	switch (ins) {
+	case EMUL_SYSCALL:
+		kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+		msr_data >>= 32;
+		cs.selector = (u16)(msr_data & 0xfffc);
+		ss.selector = (u16)(msr_data + 8);
+		if (is_long_mode(ctxt->vcpu)) {
+			cs.db = 0;
+			cs.l = 1;
+			if (usermode == X86EMUL_MODE_PROT64) {
+				/* Intel cares about granularity (g bit),
+				 * so we don't set the effective limit.
+				 */
+				cs.g = 1;
+				cs.limit = 0xffffffff;
+			}
+		}
+		break;
+	case EMUL_SYSENTER:
+		kvm_x86_ops->get_msr(ctxt->vcpu,
+			MSR_IA32_SYSENTER_CS, &msr_data);
+		switch (usermode) {
+		case X86EMUL_MODE_PROT32:
+			if ((msr_data & 0xfffc) == 0x0) {
+				kvm_inject_gp(ctxt->vcpu, 0);
+				return -1;
+			}
+			break;
+		case X86EMUL_MODE_PROT64:
+			if (msr_data == 0x0) {
+				kvm_inject_gp(ctxt->vcpu, 0);
+				return -1;
+			}
+			break;
+		}
+		ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
+		cs.selector = (u16)msr_data;
+		cs.selector &= ~SELECTOR_RPL_MASK;
+		ss.selector = cs.selector + 8;
+		ss.selector &= ~SELECTOR_RPL_MASK;
+		if (usermode == X86EMUL_MODE_PROT64
+			|| is_long_mode(ctxt->vcpu)) {
+			cs.db = 0;
+			cs.l = 1;
+			cs.limit = 0xffffffff;
+			ss.limit = 0xffffffff;
+		}
+		break;
+	case EMUL_SYSEXIT:
+		/* We don't care about cs.g/ss.g bits
+		 * (= 4kb granularity) so we have to set the effective
+		 * limit here or we get a #GP in the guest, otherwise.
+		 */
+		cs.limit = 0xffffffff;
+		ss.limit = 0xffffffff;
+		cs.dpl = 3;
+		ss.dpl = 3;
+		kvm_x86_ops->get_msr(ctxt->vcpu,
+			MSR_IA32_SYSENTER_CS, &msr_data);
+		switch (usermode) {
+		case X86EMUL_MODE_PROT32:
+			cs.selector = (u16)(msr_data + 16);
+			if ((msr_data & 0xfffc) == 0x0) {
+				kvm_inject_gp(ctxt->vcpu, 0);
+				return -1;
+			}
+			ss.selector = (u16)(msr_data + 24);
+			break;
+		case X86EMUL_MODE_PROT64:
+			cs.selector = (u16)(msr_data + 32);
+			if (msr_data == 0x0) {
+				kvm_inject_gp(ctxt->vcpu, 0);
+				return -1;
+			}
+			ss.selector = cs.selector + 8;
+			cs.db = 0;
+			cs.l = 1;
+			break;
+		}
+		cs.selector |= SELECTOR_RPL_MASK;
+		ss.selector |= SELECTOR_RPL_MASK;
+		break;
+	}
+
+	kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
+	kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+
+	switch (ins) {
+	case EMUL_SYSCALL:
+		c->regs[VCPU_REGS_RCX] = c->eip;
+		if (is_long_mode(ctxt->vcpu)) {
+			c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
+
+			kvm_x86_ops->get_msr(ctxt->vcpu,
+				usermode == X86EMUL_MODE_PROT64 ?
+				MSR_LSTAR : MSR_CSTAR,
+				&msr_data);
+			c->eip = msr_data;
+
+			kvm_x86_ops->get_msr(ctxt->vcpu,
+				MSR_SYSCALL_MASK, &msr_data);
+			ctxt->eflags &= ~(msr_data | EFLG_RF);
+		} else {
+			/* legacy mode */
+			kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+			c->eip = (u32)msr_data;
+
+			ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
+		}
+		break;
+	case EMUL_SYSENTER:
+		kvm_x86_ops->get_msr(ctxt->vcpu,
+			MSR_IA32_SYSENTER_EIP, &msr_data);
+		c->eip = msr_data;
+
+		kvm_x86_ops->get_msr(ctxt->vcpu,
+			MSR_IA32_SYSENTER_ESP, &msr_data);
+		c->regs[VCPU_REGS_RSP] = msr_data;
+		break;
+
+	case EMUL_SYSEXIT:
+		c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX];
+		c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX];
+		break;
+	}
+
+	return 0;
+}
+
 int
 x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
@@ -1985,6 +2193,12 @@  twobyte_insn:
 			goto cannot_emulate;
 		}
 		break;
+	case 0x05: /* syscall */
+		if (emulate_syscalls(ctxt, EMUL_SYSCALL) == -1)
+			goto cannot_emulate;
+		else
+			goto writeback;
+		break;
 	case 0x06:
 		emulate_clts(ctxt->vcpu);
 		c->dst.type = OP_NONE;
@@ -2051,6 +2265,18 @@  twobyte_insn:
 		rc = X86EMUL_CONTINUE;
 		c->dst.type = OP_NONE;
 		break;
+	case 0x34: /* sysenter */
+		if (emulate_syscalls(ctxt, EMUL_SYSENTER) == -1)
+			goto cannot_emulate;
+		else
+			goto writeback;
+		break;
+	case 0x35: /* sysexit */
+		if (emulate_syscalls(ctxt, EMUL_SYSEXIT) == -1)
+			goto cannot_emulate;
+		else
+			goto writeback;
+		break;
 	case 0x40 ... 0x4f:	/* cmov */
 		c->dst.val = c->dst.orig_val = c->src.val;
 		if (!test_cc(c->b, ctxt->eflags))