diff mbox

[PATCHv5,4/4] KVM: emulator: optimize "rep ins" handling.

Message ID 1343659101-24877-5-git-send-email-gleb@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Gleb Natapov July 30, 2012, 2:38 p.m. UTC
Optimize "rep ins" by allowing the emulator to write back more than one
datum at a time. Introduce a new operand type, OP_MEM_STR, which tells
writeback() that dst contains a pointer to an array that should be written
back, as opposed to just one data element.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |    4 +++-
 arch/x86/kvm/emulate.c             |   33 ++++++++++++++++++++++++++++-----
 2 files changed, 31 insertions(+), 6 deletions(-)

Comments

Avi Kivity Aug. 5, 2012, 3:03 p.m. UTC | #1
On 07/30/2012 05:38 PM, Gleb Natapov wrote:
> Optimize "rep ins" by allowing emulator to write back more than one
> datum at a time. Introduce new operand type OP_MEM_STR which tells
> writeback() that dst contains pointer to an array that should be written
> back as opposite to just one data element.
> 
>  
>  	if (ctxt->rep_prefix && (ctxt->d & String)) {
> +		unsigned int count;
>  		struct read_cache *r = &ctxt->io_read;
> -		register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1);
> +		if ((ctxt->d & SrcMask) == SrcSI)
> +			count = ctxt->src.count;
> +		else
> +			count = ctxt->dst.count;
> +		register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX],
> +				-count);
>  

count is unsigned.  Does it sign extend correctly in
register_address_increment()?
Gleb Natapov Aug. 5, 2012, 3:18 p.m. UTC | #2
On Sun, Aug 05, 2012 at 06:03:12PM +0300, Avi Kivity wrote:
> On 07/30/2012 05:38 PM, Gleb Natapov wrote:
> > Optimize "rep ins" by allowing emulator to write back more than one
> > datum at a time. Introduce new operand type OP_MEM_STR which tells
> > writeback() that dst contains pointer to an array that should be written
> > back as opposite to just one data element.
> > 
> >  
> >  	if (ctxt->rep_prefix && (ctxt->d & String)) {
> > +		unsigned int count;
> >  		struct read_cache *r = &ctxt->io_read;
> > -		register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1);
> > +		if ((ctxt->d & SrcMask) == SrcSI)
> > +			count = ctxt->src.count;
> > +		else
> > +			count = ctxt->dst.count;
> > +		register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX],
> > +				-count);
> >  
> 
> count is unsigned.  Does it sign extend correctly in
> register_address_increment()?
> 
I think it is sign-extended before register_address_increment() when the
compiler sees -count. count is in the range 1-1024 here, so there shouldn't
be a problem. But I welcome better suggestions.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity Aug. 5, 2012, 3:20 p.m. UTC | #3
On 08/05/2012 06:18 PM, Gleb Natapov wrote:
> On Sun, Aug 05, 2012 at 06:03:12PM +0300, Avi Kivity wrote:
>> On 07/30/2012 05:38 PM, Gleb Natapov wrote:
>> > Optimize "rep ins" by allowing emulator to write back more than one
>> > datum at a time. Introduce new operand type OP_MEM_STR which tells
>> > writeback() that dst contains pointer to an array that should be written
>> > back as opposite to just one data element.
>> > 
>> >  
>> >  	if (ctxt->rep_prefix && (ctxt->d & String)) {
>> > +		unsigned int count;
>> >  		struct read_cache *r = &ctxt->io_read;
>> > -		register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1);
>> > +		if ((ctxt->d & SrcMask) == SrcSI)
>> > +			count = ctxt->src.count;
>> > +		else
>> > +			count = ctxt->dst.count;
>> > +		register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX],
>> > +				-count);
>> >  
>> 
>> count is unsigned.  Does it sign extend correctly in
>> register_address_increment()?
>> 
> I think it sign extent before register_address_increment() when compiler
> sees -count. count is in the range 1-1024 here, so there shouldn't be a
> problem. By I welcome better suggestions.

There is actually no problem since the 'inc' parameter is signed.
Avi Kivity Aug. 6, 2012, 8:50 a.m. UTC | #4
On 07/30/2012 05:38 PM, Gleb Natapov wrote:
> Optimize "rep ins" by allowing emulator to write back more than one
> datum at a time. Introduce new operand type OP_MEM_STR which tells
> writeback() that dst contains pointer to an array that should be written
> back as opposite to just one data element.
> 
>  	}
>  
> -	memcpy(dest, rc->data + rc->pos, size);
> -	rc->pos += size;
> +	if (ctxt->rep_prefix && !(ctxt->eflags & EFLG_DF)) {
> +		ctxt->dst.data = rc->data + rc->pos;
> +		ctxt->dst.type = OP_MEM_STR;
> +		ctxt->dst.count = (rc->end - rc->pos) / size;
> +		rc->pos = rc->end;

Should take into account the segment limit.

> +	} else {
> +		memcpy(dest, rc->data + rc->pos, size);
> +		rc->pos += size;
> +	}
>  	return 1;
>  }
>  
> @@ -1500,6 +1507,14 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
>  		if (rc != X86EMUL_CONTINUE)
>  			return rc;
>  		break;
> +	case OP_MEM_STR:
> +		rc = segmented_write(ctxt,
> +				ctxt->dst.addr.mem,
> +				ctxt->dst.data,
> +				ctxt->dst.bytes * ctxt->dst.count);
> +		if (rc != X86EMUL_CONTINUE)
> +			return rc;
> +		break;
>  	case OP_XMM:
>  		write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm);
>  		break;
> @@ -2732,7 +2747,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
>  static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg,
>  		struct operand *op)
>  {
> -	int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
> +	int df = (ctxt->eflags & EFLG_DF) ? -op->count : op->count;
>  
>  	register_address_increment(ctxt, &ctxt->regs[reg], df * op->bytes);
>  	op->addr.mem.ea = register_address(ctxt, ctxt->regs[reg]);
> @@ -3672,7 +3687,7 @@ static struct opcode opcode_table[256] = {
>  	I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
>  	I(SrcImmByte | Mov | Stack, em_push),
>  	I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
> -	I2bvIP(DstDI | SrcDX | Mov | String, em_in, ins, check_perm_in), /* insb, insw/insd */
> +	I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */

Eww.

>  	I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */
>  	/* 0x70 - 0x7F */
>  	X16(D(SrcImmByte)),
> @@ -3930,6 +3945,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
>  			register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]);
>  		op->addr.mem.seg = VCPU_SREG_ES;
>  		op->val = 0;
> +		op->count = 1;
>  		break;
>  	case OpDX:
>  		op->type = OP_REG;
> @@ -3973,6 +3989,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
>  			register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]);
>  		op->addr.mem.seg = seg_override(ctxt);
>  		op->val = 0;
> +		op->count = 1;
>  		break;
>  	case OpImmFAddr:
>  		op->type = OP_IMM;
> @@ -4513,8 +4530,14 @@ writeback:
>  		string_addr_inc(ctxt, VCPU_REGS_RDI, &ctxt->dst);
>  
>  	if (ctxt->rep_prefix && (ctxt->d & String)) {
> +		unsigned int count;
>  		struct read_cache *r = &ctxt->io_read;
> -		register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1);
> +		if ((ctxt->d & SrcMask) == SrcSI)
> +			count = ctxt->src.count;
> +		else
> +			count = ctxt->dst.count;

Does this work correctly for 'rep movs' and friends?
Gleb Natapov Aug. 6, 2012, 8:58 a.m. UTC | #5
On Mon, Aug 06, 2012 at 11:50:20AM +0300, Avi Kivity wrote:
> On 07/30/2012 05:38 PM, Gleb Natapov wrote:
> > Optimize "rep ins" by allowing emulator to write back more than one
> > datum at a time. Introduce new operand type OP_MEM_STR which tells
> > writeback() that dst contains pointer to an array that should be written
> > back as opposite to just one data element.
> > 
> >  	}
> >  
> > -	memcpy(dest, rc->data + rc->pos, size);
> > -	rc->pos += size;
> > +	if (ctxt->rep_prefix && !(ctxt->eflags & EFLG_DF)) {
> > +		ctxt->dst.data = rc->data + rc->pos;
> > +		ctxt->dst.type = OP_MEM_STR;
> > +		ctxt->dst.count = (rc->end - rc->pos) / size;
> > +		rc->pos = rc->end;
> 
> Should take into account the segment limit.
> 
It does, during writeback. pio_in_emulated() should linearize() the address
before calculating the page boundary, but this is a (minor) bug unrelated to
the patch series.

> > +	} else {
> > +		memcpy(dest, rc->data + rc->pos, size);
> > +		rc->pos += size;
> > +	}
> >  	return 1;
> >  }
> >  
> > @@ -1500,6 +1507,14 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
> >  		if (rc != X86EMUL_CONTINUE)
> >  			return rc;
> >  		break;
> > +	case OP_MEM_STR:
> > +		rc = segmented_write(ctxt,
> > +				ctxt->dst.addr.mem,
> > +				ctxt->dst.data,
> > +				ctxt->dst.bytes * ctxt->dst.count);
> > +		if (rc != X86EMUL_CONTINUE)
> > +			return rc;
> > +		break;
> >  	case OP_XMM:
> >  		write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm);
> >  		break;
> > @@ -2732,7 +2747,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
> >  static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg,
> >  		struct operand *op)
> >  {
> > -	int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
> > +	int df = (ctxt->eflags & EFLG_DF) ? -op->count : op->count;
> >  
> >  	register_address_increment(ctxt, &ctxt->regs[reg], df * op->bytes);
> >  	op->addr.mem.ea = register_address(ctxt, ctxt->regs[reg]);
> > @@ -3672,7 +3687,7 @@ static struct opcode opcode_table[256] = {
> >  	I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
> >  	I(SrcImmByte | Mov | Stack, em_push),
> >  	I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
> > -	I2bvIP(DstDI | SrcDX | Mov | String, em_in, ins, check_perm_in), /* insb, insw/insd */
> > +	I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */
> 
> Eww.
This brings us back to the question what alignment check is doing in
linearize :)

> 
> >  	I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */
> >  	/* 0x70 - 0x7F */
> >  	X16(D(SrcImmByte)),
> > @@ -3930,6 +3945,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
> >  			register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]);
> >  		op->addr.mem.seg = VCPU_SREG_ES;
> >  		op->val = 0;
> > +		op->count = 1;
> >  		break;
> >  	case OpDX:
> >  		op->type = OP_REG;
> > @@ -3973,6 +3989,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
> >  			register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]);
> >  		op->addr.mem.seg = seg_override(ctxt);
> >  		op->val = 0;
> > +		op->count = 1;
> >  		break;
> >  	case OpImmFAddr:
> >  		op->type = OP_IMM;
> > @@ -4513,8 +4530,14 @@ writeback:
> >  		string_addr_inc(ctxt, VCPU_REGS_RDI, &ctxt->dst);
> >  
> >  	if (ctxt->rep_prefix && (ctxt->d & String)) {
> > +		unsigned int count;
> >  		struct read_cache *r = &ctxt->io_read;
> > -		register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1);
> > +		if ((ctxt->d & SrcMask) == SrcSI)
> > +			count = ctxt->src.count;
> > +		else
> > +			count = ctxt->dst.count;
> 
> Does this work correctly for 'rep movs' and friends?
> 
(src|dst).count is initialized to 1 during decode, so anything that does
not touch "count" behaves exactly like before.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity Aug. 6, 2012, 9:28 a.m. UTC | #6
On 08/06/2012 11:58 AM, Gleb Natapov wrote:
> On Mon, Aug 06, 2012 at 11:50:20AM +0300, Avi Kivity wrote:
>> On 07/30/2012 05:38 PM, Gleb Natapov wrote:
>> > Optimize "rep ins" by allowing emulator to write back more than one
>> > datum at a time. Introduce new operand type OP_MEM_STR which tells
>> > writeback() that dst contains pointer to an array that should be written
>> > back as opposite to just one data element.
>> > 
>> >  	}
>> >  
>> > -	memcpy(dest, rc->data + rc->pos, size);
>> > -	rc->pos += size;
>> > +	if (ctxt->rep_prefix && !(ctxt->eflags & EFLG_DF)) {
>> > +		ctxt->dst.data = rc->data + rc->pos;
>> > +		ctxt->dst.type = OP_MEM_STR;
>> > +		ctxt->dst.count = (rc->end - rc->pos) / size;
>> > +		rc->pos = rc->end;
>> 
>> Should take into account the segment limit.
>> 
> It does. During write back. pio_in_emulated() should linearize() address
> before calculating page boundary, but this is (minor) bug unrelated to the patch
> series.

I see, yes, this problem preexists.

However, in normal conditions, non-repeating instructions will not reach
the emulator at all since they will fault in the guest (or in the shadow
mmu, which will reflect the fault to the guest).  Here, the first
iteration may fit in the segment but the second will not, so this will fail.

It's not a huge problem since no guest does this.

>> > @@ -2732,7 +2747,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
>> >  static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg,
>> >  		struct operand *op)
>> >  {
>> > -	int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
>> > +	int df = (ctxt->eflags & EFLG_DF) ? -op->count : op->count;
>> >  
>> >  	register_address_increment(ctxt, &ctxt->regs[reg], df * op->bytes);
>> >  	op->addr.mem.ea = register_address(ctxt, ctxt->regs[reg]);
>> > @@ -3672,7 +3687,7 @@ static struct opcode opcode_table[256] = {
>> >  	I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
>> >  	I(SrcImmByte | Mov | Stack, em_push),
>> >  	I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
>> > -	I2bvIP(DstDI | SrcDX | Mov | String, em_in, ins, check_perm_in), /* insb, insw/insd */
>> > +	I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */
>> 
>> Eww.
> This brings us back to the question what alignment check is doing in
> linearize :)

It's checking alignment...

Let's see how we would fix this mess.  We need to move linearization
(and virt->phys translation) to the decode stage, or perhaps the
execution state, but before instruction dispatch.  This would cause all
the various exceptions to be checked against before execution, and would
avoid double translation for RMW operands.


>> >  		string_addr_inc(ctxt, VCPU_REGS_RDI, &ctxt->dst);
>> >  
>> >  	if (ctxt->rep_prefix && (ctxt->d & String)) {
>> > +		unsigned int count;
>> >  		struct read_cache *r = &ctxt->io_read;
>> > -		register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1);
>> > +		if ((ctxt->d & SrcMask) == SrcSI)
>> > +			count = ctxt->src.count;
>> > +		else
>> > +			count = ctxt->dst.count;
>> 
>> Does this work correctly for 'rep movs' and friends?
>> 
> (src|dst).count is initialized to 1 during decode, so anything that does
> not touch "count" behaves exactly like before.

Ok.
Gleb Natapov Aug. 6, 2012, 11:05 a.m. UTC | #7
On Mon, Aug 06, 2012 at 12:28:05PM +0300, Avi Kivity wrote:
> On 08/06/2012 11:58 AM, Gleb Natapov wrote:
> > On Mon, Aug 06, 2012 at 11:50:20AM +0300, Avi Kivity wrote:
> >> On 07/30/2012 05:38 PM, Gleb Natapov wrote:
> >> > Optimize "rep ins" by allowing emulator to write back more than one
> >> > datum at a time. Introduce new operand type OP_MEM_STR which tells
> >> > writeback() that dst contains pointer to an array that should be written
> >> > back as opposite to just one data element.
> >> > 
> >> >  	}
> >> >  
> >> > -	memcpy(dest, rc->data + rc->pos, size);
> >> > -	rc->pos += size;
> >> > +	if (ctxt->rep_prefix && !(ctxt->eflags & EFLG_DF)) {
> >> > +		ctxt->dst.data = rc->data + rc->pos;
> >> > +		ctxt->dst.type = OP_MEM_STR;
> >> > +		ctxt->dst.count = (rc->end - rc->pos) / size;
> >> > +		rc->pos = rc->end;
> >> 
> >> Should take into account the segment limit.
> >> 
> > It does. During write back. pio_in_emulated() should linearize() address
> > before calculating page boundary, but this is (minor) bug unrelated to the patch
> > series.
> 
> I see, yes, this problem preexists.
> 
> However, in normal conditions, non-repeating instructions will not reach
> the emulator at all since they will fault in the guest (or in the shadow
> mmu, which will reflect the fault to the guest).  Here, the first
> iteration may fit in the segment but the second will not, so this will fail.
> 
Correct. And this can happen with or without the patch series.

> It's not a huge problem since no guest does this.
> 
> >> > @@ -2732,7 +2747,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
> >> >  static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg,
> >> >  		struct operand *op)
> >> >  {
> >> > -	int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
> >> > +	int df = (ctxt->eflags & EFLG_DF) ? -op->count : op->count;
> >> >  
> >> >  	register_address_increment(ctxt, &ctxt->regs[reg], df * op->bytes);
> >> >  	op->addr.mem.ea = register_address(ctxt, ctxt->regs[reg]);
> >> > @@ -3672,7 +3687,7 @@ static struct opcode opcode_table[256] = {
> >> >  	I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
> >> >  	I(SrcImmByte | Mov | Stack, em_push),
> >> >  	I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
> >> > -	I2bvIP(DstDI | SrcDX | Mov | String, em_in, ins, check_perm_in), /* insb, insw/insd */
> >> > +	I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */
> >> 
> >> Eww.
> > This brings us back to the question what alignment check is doing in
> > linearize :)
> 
> It's checking alignment...
> 
Either it checks it in the wrong place, or we need to mark all instructions
that do not care about alignment, so the patch is not "Eww" :)

> Let's see how we would fix this mess.  We need to move linearization
> (and virt->phys translation) to the decode stage, or perhaps the
> execution state, but before instruction dispatch.  This would cause all
> the various exceptions to be checked against before execution, and would
> avoid double translation for RMW operands.
> 
Execution state, likely. String instructions work on segmented addresses,
for instance (address increment/decrement). Maybe there are others.

> 
> >> >  		string_addr_inc(ctxt, VCPU_REGS_RDI, &ctxt->dst);
> >> >  
> >> >  	if (ctxt->rep_prefix && (ctxt->d & String)) {
> >> > +		unsigned int count;
> >> >  		struct read_cache *r = &ctxt->io_read;
> >> > -		register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1);
> >> > +		if ((ctxt->d & SrcMask) == SrcSI)
> >> > +			count = ctxt->src.count;
> >> > +		else
> >> > +			count = ctxt->dst.count;
> >> 
> >> Does this work correctly for 'rep movs' and friends?
> >> 
> > (src|dst).count is initialized to 1 during decode, so anything that does
> > not touch "count" behaves exactly like before.
> 
> Ok.
> 
> 
> -- 
> error compiling committee.c: too many arguments to function

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity Aug. 6, 2012, 11:39 a.m. UTC | #8
On 08/06/2012 02:05 PM, Gleb Natapov wrote:
> On Mon, Aug 06, 2012 at 12:28:05PM +0300, Avi Kivity wrote:
>> On 08/06/2012 11:58 AM, Gleb Natapov wrote:
>> > On Mon, Aug 06, 2012 at 11:50:20AM +0300, Avi Kivity wrote:
>> >> On 07/30/2012 05:38 PM, Gleb Natapov wrote:
>> >> > Optimize "rep ins" by allowing emulator to write back more than one
>> >> > datum at a time. Introduce new operand type OP_MEM_STR which tells
>> >> > writeback() that dst contains pointer to an array that should be written
>> >> > back as opposite to just one data element.
>> >> > 
>> >> >  	}
>> >> >  
>> >> > -	memcpy(dest, rc->data + rc->pos, size);
>> >> > -	rc->pos += size;
>> >> > +	if (ctxt->rep_prefix && !(ctxt->eflags & EFLG_DF)) {
>> >> > +		ctxt->dst.data = rc->data + rc->pos;
>> >> > +		ctxt->dst.type = OP_MEM_STR;
>> >> > +		ctxt->dst.count = (rc->end - rc->pos) / size;
>> >> > +		rc->pos = rc->end;
>> >> 
>> >> Should take into account the segment limit.
>> >> 
>> > It does. During write back. pio_in_emulated() should linearize() address
>> > before calculating page boundary, but this is (minor) bug unrelated to the patch
>> > series.
>> 
>> I see, yes, this problem preexists.
>> 
>> However, in normal conditions, non-repeating instructions will not reach
>> the emulator at all since they will fault in the guest (or in the shadow
>> mmu, which will reflect the fault to the guest).  Here, the first
>> iteration may fit in the segment but the second will not, so this will fail.
>> 
> Correct. And this can happen with or without the patch series.

No, it can't.  Ordinarily ins will trap inside the guest.

> 
>> It's not a huge problem since no guest does this.
>> 
>> >> > @@ -2732,7 +2747,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
>> >> >  static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg,
>> >> >  		struct operand *op)
>> >> >  {
>> >> > -	int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
>> >> > +	int df = (ctxt->eflags & EFLG_DF) ? -op->count : op->count;
>> >> >  
>> >> >  	register_address_increment(ctxt, &ctxt->regs[reg], df * op->bytes);
>> >> >  	op->addr.mem.ea = register_address(ctxt, ctxt->regs[reg]);
>> >> > @@ -3672,7 +3687,7 @@ static struct opcode opcode_table[256] = {
>> >> >  	I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
>> >> >  	I(SrcImmByte | Mov | Stack, em_push),
>> >> >  	I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
>> >> > -	I2bvIP(DstDI | SrcDX | Mov | String, em_in, ins, check_perm_in), /* insb, insw/insd */
>> >> > +	I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */
>> >> 
>> >> Eww.
>> > This brings us back to the question what alignment check is doing in
>> > linearize :)
>> 
>> It's checking alignment...
>> 
> It either check it in a wrong place or we need to mark all instructions
> that do not care about alignment, so the patch is not "Eww" :)

If not there, where?

16-byte sse instructions, cmpxchg16b, fxsave/fxrstor all check for 16
byte alignment.  There is also the #AC exception.  I couldn't find in
the SDM whether linear or virtual addresses are checked, but I'm
guessing linear.

Another way to work around this is to pass size/count separately.

> 
>> Let's see how we would fix this mess.  We need to move linearization
>> (and virt->phys translation) to the decode stage, or perhaps the
>> execution state, but before instruction dispatch.  This would cause all
>> the various exceptions to be checked against before execution, and would
>> avoid double translation for RMW operands.
>> 
> Execution state likely. String instruction works on segmented address
> for instance (address increment/decrement). May be there are others.

Practically everything works on segmented addresses.
Gleb Natapov Aug. 6, 2012, 11:49 a.m. UTC | #9
On Mon, Aug 06, 2012 at 02:39:52PM +0300, Avi Kivity wrote:
> On 08/06/2012 02:05 PM, Gleb Natapov wrote:
> > On Mon, Aug 06, 2012 at 12:28:05PM +0300, Avi Kivity wrote:
> >> On 08/06/2012 11:58 AM, Gleb Natapov wrote:
> >> > On Mon, Aug 06, 2012 at 11:50:20AM +0300, Avi Kivity wrote:
> >> >> On 07/30/2012 05:38 PM, Gleb Natapov wrote:
> >> >> > Optimize "rep ins" by allowing emulator to write back more than one
> >> >> > datum at a time. Introduce new operand type OP_MEM_STR which tells
> >> >> > writeback() that dst contains pointer to an array that should be written
> >> >> > back as opposite to just one data element.
> >> >> > 
> >> >> >  	}
> >> >> >  
> >> >> > -	memcpy(dest, rc->data + rc->pos, size);
> >> >> > -	rc->pos += size;
> >> >> > +	if (ctxt->rep_prefix && !(ctxt->eflags & EFLG_DF)) {
> >> >> > +		ctxt->dst.data = rc->data + rc->pos;
> >> >> > +		ctxt->dst.type = OP_MEM_STR;
> >> >> > +		ctxt->dst.count = (rc->end - rc->pos) / size;
> >> >> > +		rc->pos = rc->end;
> >> >> 
> >> >> Should take into account the segment limit.
> >> >> 
> >> > It does. During write back. pio_in_emulated() should linearize() address
> >> > before calculating page boundary, but this is (minor) bug unrelated to the patch
> >> > series.
> >> 
> >> I see, yes, this problem preexists.
> >> 
> >> However, in normal conditions, non-repeating instructions will not reach
> >> the emulator at all since they will fault in the guest (or in the shadow
> >> mmu, which will reflect the fault to the guest).  Here, the first
> >> iteration may fit in the segment but the second will not, so this will fail.
> >> 
> > Correct. And this can happen with or without the patch series.
> 
> No, it can't.  Ordinarily ins will trap inside the guest.
> 
We do not go to a guest for each iteration. In fact we will not go to a
guest for exactly "count" iterations.

> > 
> >> It's not a huge problem since no guest does this.
> >> 
> >> >> > @@ -2732,7 +2747,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
> >> >> >  static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg,
> >> >> >  		struct operand *op)
> >> >> >  {
> >> >> > -	int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
> >> >> > +	int df = (ctxt->eflags & EFLG_DF) ? -op->count : op->count;
> >> >> >  
> >> >> >  	register_address_increment(ctxt, &ctxt->regs[reg], df * op->bytes);
> >> >> >  	op->addr.mem.ea = register_address(ctxt, ctxt->regs[reg]);
> >> >> > @@ -3672,7 +3687,7 @@ static struct opcode opcode_table[256] = {
> >> >> >  	I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
> >> >> >  	I(SrcImmByte | Mov | Stack, em_push),
> >> >> >  	I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
> >> >> > -	I2bvIP(DstDI | SrcDX | Mov | String, em_in, ins, check_perm_in), /* insb, insw/insd */
> >> >> > +	I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */
> >> >> 
> >> >> Eww.
> >> > This brings us back to the question what alignment check is doing in
> >> > linearize :)
> >> 
> >> It's checking alignment...
> >> 
> > It either check it in a wrong place or we need to mark all instructions
> > that do not care about alignment, so the patch is not "Eww" :)
> 
> If not there, where?
> 
During execution if instruction requires alignment? Why don't you like marking
instruction as Unaligned?
 
> 16-byte sse instructions, cmpxchg16b, fxsave/fxrstor all check for 16
> byte alignment.  There is also the #AC exception.  I couldn't find in
> the SDM whether linear or virtual addresses are checked, but I'm
> guessing linear.
> 
> Another way to work around this is to pass size/count separately.
> 
> > 
> >> Let's see how we would fix this mess.  We need to move linearization
> >> (and virt->phys translation) to the decode stage, or perhaps the
> >> execution state, but before instruction dispatch.  This would cause all
> >> the various exceptions to be checked against before execution, and would
> >> avoid double translation for RMW operands.
> >> 
> > Execution state likely. String instruction works on segmented address
> > for instance (address increment/decrement). May be there are others.
> 
> Practically everything works on segmented addresses.
> 
Hmm, true. We can calculate the linear address whenever it is needed and
cache it. If the address changes, the cache is invalidated.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity Aug. 6, 2012, 12:08 p.m. UTC | #10
On 08/06/2012 02:49 PM, Gleb Natapov wrote:
> On Mon, Aug 06, 2012 at 02:39:52PM +0300, Avi Kivity wrote:
>> On 08/06/2012 02:05 PM, Gleb Natapov wrote:
>> > On Mon, Aug 06, 2012 at 12:28:05PM +0300, Avi Kivity wrote:
>> >> On 08/06/2012 11:58 AM, Gleb Natapov wrote:
>> >> > On Mon, Aug 06, 2012 at 11:50:20AM +0300, Avi Kivity wrote:
>> >> >> On 07/30/2012 05:38 PM, Gleb Natapov wrote:
>> >> >> > Optimize "rep ins" by allowing emulator to write back more than one
>> >> >> > datum at a time. Introduce new operand type OP_MEM_STR which tells
>> >> >> > writeback() that dst contains pointer to an array that should be written
>> >> >> > back as opposite to just one data element.
>> >> >> > 
>> >> >> >  	}
>> >> >> >  
>> >> >> > -	memcpy(dest, rc->data + rc->pos, size);
>> >> >> > -	rc->pos += size;
>> >> >> > +	if (ctxt->rep_prefix && !(ctxt->eflags & EFLG_DF)) {
>> >> >> > +		ctxt->dst.data = rc->data + rc->pos;
>> >> >> > +		ctxt->dst.type = OP_MEM_STR;
>> >> >> > +		ctxt->dst.count = (rc->end - rc->pos) / size;
>> >> >> > +		rc->pos = rc->end;
>> >> >> 
>> >> >> Should take into account the segment limit.
>> >> >> 
>> >> > It does. During write back. pio_in_emulated() should linearize() address
>> >> > before calculating page boundary, but this is (minor) bug unrelated to the patch
>> >> > series.
>> >> 
>> >> I see, yes, this problem preexists.
>> >> 
>> >> However, in normal conditions, non-repeating instructions will not reach
>> >> the emulator at all since they will fault in the guest (or in the shadow
>> >> mmu, which will reflect the fault to the guest).  Here, the first
>> >> iteration may fit in the segment but the second will not, so this will fail.
>> >> 
>> > Correct. And this can happen with or without the patch series.
>> 
>> No, it can't.  Ordinarily ins will trap inside the guest.
>> 
> We do not go to a guest for each iteration. In fact we will not go to a
> guest for exactly "count" iterations.

Ok.

If we linearize and translate pre-execution we can keep track of the
remaining space available in the segment/page and do this correctly.

>> >> > This brings us back to the question what alignment check is doing in
>> >> > linearize :)
>> >> 
>> >> It's checking alignment...
>> >> 
>> > It either check it in a wrong place or we need to mark all instructions
>> > that do not care about alignment, so the patch is not "Eww" :)
>> 
>> If not there, where?
>> 
> During execution if instruction requires alignment? 

Too many (all sse) instructions require alignment.

> Why don't you like marking
> instruction as Unaligned?

Because it's a workaround for a side effect of the implementation.  At a
minimum it needs a comment.

>> >> 
>> > Execution state likely. String instruction works on segmented address
>> > for instance (address increment/decrement). May be there are others.
>> 
>> Practically everything works on segmented addresses.
>> 
> Hmm, true. We can calculate liner address whenever it is needed and
> cache it. If address changes cache is invalidated.

The correct thing is to check before, like the processor does.  For
example linearize also checks write permissions, so for RMW it needs to
check writes before performing the first read.

Also cmovcc performs the checks even though it might not perform the access.
Gleb Natapov Aug. 7, 2012, 12:07 p.m. UTC | #11
On Mon, Aug 06, 2012 at 03:08:28PM +0300, Avi Kivity wrote:
> On 08/06/2012 02:49 PM, Gleb Natapov wrote:
> > On Mon, Aug 06, 2012 at 02:39:52PM +0300, Avi Kivity wrote:
> >> On 08/06/2012 02:05 PM, Gleb Natapov wrote:
> >> > On Mon, Aug 06, 2012 at 12:28:05PM +0300, Avi Kivity wrote:
> >> >> On 08/06/2012 11:58 AM, Gleb Natapov wrote:
> >> >> > On Mon, Aug 06, 2012 at 11:50:20AM +0300, Avi Kivity wrote:
> >> >> >> On 07/30/2012 05:38 PM, Gleb Natapov wrote:
> >> >> >> > Optimize "rep ins" by allowing emulator to write back more than one
> >> >> >> > datum at a time. Introduce new operand type OP_MEM_STR which tells
> >> >> >> > writeback() that dst contains pointer to an array that should be written
> >> >> >> > back as opposite to just one data element.
> >> >> >> > 
> >> >> >> >  	}
> >> >> >> >  
> >> >> >> > -	memcpy(dest, rc->data + rc->pos, size);
> >> >> >> > -	rc->pos += size;
> >> >> >> > +	if (ctxt->rep_prefix && !(ctxt->eflags & EFLG_DF)) {
> >> >> >> > +		ctxt->dst.data = rc->data + rc->pos;
> >> >> >> > +		ctxt->dst.type = OP_MEM_STR;
> >> >> >> > +		ctxt->dst.count = (rc->end - rc->pos) / size;
> >> >> >> > +		rc->pos = rc->end;
> >> >> >> 
> >> >> >> Should take into account the segment limit.
> >> >> >> 
> >> >> > It does. During write back. pio_in_emulated() should linearize() address
> >> >> > before calculating page boundary, but this is (minor) bug unrelated to the patch
> >> >> > series.
> >> >> 
> >> >> I see, yes, this problem preexists.
> >> >> 
> >> >> However, in normal conditions, non-repeating instructions will not reach
> >> >> the emulator at all since they will fault in the guest (or in the shadow
> >> >> mmu, which will reflect the fault to the guest).  Here, the first
> >> >> iteration may fit in the segment but the second will not, so this will fail.
> >> >> 
> >> > Correct. And this can happen with or without the patch series.
> >> 
> >> No, it can't.  Ordinarily ins will trap inside the guest.
> >> 
> > We do not go to a guest for each iteration. In fact we will not go to a
> > guest for exactly "count" iterations.
> 
> Ok.
> 
> If we linearize and translate pre-execution we can keep track of the
> remaining space available in the segment/page and do this correctly.
> 
Absolutely.

> >> >> > This brings us back to the question what alignment check is doing in
> >> >> > linearize :)
> >> >> 
> >> >> It's checking alignment...
> >> >> 
> >> > It either check it in a wrong place or we need to mark all instructions
> >> > that do not care about alignment, so the patch is not "Eww" :)
> >> 
> >> If not there, where?
> >> 
> > During execution if instruction requires alignment? 
> 
> Too many (all sse) instructions require alignment.
> 
But most do not, although they do not work on big chunks of data either.

> > Why don't you like marking
> > instruction as Unaligned?
> 
> Because it's a workaround for a side effect of the implementation.  At a
> minimum it needs a comment.
> 
So add a comment and resend? Any other prerequisites before the series is
accepted?

> >> >> 
> >> > Execution state likely. String instruction works on segmented address
> >> > for instance (address increment/decrement). May be there are others.
> >> 
> >> Practically everything works on segmented addresses.
> >> 
> > Hmm, true. We can calculate liner address whenever it is needed and
> > cache it. If address changes cache is invalidated.
> 
> The correct thing is to check before, like the processor does.  For
Are you sure the processor checks before? If you do "in" to an address that
lies outside of a segment, will the PIO not happen before a fault?

> example linearize also checks write permissions, so for RMW it needs to
> check writes before performing the first read.
> 
> Also cmovcc performs the checks even though it might not perform the access.
> 

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 8d0fe8f..d1777f8 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -200,8 +200,9 @@  typedef u32 __attribute__((vector_size(16))) sse128_t;
 
 /* Type, address-of, and value of an instruction's operand. */
 struct operand {
-	enum { OP_REG, OP_MEM, OP_IMM, OP_XMM, OP_MM, OP_NONE } type;
+	enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_MM, OP_NONE } type;
 	unsigned int bytes;
+	unsigned int count;
 	union {
 		unsigned long orig_val;
 		u64 orig_val64;
@@ -221,6 +222,7 @@  struct operand {
 		char valptr[sizeof(unsigned long) + 2];
 		sse128_t vec_val;
 		u64 mm_val;
+		void *data;
 	};
 };
 
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index c22762d..c74bce8 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1251,8 +1251,15 @@  static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
 		rc->end = n * size;
 	}
 
-	memcpy(dest, rc->data + rc->pos, size);
-	rc->pos += size;
+	if (ctxt->rep_prefix && !(ctxt->eflags & EFLG_DF)) {
+		ctxt->dst.data = rc->data + rc->pos;
+		ctxt->dst.type = OP_MEM_STR;
+		ctxt->dst.count = (rc->end - rc->pos) / size;
+		rc->pos = rc->end;
+	} else {
+		memcpy(dest, rc->data + rc->pos, size);
+		rc->pos += size;
+	}
 	return 1;
 }
 
@@ -1500,6 +1507,14 @@  static int writeback(struct x86_emulate_ctxt *ctxt)
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		break;
+	case OP_MEM_STR:
+		rc = segmented_write(ctxt,
+				ctxt->dst.addr.mem,
+				ctxt->dst.data,
+				ctxt->dst.bytes * ctxt->dst.count);
+		if (rc != X86EMUL_CONTINUE)
+			return rc;
+		break;
 	case OP_XMM:
 		write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm);
 		break;
@@ -2732,7 +2747,7 @@  int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg,
 		struct operand *op)
 {
-	int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
+	int df = (ctxt->eflags & EFLG_DF) ? -op->count : op->count;
 
 	register_address_increment(ctxt, &ctxt->regs[reg], df * op->bytes);
 	op->addr.mem.ea = register_address(ctxt, ctxt->regs[reg]);
@@ -3672,7 +3687,7 @@  static struct opcode opcode_table[256] = {
 	I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
 	I(SrcImmByte | Mov | Stack, em_push),
 	I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
-	I2bvIP(DstDI | SrcDX | Mov | String, em_in, ins, check_perm_in), /* insb, insw/insd */
+	I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */
 	I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */
 	/* 0x70 - 0x7F */
 	X16(D(SrcImmByte)),
@@ -3930,6 +3945,7 @@  static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
 			register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]);
 		op->addr.mem.seg = VCPU_SREG_ES;
 		op->val = 0;
+		op->count = 1;
 		break;
 	case OpDX:
 		op->type = OP_REG;
@@ -3973,6 +3989,7 @@  static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
 			register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]);
 		op->addr.mem.seg = seg_override(ctxt);
 		op->val = 0;
+		op->count = 1;
 		break;
 	case OpImmFAddr:
 		op->type = OP_IMM;
@@ -4513,8 +4530,14 @@  writeback:
 		string_addr_inc(ctxt, VCPU_REGS_RDI, &ctxt->dst);
 
 	if (ctxt->rep_prefix && (ctxt->d & String)) {
+		unsigned int count;
 		struct read_cache *r = &ctxt->io_read;
-		register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1);
+		if ((ctxt->d & SrcMask) == SrcSI)
+			count = ctxt->src.count;
+		else
+			count = ctxt->dst.count;
+		register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX],
+				-count);
 
 		if (!string_insn_completed(ctxt)) {
 			/*