Message ID | 5081033C.4060503@linux.vnet.ibm.com (mailing list archive) |
---|---|
State | New, archived |
On Fri, Oct 19, 2012 at 03:37:32PM +0800, Xiao Guangrong wrote: > After commit b3356bf0dbb349 (KVM: emulator: optimize "rep ins" handling), > the pieces of io data can be collected and write them to the guest memory > or MMIO together. > > Unfortunately, kvm splits the mmio access into 8 bytes and store them to > vcpu->mmio_fragments. If the guest uses "rep ins" to move large data, it > will cause vcpu->mmio_fragments overflow > > The bug can be exposed by isapc (-M isapc): > > [23154.818733] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC > [ ......] > [23154.858083] Call Trace: > [23154.859874] [<ffffffffa04f0e17>] kvm_get_cr8+0x1d/0x28 [kvm] > [23154.861677] [<ffffffffa04fa6d4>] kvm_arch_vcpu_ioctl_run+0xcda/0xe45 [kvm] > [23154.863604] [<ffffffffa04f5a1a>] ? kvm_arch_vcpu_load+0x17b/0x180 [kvm] > > > Actually, we can use one mmio_fragment to store a large mmio access for the > mmio access is always continuous then split it when we pass the mmio-exit-info > to userspace. After that, we only need two entries to store mmio info for > the cross-mmio pages access > I wonder can we put the data into coalesced mmio buffer instead of exiting for each 8 bytes? Is it worth the complexity? > Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com> > --- > arch/x86/kvm/x86.c | 127 +++++++++++++++++++++++++++++----------------- > include/linux/kvm_host.h | 16 +----- > 2 files changed, 84 insertions(+), 59 deletions(-) > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 8b90dd5..41ceb51 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -3779,9 +3779,6 @@ static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, > static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, > void *val, int bytes) > { > - struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0]; > - > - memcpy(vcpu->run->mmio.data, frag->data, frag->len); > return X86EMUL_CONTINUE; > } > > @@ -3799,6 +3796,64 @@ static const struct read_write_emulator_ops write_emultor = { > .write = true, > }; > > +static bool get_current_mmio_info(struct kvm_vcpu *vcpu, gpa_t *gpa, > + unsigned *len, void **data) > +{ > + struct kvm_mmio_fragment *frag; > + int cur = vcpu->mmio_cur_fragment; > + > + if (cur >= vcpu->mmio_nr_fragments) > + return false; > + > + frag = &vcpu->mmio_fragments[cur]; > + if (frag->pos >= frag->len) { > + if (++vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) > + return false; > + frag++; > + } > + > + *gpa = frag->gpa + frag->pos; > + *data = frag->data + frag->pos; > + *len = min(8u, frag->len - frag->pos); > + return true; > +} > + > +static void complete_current_mmio(struct kvm_vcpu *vcpu) > +{ > + struct kvm_mmio_fragment *frag; > + gpa_t gpa; > + unsigned len; > + void *data; > + > + get_current_mmio_info(vcpu, &gpa, &len, &data); > + > + if (!vcpu->mmio_is_write) > + memcpy(data, vcpu->run->mmio.data, len); > + > + /* Increase frag->pos to switch to the next mmio. 
*/ > + frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment]; > + frag->pos += len; > +} > + > +static bool vcpu_fill_mmio_exit_info(struct kvm_vcpu *vcpu) > +{ > + gpa_t gpa; > + unsigned len; > + void *data; > + > + if (!get_current_mmio_info(vcpu, &gpa, &len, &data)) > + return false; > + > + vcpu->run->mmio.len = len; > + vcpu->run->mmio.is_write = vcpu->mmio_is_write; > + vcpu->run->exit_reason = KVM_EXIT_MMIO; > + vcpu->run->mmio.phys_addr = gpa; > + > + if (vcpu->mmio_is_write) > + memcpy(vcpu->run->mmio.data, data, len); > + return true; > +} > + > static int emulator_read_write_onepage(unsigned long addr, void *val, > unsigned int bytes, > struct x86_exception *exception, > @@ -3834,18 +3889,12 @@ mmio: > bytes -= handled; > val += handled; > > - while (bytes) { > - unsigned now = min(bytes, 8U); > - > - frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++]; > - frag->gpa = gpa; > - frag->data = val; > - frag->len = now; > - > - gpa += now; > - val += now; > - bytes -= now; > - } > + WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS); > + frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++]; > + frag->pos = 0; > + frag->gpa = gpa; > + frag->data = val; > + frag->len = bytes; > return X86EMUL_CONTINUE; > } > > @@ -3855,7 +3904,6 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, > const struct read_write_emulator_ops *ops) > { > struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); > - gpa_t gpa; > int rc; > > if (ops->read_write_prepare && > @@ -3887,17 +3935,13 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, > if (!vcpu->mmio_nr_fragments) > return rc; > > - gpa = vcpu->mmio_fragments[0].gpa; > - > vcpu->mmio_needed = 1; > vcpu->mmio_cur_fragment = 0; > + vcpu->mmio_is_write = ops->write; > > - vcpu->run->mmio.len = vcpu->mmio_fragments[0].len; > - vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write; > - vcpu->run->exit_reason = KVM_EXIT_MMIO; > - vcpu->run->mmio.phys_addr = gpa; > - > - return ops->read_write_exit_mmio(vcpu, gpa, val, bytes); > + vcpu_fill_mmio_exit_info(vcpu); > + return ops->read_write_exit_mmio(vcpu, vcpu->mmio_fragments[0].gpa, > + val, bytes); > } > > static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, > @@ -5524,43 +5568,34 @@ static int complete_emulated_pio(struct kvm_vcpu *vcpu) > * > * read: > * for each fragment > - * write gpa, len > - * exit > - * copy data > + * for each mmio piece in the fragment > + * write gpa, len > + * exit > + * copy data > * execute insn > * > * write: > * for each fragment > - * write gpa, len > - * copy data > - * exit > + * for each mmio piece in the fragment > + * write gpa, len > + * copy data > + * exit > */ > static int complete_emulated_mmio(struct kvm_vcpu *vcpu) > { > - struct kvm_run *run = vcpu->run; > - struct kvm_mmio_fragment *frag; > - > BUG_ON(!vcpu->mmio_needed); > > - /* Complete previous fragment */ > - frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++]; > - if (!vcpu->mmio_is_write) > - memcpy(frag->data, run->mmio.data, frag->len); > - if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { > + complete_current_mmio(vcpu); > + > + /* Initiate next fragment */ > + if (!vcpu_fill_mmio_exit_info(vcpu)) { > vcpu->mmio_needed = 0; > if (vcpu->mmio_is_write) > return 1; > vcpu->mmio_read_completed = 1; > return complete_emulated_io(vcpu); > } > - /* Initiate next fragment */ > - ++frag; > - run->exit_reason = KVM_EXIT_MMIO; > - run->mmio.phys_addr = frag->gpa; > - if (vcpu->mmio_is_write) > - memcpy(run->mmio.data, 
frag->data, frag->len); > - run->mmio.len = frag->len; > - run->mmio.is_write = vcpu->mmio_is_write; > + > vcpu->arch.complete_userspace_io = complete_emulated_mmio; > return 0; > } > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index e235068..f6c3c2f 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -42,19 +42,8 @@ > */ > #define KVM_MEMSLOT_INVALID (1UL << 16) > > -/* > - * If we support unaligned MMIO, at most one fragment will be split into two: > - */ > -#ifdef KVM_UNALIGNED_MMIO > -# define KVM_EXTRA_MMIO_FRAGMENTS 1 > -#else > -# define KVM_EXTRA_MMIO_FRAGMENTS 0 > -#endif > - > -#define KVM_USER_MMIO_SIZE 8 > - > -#define KVM_MAX_MMIO_FRAGMENTS \ > - (KVM_MMIO_SIZE / KVM_USER_MMIO_SIZE + KVM_EXTRA_MMIO_FRAGMENTS) > +/* Two fragments for cross MMIO pages. */ > +#define KVM_MAX_MMIO_FRAGMENTS 2 > > /* > * For the normal pfn, the highest 12 bits should be zero, > @@ -203,6 +192,7 @@ struct kvm_mmio_fragment { > gpa_t gpa; > void *data; > unsigned len; > + unsigned pos; > }; > > struct kvm_vcpu { > -- > 1.7.7.6 -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 10/22/2012 05:16 PM, Gleb Natapov wrote: > On Fri, Oct 19, 2012 at 03:37:32PM +0800, Xiao Guangrong wrote: >> After commit b3356bf0dbb349 (KVM: emulator: optimize "rep ins" handling), >> the pieces of io data can be collected and write them to the guest memory >> or MMIO together. >> >> Unfortunately, kvm splits the mmio access into 8 bytes and store them to >> vcpu->mmio_fragments. If the guest uses "rep ins" to move large data, it >> will cause vcpu->mmio_fragments overflow >> >> The bug can be exposed by isapc (-M isapc): >> >> [23154.818733] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC >> [ ......] >> [23154.858083] Call Trace: >> [23154.859874] [<ffffffffa04f0e17>] kvm_get_cr8+0x1d/0x28 [kvm] >> [23154.861677] [<ffffffffa04fa6d4>] kvm_arch_vcpu_ioctl_run+0xcda/0xe45 [kvm] >> [23154.863604] [<ffffffffa04f5a1a>] ? kvm_arch_vcpu_load+0x17b/0x180 [kvm] >> >> >> Actually, we can use one mmio_fragment to store a large mmio access for the >> mmio access is always continuous then split it when we pass the mmio-exit-info >> to userspace. After that, we only need two entries to store mmio info for >> the cross-mmio pages access >> > I wonder can we put the data into coalesced mmio buffer instead of If we put all mmio data into coalesced buffer, we should: - ensure the userspace program uses KVM_REGISTER_COALESCED_MMIO to register all mmio regions. - even if the MMIO region is not used by emulated-device, it also need to be registered. It will breaks old version userspace program. > exiting for each 8 bytes? Is it worth the complexity? Simpler way is always better but i failed, so i appreciate your guys comments. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
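For reference, registering a coalesced MMIO zone from userspace is done with the
KVM_REGISTER_COALESCED_MMIO ioctl on the VM file descriptor. A minimal sketch of
what Xiao says userspace would have to do for every MMIO region (hypothetical
vm_fd/gpa/size values, error handling omitted):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Hypothetical helper: vm_fd is an already-created KVM VM descriptor. */
static int register_coalesced_zone(int vm_fd, __u64 gpa, __u32 size)
{
	struct kvm_coalesced_mmio_zone zone = {
		.addr = gpa,	/* guest-physical start of the MMIO region */
		.size = size,	/* length of the region in bytes */
	};

	return ioctl(vm_fd, KVM_REGISTER_COALESCED_MMIO, &zone);
}

Writes to such a zone are buffered in the shared ring and only become visible to
the device model once the ring is drained, which is why the flushing behaviour
discussed below matters.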
On Mon, Oct 22, 2012 at 07:09:38PM +0800, Xiao Guangrong wrote: > On 10/22/2012 05:16 PM, Gleb Natapov wrote: > > On Fri, Oct 19, 2012 at 03:37:32PM +0800, Xiao Guangrong wrote: > >> After commit b3356bf0dbb349 (KVM: emulator: optimize "rep ins" handling), > >> the pieces of io data can be collected and write them to the guest memory > >> or MMIO together. > >> > >> Unfortunately, kvm splits the mmio access into 8 bytes and store them to > >> vcpu->mmio_fragments. If the guest uses "rep ins" to move large data, it > >> will cause vcpu->mmio_fragments overflow > >> > >> The bug can be exposed by isapc (-M isapc): > >> > >> [23154.818733] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC > >> [ ......] > >> [23154.858083] Call Trace: > >> [23154.859874] [<ffffffffa04f0e17>] kvm_get_cr8+0x1d/0x28 [kvm] > >> [23154.861677] [<ffffffffa04fa6d4>] kvm_arch_vcpu_ioctl_run+0xcda/0xe45 [kvm] > >> [23154.863604] [<ffffffffa04f5a1a>] ? kvm_arch_vcpu_load+0x17b/0x180 [kvm] > >> > >> > >> Actually, we can use one mmio_fragment to store a large mmio access for the > >> mmio access is always continuous then split it when we pass the mmio-exit-info > >> to userspace. After that, we only need two entries to store mmio info for > >> the cross-mmio pages access > >> > > I wonder can we put the data into coalesced mmio buffer instead of > > If we put all mmio data into coalesced buffer, we should: > - ensure the userspace program uses KVM_REGISTER_COALESCED_MMIO to register > all mmio regions. > It appears to not be so. Userspace calls kvm_flush_coalesced_mmio_buffer() after returning from KVM_RUN which looks like this: void kvm_flush_coalesced_mmio_buffer(void) { KVMState *s = kvm_state; if (s->coalesced_flush_in_progress) { return; } s->coalesced_flush_in_progress = true; if (s->coalesced_mmio_ring) { struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring; while (ring->first != ring->last) { struct kvm_coalesced_mmio *ent; ent = &ring->coalesced_mmio[ring->first]; cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len); smp_wmb(); ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX; } } s->coalesced_flush_in_progress = false; } Nowhere in this function we check that MMIO region was registered with KVM_REGISTER_COALESCED_MMIO. We do not even check that the address is MMIO. > - even if the MMIO region is not used by emulated-device, it also need to be > registered. Same. I think writes to non registered region will be discarded. > > It will breaks old version userspace program. > > > exiting for each 8 bytes? Is it worth the complexity? > > Simpler way is always better but i failed, so i appreciate your guys comments. > Why have you failed? Exiting for each 8 bytes is infinitely better than buffer overflow. My question about complexity was towards theoretically more complex code that will use coalesced MMIO buffer. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2012-10-22 13:23, Gleb Natapov wrote: > On Mon, Oct 22, 2012 at 07:09:38PM +0800, Xiao Guangrong wrote: >> On 10/22/2012 05:16 PM, Gleb Natapov wrote: >>> On Fri, Oct 19, 2012 at 03:37:32PM +0800, Xiao Guangrong wrote: >>>> After commit b3356bf0dbb349 (KVM: emulator: optimize "rep ins" handling), >>>> the pieces of io data can be collected and write them to the guest memory >>>> or MMIO together. >>>> >>>> Unfortunately, kvm splits the mmio access into 8 bytes and store them to >>>> vcpu->mmio_fragments. If the guest uses "rep ins" to move large data, it >>>> will cause vcpu->mmio_fragments overflow >>>> >>>> The bug can be exposed by isapc (-M isapc): >>>> >>>> [23154.818733] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC >>>> [ ......] >>>> [23154.858083] Call Trace: >>>> [23154.859874] [<ffffffffa04f0e17>] kvm_get_cr8+0x1d/0x28 [kvm] >>>> [23154.861677] [<ffffffffa04fa6d4>] kvm_arch_vcpu_ioctl_run+0xcda/0xe45 [kvm] >>>> [23154.863604] [<ffffffffa04f5a1a>] ? kvm_arch_vcpu_load+0x17b/0x180 [kvm] >>>> >>>> >>>> Actually, we can use one mmio_fragment to store a large mmio access for the >>>> mmio access is always continuous then split it when we pass the mmio-exit-info >>>> to userspace. After that, we only need two entries to store mmio info for >>>> the cross-mmio pages access >>>> >>> I wonder can we put the data into coalesced mmio buffer instead of >> >> If we put all mmio data into coalesced buffer, we should: >> - ensure the userspace program uses KVM_REGISTER_COALESCED_MMIO to register >> all mmio regions. >> > It appears to not be so. > Userspace calls kvm_flush_coalesced_mmio_buffer() after returning from > KVM_RUN which looks like this: Nope, no longer, only on accesses to devices that actually use such regions (and there are only two ATM). The current design of a global coalesced mmio ring is horrible /wrt latency. Jan
On Mon, Oct 22, 2012 at 01:35:56PM +0200, Jan Kiszka wrote: > On 2012-10-22 13:23, Gleb Natapov wrote: > > On Mon, Oct 22, 2012 at 07:09:38PM +0800, Xiao Guangrong wrote: > >> On 10/22/2012 05:16 PM, Gleb Natapov wrote: > >>> On Fri, Oct 19, 2012 at 03:37:32PM +0800, Xiao Guangrong wrote: > >>>> After commit b3356bf0dbb349 (KVM: emulator: optimize "rep ins" handling), > >>>> the pieces of io data can be collected and write them to the guest memory > >>>> or MMIO together. > >>>> > >>>> Unfortunately, kvm splits the mmio access into 8 bytes and store them to > >>>> vcpu->mmio_fragments. If the guest uses "rep ins" to move large data, it > >>>> will cause vcpu->mmio_fragments overflow > >>>> > >>>> The bug can be exposed by isapc (-M isapc): > >>>> > >>>> [23154.818733] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC > >>>> [ ......] > >>>> [23154.858083] Call Trace: > >>>> [23154.859874] [<ffffffffa04f0e17>] kvm_get_cr8+0x1d/0x28 [kvm] > >>>> [23154.861677] [<ffffffffa04fa6d4>] kvm_arch_vcpu_ioctl_run+0xcda/0xe45 [kvm] > >>>> [23154.863604] [<ffffffffa04f5a1a>] ? kvm_arch_vcpu_load+0x17b/0x180 [kvm] > >>>> > >>>> > >>>> Actually, we can use one mmio_fragment to store a large mmio access for the > >>>> mmio access is always continuous then split it when we pass the mmio-exit-info > >>>> to userspace. After that, we only need two entries to store mmio info for > >>>> the cross-mmio pages access > >>>> > >>> I wonder can we put the data into coalesced mmio buffer instead of > >> > >> If we put all mmio data into coalesced buffer, we should: > >> - ensure the userspace program uses KVM_REGISTER_COALESCED_MMIO to register > >> all mmio regions. > >> > > It appears to not be so. > > Userspace calls kvm_flush_coalesced_mmio_buffer() after returning from > > KVM_RUN which looks like this: > > Nope, no longer, only on accesses to devices that actually use such > regions (and there are only two ATM). The current design of a global > coalesced mmio ring is horrible /wrt latency. > Indeed. git pull, recheck and call for kvm_flush_coalesced_mmio_buffer() is gone. So this will break new userspace, not old. By global you mean shared between devices (or memory regions)? -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2012-10-22 13:43, Gleb Natapov wrote: > On Mon, Oct 22, 2012 at 01:35:56PM +0200, Jan Kiszka wrote: >> On 2012-10-22 13:23, Gleb Natapov wrote: >>> On Mon, Oct 22, 2012 at 07:09:38PM +0800, Xiao Guangrong wrote: >>>> On 10/22/2012 05:16 PM, Gleb Natapov wrote: >>>>> On Fri, Oct 19, 2012 at 03:37:32PM +0800, Xiao Guangrong wrote: >>>>>> After commit b3356bf0dbb349 (KVM: emulator: optimize "rep ins" handling), >>>>>> the pieces of io data can be collected and write them to the guest memory >>>>>> or MMIO together. >>>>>> >>>>>> Unfortunately, kvm splits the mmio access into 8 bytes and store them to >>>>>> vcpu->mmio_fragments. If the guest uses "rep ins" to move large data, it >>>>>> will cause vcpu->mmio_fragments overflow >>>>>> >>>>>> The bug can be exposed by isapc (-M isapc): >>>>>> >>>>>> [23154.818733] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC >>>>>> [ ......] >>>>>> [23154.858083] Call Trace: >>>>>> [23154.859874] [<ffffffffa04f0e17>] kvm_get_cr8+0x1d/0x28 [kvm] >>>>>> [23154.861677] [<ffffffffa04fa6d4>] kvm_arch_vcpu_ioctl_run+0xcda/0xe45 [kvm] >>>>>> [23154.863604] [<ffffffffa04f5a1a>] ? kvm_arch_vcpu_load+0x17b/0x180 [kvm] >>>>>> >>>>>> >>>>>> Actually, we can use one mmio_fragment to store a large mmio access for the >>>>>> mmio access is always continuous then split it when we pass the mmio-exit-info >>>>>> to userspace. After that, we only need two entries to store mmio info for >>>>>> the cross-mmio pages access >>>>>> >>>>> I wonder can we put the data into coalesced mmio buffer instead of >>>> >>>> If we put all mmio data into coalesced buffer, we should: >>>> - ensure the userspace program uses KVM_REGISTER_COALESCED_MMIO to register >>>> all mmio regions. >>>> >>> It appears to not be so. >>> Userspace calls kvm_flush_coalesced_mmio_buffer() after returning from >>> KVM_RUN which looks like this: >> >> Nope, no longer, only on accesses to devices that actually use such >> regions (and there are only two ATM). The current design of a global >> coalesced mmio ring is horrible /wrt latency. >> > Indeed. git pull, recheck and call for kvm_flush_coalesced_mmio_buffer() > is gone. So this will break new userspace, not old. By global you mean > shared between devices (or memory regions)? Yes. We only have a single ring per VM, so we cannot flush multi-second VGA access separately from other devices. In theory solvable by introducing per-region rings that can be driven separately. Jan
On 10/22/2012 01:45 PM, Jan Kiszka wrote: >> Indeed. git pull, recheck and call for kvm_flush_coalesced_mmio_buffer() >> is gone. So this will break new userspace, not old. By global you mean >> shared between devices (or memory regions)? > > Yes. We only have a single ring per VM, so we cannot flush multi-second > VGA access separately from other devices. In theory solvable by > introducing per-region rings that can be driven separately. But in practice unneeded. Real time VMs can disable coalescing and not use planar VGA modes.
On 2012-10-22 14:18, Avi Kivity wrote: > On 10/22/2012 01:45 PM, Jan Kiszka wrote: > >>> Indeed. git pull, recheck and call for kvm_flush_coalesced_mmio_buffer() >>> is gone. So this will break new userspace, not old. By global you mean >>> shared between devices (or memory regions)? >> >> Yes. We only have a single ring per VM, so we cannot flush multi-second >> VGA access separately from other devices. In theory solvable by >> introducing per-region rings that can be driven separately. > > But in practice unneeded. Real time VMs can disable coalescing and not > use planar VGA modes. A) At least right now, we do not differentiate between the VGA modes and if flushing is needed. So that device is generally taboo for RT cores of the VM. B) We need to disable coalescing in E1000 as well - if we want to use that model. C) Gleb seems to propose using coalescing far beyond those two use cases. Jan
On Mon, Oct 22, 2012 at 02:45:37PM +0200, Jan Kiszka wrote: > On 2012-10-22 14:18, Avi Kivity wrote: > > On 10/22/2012 01:45 PM, Jan Kiszka wrote: > > > >>> Indeed. git pull, recheck and call for kvm_flush_coalesced_mmio_buffer() > >>> is gone. So this will break new userspace, not old. By global you mean > >>> shared between devices (or memory regions)? > >> > >> Yes. We only have a single ring per VM, so we cannot flush multi-second > >> VGA access separately from other devices. In theory solvable by > >> introducing per-region rings that can be driven separately. > > > > But in practice unneeded. Real time VMs can disable coalescing and not > > use planar VGA modes. > > A) At least right now, we do not differentiate between the VGA modes and > if flushing is needed. So that device is generally taboo for RT cores of > the VM. > B) We need to disable coalescing in E1000 as well - if we want to use > that model. > C) Gleb seems to propose using coalescing far beyond those two use cases. > Since the userspace change is needed the idea is dead, but if we could implement it I do not see how it can hurt the latency if it would be the only mechanism to use coalesced mmio buffer. Checking that the ring buffer is empty is cheap and if it is not empty it means that kernel just saved you a lot of 8 bytes exists so even after iterating over all the entries there you still saved a lot of time. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2012-10-22 14:53, Gleb Natapov wrote: > On Mon, Oct 22, 2012 at 02:45:37PM +0200, Jan Kiszka wrote: >> On 2012-10-22 14:18, Avi Kivity wrote: >>> On 10/22/2012 01:45 PM, Jan Kiszka wrote: >>> >>>>> Indeed. git pull, recheck and call for kvm_flush_coalesced_mmio_buffer() >>>>> is gone. So this will break new userspace, not old. By global you mean >>>>> shared between devices (or memory regions)? >>>> >>>> Yes. We only have a single ring per VM, so we cannot flush multi-second >>>> VGA access separately from other devices. In theory solvable by >>>> introducing per-region rings that can be driven separately. >>> >>> But in practice unneeded. Real time VMs can disable coalescing and not >>> use planar VGA modes. >> >> A) At least right now, we do not differentiate between the VGA modes and >> if flushing is needed. So that device is generally taboo for RT cores of >> the VM. >> B) We need to disable coalescing in E1000 as well - if we want to use >> that model. >> C) Gleb seems to propose using coalescing far beyond those two use cases. >> > Since the userspace change is needed the idea is dead, but if we could > implement it I do not see how it can hurt the latency if it would be the > only mechanism to use coalesced mmio buffer. Checking that the ring buffer > is empty is cheap and if it is not empty it means that kernel just saved > you a lot of 8 bytes exists so even after iterating over all the entries there > you still saved a lot of time. When taking an exit for A, I'm not interesting in flushing stuff for B unless I have a dependency. Thus, buffers would have to be per device before extending their use. Jan
On 10/22/2012 02:53 PM, Gleb Natapov wrote: > On Mon, Oct 22, 2012 at 02:45:37PM +0200, Jan Kiszka wrote: >> On 2012-10-22 14:18, Avi Kivity wrote: >> > On 10/22/2012 01:45 PM, Jan Kiszka wrote: >> > >> >>> Indeed. git pull, recheck and call for kvm_flush_coalesced_mmio_buffer() >> >>> is gone. So this will break new userspace, not old. By global you mean >> >>> shared between devices (or memory regions)? >> >> >> >> Yes. We only have a single ring per VM, so we cannot flush multi-second >> >> VGA access separately from other devices. In theory solvable by >> >> introducing per-region rings that can be driven separately. >> > >> > But in practice unneeded. Real time VMs can disable coalescing and not >> > use planar VGA modes. >> >> A) At least right now, we do not differentiate between the VGA modes and >> if flushing is needed. So that device is generally taboo for RT cores of >> the VM. >> B) We need to disable coalescing in E1000 as well - if we want to use >> that model. >> C) Gleb seems to propose using coalescing far beyond those two use cases. >> > Since the userspace change is needed the idea is dead, but if we could > implement it I do not see how it can hurt the latency if it would be the > only mechanism to use coalesced mmio buffer. Checking that the ring buffer > is empty is cheap and if it is not empty it means that kernel just saved > you a lot of 8 bytes exists so even after iterating over all the entries there > you still saved a lot of time. It's time where the guest cannot take interrupts, and time in a high priority guest thread that is spent processing low guest priority requests.
On 10/22/2012 02:45 PM, Jan Kiszka wrote: > On 2012-10-22 14:18, Avi Kivity wrote: >> On 10/22/2012 01:45 PM, Jan Kiszka wrote: >> >>>> Indeed. git pull, recheck and call for kvm_flush_coalesced_mmio_buffer() >>>> is gone. So this will break new userspace, not old. By global you mean >>>> shared between devices (or memory regions)? >>> >>> Yes. We only have a single ring per VM, so we cannot flush multi-second >>> VGA access separately from other devices. In theory solvable by >>> introducing per-region rings that can be driven separately. >> >> But in practice unneeded. Real time VMs can disable coalescing and not >> use planar VGA modes. > > A) At least right now, we do not differentiate between the VGA modes and > if flushing is needed. So that device is generally taboo for RT cores of > the VM. In non-planar modes the memory will be direct mapped, which overrides coalescing (since kvm or qemu never see an exit). > B) We need to disable coalescing in E1000 as well - if we want to use > that model. True.
On 10/22/2012 02:55 PM, Jan Kiszka wrote: >> Since the userspace change is needed the idea is dead, but if we could >> implement it I do not see how it can hurt the latency if it would be the >> only mechanism to use coalesced mmio buffer. Checking that the ring buffer >> is empty is cheap and if it is not empty it means that kernel just saved >> you a lot of 8 bytes exists so even after iterating over all the entries there >> you still saved a lot of time. > > When taking an exit for A, I'm not interesting in flushing stuff for B > unless I have a dependency. Thus, buffers would have to be per device > before extending their use. Any mmio exit has to flush everything. For example a DMA caused by an e1000 write has to see any writes to the framebuffer, in case the guest is transmitting its framebuffer to the outside world.
On Mon, Oct 22, 2012 at 02:55:14PM +0200, Jan Kiszka wrote: > On 2012-10-22 14:53, Gleb Natapov wrote: > > On Mon, Oct 22, 2012 at 02:45:37PM +0200, Jan Kiszka wrote: > >> On 2012-10-22 14:18, Avi Kivity wrote: > >>> On 10/22/2012 01:45 PM, Jan Kiszka wrote: > >>> > >>>>> Indeed. git pull, recheck and call for kvm_flush_coalesced_mmio_buffer() > >>>>> is gone. So this will break new userspace, not old. By global you mean > >>>>> shared between devices (or memory regions)? > >>>> > >>>> Yes. We only have a single ring per VM, so we cannot flush multi-second > >>>> VGA access separately from other devices. In theory solvable by > >>>> introducing per-region rings that can be driven separately. > >>> > >>> But in practice unneeded. Real time VMs can disable coalescing and not > >>> use planar VGA modes. > >> > >> A) At least right now, we do not differentiate between the VGA modes and > >> if flushing is needed. So that device is generally taboo for RT cores of > >> the VM. > >> B) We need to disable coalescing in E1000 as well - if we want to use > >> that model. > >> C) Gleb seems to propose using coalescing far beyond those two use cases. > >> > > Since the userspace change is needed the idea is dead, but if we could > > implement it I do not see how it can hurt the latency if it would be the > > only mechanism to use coalesced mmio buffer. Checking that the ring buffer > > is empty is cheap and if it is not empty it means that kernel just saved > > you a lot of 8 bytes exists so even after iterating over all the entries there > > you still saved a lot of time. > > When taking an exit for A, I'm not interesting in flushing stuff for B > unless I have a dependency. Thus, buffers would have to be per device > before extending their use. > Buts this is not what will happen (in the absence of other users of coalesced mmio). What will happen is instead of taking 200 exists for B you will take 1 exit for B. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Oct 22, 2012 at 02:55:24PM +0200, Avi Kivity wrote: > On 10/22/2012 02:53 PM, Gleb Natapov wrote: > > On Mon, Oct 22, 2012 at 02:45:37PM +0200, Jan Kiszka wrote: > >> On 2012-10-22 14:18, Avi Kivity wrote: > >> > On 10/22/2012 01:45 PM, Jan Kiszka wrote: > >> > > >> >>> Indeed. git pull, recheck and call for kvm_flush_coalesced_mmio_buffer() > >> >>> is gone. So this will break new userspace, not old. By global you mean > >> >>> shared between devices (or memory regions)? > >> >> > >> >> Yes. We only have a single ring per VM, so we cannot flush multi-second > >> >> VGA access separately from other devices. In theory solvable by > >> >> introducing per-region rings that can be driven separately. > >> > > >> > But in practice unneeded. Real time VMs can disable coalescing and not > >> > use planar VGA modes. > >> > >> A) At least right now, we do not differentiate between the VGA modes and > >> if flushing is needed. So that device is generally taboo for RT cores of > >> the VM. > >> B) We need to disable coalescing in E1000 as well - if we want to use > >> that model. > >> C) Gleb seems to propose using coalescing far beyond those two use cases. > >> > > Since the userspace change is needed the idea is dead, but if we could > > implement it I do not see how it can hurt the latency if it would be the > > only mechanism to use coalesced mmio buffer. Checking that the ring buffer > > is empty is cheap and if it is not empty it means that kernel just saved > > you a lot of 8 bytes exists so even after iterating over all the entries there > > you still saved a lot of time. > > It's time where the guest cannot take interrupts, and time in a high > priority guest thread that is spent processing low guest priority requests. > Proposed fix has exactly same issue. Until all data is transfered to userspace no interrupt will be served. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 10/22/2012 03:01 PM, Gleb Natapov wrote: >> It's time where the guest cannot take interrupts, and time in a high >> priority guest thread that is spent processing low guest priority requests. >> > Proposed fix has exactly same issue. Until all data is transfered to > userspace no interrupt will be served. For mmio_fragments that is okay. It's the same guest instruction, and it's still O(1). It's not okay for general mmio coalescing.
On Mon, Oct 22, 2012 at 03:02:22PM +0200, Avi Kivity wrote: > On 10/22/2012 03:01 PM, Gleb Natapov wrote: > > >> It's time where the guest cannot take interrupts, and time in a high > >> priority guest thread that is spent processing low guest priority requests. > >> > > Proposed fix has exactly same issue. Until all data is transfered to > > userspace no interrupt will be served. > > For mmio_fragments that is okay. It's the same guest instruction, and > it's still O(1). > > It's not okay for general mmio coalescing. > Ah, so optimizing mmio_fragments transmission to userspace using dedicated coalesced MMIO buffer should be fine then. Unfortunately, since we cannot use shared ring buffer that exists now, this is too much work for small gain that only new QEMU will be able to enjoy. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2012-10-22 14:58, Avi Kivity wrote: > On 10/22/2012 02:55 PM, Jan Kiszka wrote: >>> Since the userspace change is needed the idea is dead, but if we could >>> implement it I do not see how it can hurt the latency if it would be the >>> only mechanism to use coalesced mmio buffer. Checking that the ring buffer >>> is empty is cheap and if it is not empty it means that kernel just saved >>> you a lot of 8 bytes exists so even after iterating over all the entries there >>> you still saved a lot of time. >> >> When taking an exit for A, I'm not interesting in flushing stuff for B >> unless I have a dependency. Thus, buffers would have to be per device >> before extending their use. > > Any mmio exit has to flush everything. For example a DMA caused by an > e1000 write has to see any writes to the framebuffer, in case the guest > is transmitting its framebuffer to the outside world. We already flush when that crazy guest actually accesses the region, no need to do this unconditionally. Jan
On Mon, Oct 22, 2012 at 03:05:58PM +0200, Jan Kiszka wrote: > On 2012-10-22 14:58, Avi Kivity wrote: > > On 10/22/2012 02:55 PM, Jan Kiszka wrote: > >>> Since the userspace change is needed the idea is dead, but if we could > >>> implement it I do not see how it can hurt the latency if it would be the > >>> only mechanism to use coalesced mmio buffer. Checking that the ring buffer > >>> is empty is cheap and if it is not empty it means that kernel just saved > >>> you a lot of 8 bytes exists so even after iterating over all the entries there > >>> you still saved a lot of time. > >> > >> When taking an exit for A, I'm not interesting in flushing stuff for B > >> unless I have a dependency. Thus, buffers would have to be per device > >> before extending their use. > > > > Any mmio exit has to flush everything. For example a DMA caused by an > > e1000 write has to see any writes to the framebuffer, in case the guest > > is transmitting its framebuffer to the outside world. > > We already flush when that crazy guest actually accesses the region, no > need to do this unconditionally. > What if framebuffer is accessed from inside the kernel? Is this case handled? -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2012-10-22 15:08, Gleb Natapov wrote: > On Mon, Oct 22, 2012 at 03:05:58PM +0200, Jan Kiszka wrote: >> On 2012-10-22 14:58, Avi Kivity wrote: >>> On 10/22/2012 02:55 PM, Jan Kiszka wrote: >>>>> Since the userspace change is needed the idea is dead, but if we could >>>>> implement it I do not see how it can hurt the latency if it would be the >>>>> only mechanism to use coalesced mmio buffer. Checking that the ring buffer >>>>> is empty is cheap and if it is not empty it means that kernel just saved >>>>> you a lot of 8 bytes exists so even after iterating over all the entries there >>>>> you still saved a lot of time. >>>> >>>> When taking an exit for A, I'm not interesting in flushing stuff for B >>>> unless I have a dependency. Thus, buffers would have to be per device >>>> before extending their use. >>> >>> Any mmio exit has to flush everything. For example a DMA caused by an >>> e1000 write has to see any writes to the framebuffer, in case the guest >>> is transmitting its framebuffer to the outside world. >> >> We already flush when that crazy guest actually accesses the region, no >> need to do this unconditionally. >> > What if framebuffer is accessed from inside the kernel? Is this case handled? Unless I miss a case now, there is no direct access to the framebuffer possible when we are also doing coalescing. Everything needs to go through userspace. Jan
On 10/19/2012 09:37 AM, Xiao Guangrong wrote: > After commit b3356bf0dbb349 (KVM: emulator: optimize "rep ins" handling), > the pieces of io data can be collected and write them to the guest memory > or MMIO together. > > Unfortunately, kvm splits the mmio access into 8 bytes and store them to > vcpu->mmio_fragments. If the guest uses "rep ins" to move large data, it > will cause vcpu->mmio_fragments overflow > > The bug can be exposed by isapc (-M isapc): > > [23154.818733] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC > [ ......] > [23154.858083] Call Trace: > [23154.859874] [<ffffffffa04f0e17>] kvm_get_cr8+0x1d/0x28 [kvm] > [23154.861677] [<ffffffffa04fa6d4>] kvm_arch_vcpu_ioctl_run+0xcda/0xe45 [kvm] > [23154.863604] [<ffffffffa04f5a1a>] ? kvm_arch_vcpu_load+0x17b/0x180 [kvm] > > > Actually, we can use one mmio_fragment to store a large mmio access for the > mmio access is always continuous then split it when we pass the mmio-exit-info > to userspace. Note, there are instructions that can access discontinuous areas. We don't emulate them and they're unlikely to be used for mmio. > After that, we only need two entries to store mmio info for > the cross-mmio pages access Patch is good, but is somewhat large for 3.7. Maybe we can make it smaller with the following: > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 8b90dd5..41ceb51 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -3779,9 +3779,6 @@ static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, > static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, > void *val, int bytes) > { > - struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0]; > - > - memcpy(vcpu->run->mmio.data, frag->data, frag->len); > return X86EMUL_CONTINUE; > } > > @@ -3799,6 +3796,64 @@ static const struct read_write_emulator_ops write_emultor = { > .write = true, > }; > > +static bool get_current_mmio_info(struct kvm_vcpu *vcpu, gpa_t *gpa, > + unsigned *len, void **data) > +{ > + struct kvm_mmio_fragment *frag; > + int cur = vcpu->mmio_cur_fragment; > + > + if (cur >= vcpu->mmio_nr_fragments) > + return false; > + > + frag = &vcpu->mmio_fragments[cur]; > + if (frag->pos >= frag->len) { > + if (++vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) > + return false; > + frag++; > + } Instead of having ->pos, just adjust ->gpa, ->data, and ->len in place. Then get_current_mmio_info would be unneeded, just the min() bit when accessing ->len. > + > + *gpa = frag->gpa + frag->pos; > + *data = frag->data + frag->pos; > + *len = min(8u, frag->len - frag->pos); > + return true; > +} > + > +static void complete_current_mmio(struct kvm_vcpu *vcpu) > +{ > + struct kvm_mmio_fragment *frag; > + gpa_t gpa; > + unsigned len; > + void *data; > + > + get_current_mmio_info(vcpu, &gpa, &len, &data); > + > + if (!vcpu->mmio_is_write) > + memcpy(data, vcpu->run->mmio.data, len); > + > + /* Increase frag->pos to switch to the next mmio. 
*/
> +	frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
> +	frag->pos += len;
> +}
> +

And this would be unneeded, just adjust the code that does mmio_cur_fragment++:

 static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
 {
 	struct kvm_run *run = vcpu->run;
-	struct kvm_mmio_fragment *frag;
+	struct kvm_mmio_fragment frag;

 	BUG_ON(!vcpu->mmio_needed);

 	/* Complete previous fragment */
-	frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++];
+	frag = vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
+	if (frag.len <= 8) {
+		++vcpu->mmio_cur_fragment;
+	} else {
+		vcpu->mmio_fragments[vcpu->mmio_cur_fragment].len -= frag.len;
...
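For readers following the review, a rough, self-contained sketch of the direction
Avi suggests — consuming each fragment in 8-byte pieces by adjusting gpa/data/len
in place instead of tracking a separate pos — could look like the following. This
is an illustration only, not necessarily the code that was eventually applied, and
it assumes the existing KVM helpers (complete_emulated_io(), the kvm_run mmio
layout, min()):

static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
{
	struct kvm_run *run = vcpu->run;
	struct kvm_mmio_fragment *frag;
	unsigned int len;

	BUG_ON(!vcpu->mmio_needed);

	/* Complete the 8-byte piece that userspace just handled. */
	frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
	len = min(8u, frag->len);
	if (!vcpu->mmio_is_write)
		memcpy(frag->data, run->mmio.data, len);

	if (frag->len <= 8) {
		/* Fragment fully consumed: move on to the next one. */
		frag++;
		vcpu->mmio_cur_fragment++;
	} else {
		/* Consume the piece by advancing the fragment in place. */
		frag->data += len;
		frag->gpa += len;
		frag->len -= len;
	}

	if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
		vcpu->mmio_needed = 0;
		if (vcpu->mmio_is_write)
			return 1;
		vcpu->mmio_read_completed = 1;
		return complete_emulated_io(vcpu);
	}

	/* Initiate the next piece (at most 8 bytes per exit). */
	run->exit_reason = KVM_EXIT_MMIO;
	run->mmio.phys_addr = frag->gpa;
	run->mmio.len = min(8u, frag->len);
	run->mmio.is_write = vcpu->mmio_is_write;
	if (vcpu->mmio_is_write)
		memcpy(run->mmio.data, frag->data, run->mmio.len);
	vcpu->arch.complete_userspace_io = complete_emulated_mmio;
	return 0;
}

This keeps KVM_MAX_MMIO_FRAGMENTS at two (one fragment per page of a cross-page
access) while still limiting every individual userspace exit to 8 bytes, so the
overflow cannot recur no matter how large the "rep ins" transfer is.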
On Mon, Oct 22, 2012 at 03:25:49PM +0200, Jan Kiszka wrote: > On 2012-10-22 15:08, Gleb Natapov wrote: > > On Mon, Oct 22, 2012 at 03:05:58PM +0200, Jan Kiszka wrote: > >> On 2012-10-22 14:58, Avi Kivity wrote: > >>> On 10/22/2012 02:55 PM, Jan Kiszka wrote: > >>>>> Since the userspace change is needed the idea is dead, but if we could > >>>>> implement it I do not see how it can hurt the latency if it would be the > >>>>> only mechanism to use coalesced mmio buffer. Checking that the ring buffer > >>>>> is empty is cheap and if it is not empty it means that kernel just saved > >>>>> you a lot of 8 bytes exists so even after iterating over all the entries there > >>>>> you still saved a lot of time. > >>>> > >>>> When taking an exit for A, I'm not interesting in flushing stuff for B > >>>> unless I have a dependency. Thus, buffers would have to be per device > >>>> before extending their use. > >>> > >>> Any mmio exit has to flush everything. For example a DMA caused by an > >>> e1000 write has to see any writes to the framebuffer, in case the guest > >>> is transmitting its framebuffer to the outside world. > >> > >> We already flush when that crazy guest actually accesses the region, no > >> need to do this unconditionally. > >> > > What if framebuffer is accessed from inside the kernel? Is this case handled? > > Unless I miss a case now, there is no direct access to the framebuffer > possible when we are also doing coalescing. Everything needs to go > through userspace. > Yes, with frame buffer is seems to be the case. One can imagine ROMD device that is MMIO on write but still can be accessed for read from kernel, but it cannot be coalesced even if coalesced buffer is flushed on every exit. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2012-10-22 16:00, Gleb Natapov wrote: > On Mon, Oct 22, 2012 at 03:25:49PM +0200, Jan Kiszka wrote: >> On 2012-10-22 15:08, Gleb Natapov wrote: >>> On Mon, Oct 22, 2012 at 03:05:58PM +0200, Jan Kiszka wrote: >>>> On 2012-10-22 14:58, Avi Kivity wrote: >>>>> On 10/22/2012 02:55 PM, Jan Kiszka wrote: >>>>>>> Since the userspace change is needed the idea is dead, but if we could >>>>>>> implement it I do not see how it can hurt the latency if it would be the >>>>>>> only mechanism to use coalesced mmio buffer. Checking that the ring buffer >>>>>>> is empty is cheap and if it is not empty it means that kernel just saved >>>>>>> you a lot of 8 bytes exists so even after iterating over all the entries there >>>>>>> you still saved a lot of time. >>>>>> >>>>>> When taking an exit for A, I'm not interesting in flushing stuff for B >>>>>> unless I have a dependency. Thus, buffers would have to be per device >>>>>> before extending their use. >>>>> >>>>> Any mmio exit has to flush everything. For example a DMA caused by an >>>>> e1000 write has to see any writes to the framebuffer, in case the guest >>>>> is transmitting its framebuffer to the outside world. >>>> >>>> We already flush when that crazy guest actually accesses the region, no >>>> need to do this unconditionally. >>>> >>> What if framebuffer is accessed from inside the kernel? Is this case handled? >> >> Unless I miss a case now, there is no direct access to the framebuffer >> possible when we are also doing coalescing. Everything needs to go >> through userspace. >> > Yes, with frame buffer is seems to be the case. One can imagine ROMD > device that is MMIO on write but still can be accessed for read from > kernel, but it cannot be coalesced even if coalesced buffer is flushed > on every exit. Usually, a ROMD device has a stable content as long as it is fast read/slow write. Once it switches mode, it is slow read as well. Jan
On 10/22/2012 04:00 PM, Gleb Natapov wrote: > Yes, with frame buffer is seems to be the case. One can imagine ROMD > device that is MMIO on write but still can be accessed for read from > kernel, but it cannot be coalesced even if coalesced buffer is flushed > on every exit. You cannot enable coalescing on such a device.
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8b90dd5..41ceb51 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3779,9 +3779,6 @@ static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, void *val, int bytes) { - struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0]; - - memcpy(vcpu->run->mmio.data, frag->data, frag->len); return X86EMUL_CONTINUE; } @@ -3799,6 +3796,64 @@ static const struct read_write_emulator_ops write_emultor = { .write = true, }; +static bool get_current_mmio_info(struct kvm_vcpu *vcpu, gpa_t *gpa, + unsigned *len, void **data) +{ + struct kvm_mmio_fragment *frag; + int cur = vcpu->mmio_cur_fragment; + + if (cur >= vcpu->mmio_nr_fragments) + return false; + + frag = &vcpu->mmio_fragments[cur]; + if (frag->pos >= frag->len) { + if (++vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) + return false; + frag++; + } + + *gpa = frag->gpa + frag->pos; + *data = frag->data + frag->pos; + *len = min(8u, frag->len - frag->pos); + return true; +} + +static void complete_current_mmio(struct kvm_vcpu *vcpu) +{ + struct kvm_mmio_fragment *frag; + gpa_t gpa; + unsigned len; + void *data; + + get_current_mmio_info(vcpu, &gpa, &len, &data); + + if (!vcpu->mmio_is_write) + memcpy(data, vcpu->run->mmio.data, len); + + /* Increase frag->pos to switch to the next mmio. */ + frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment]; + frag->pos += len; +} + +static bool vcpu_fill_mmio_exit_info(struct kvm_vcpu *vcpu) +{ + gpa_t gpa; + unsigned len; + void *data; + + if (!get_current_mmio_info(vcpu, &gpa, &len, &data)) + return false; + + vcpu->run->mmio.len = len; + vcpu->run->mmio.is_write = vcpu->mmio_is_write; + vcpu->run->exit_reason = KVM_EXIT_MMIO; + vcpu->run->mmio.phys_addr = gpa; + + if (vcpu->mmio_is_write) + memcpy(vcpu->run->mmio.data, data, len); + return true; +} + static int emulator_read_write_onepage(unsigned long addr, void *val, unsigned int bytes, struct x86_exception *exception, @@ -3834,18 +3889,12 @@ mmio: bytes -= handled; val += handled; - while (bytes) { - unsigned now = min(bytes, 8U); - - frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++]; - frag->gpa = gpa; - frag->data = val; - frag->len = now; - - gpa += now; - val += now; - bytes -= now; - } + WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS); + frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++]; + frag->pos = 0; + frag->gpa = gpa; + frag->data = val; + frag->len = bytes; return X86EMUL_CONTINUE; } @@ -3855,7 +3904,6 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, const struct read_write_emulator_ops *ops) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - gpa_t gpa; int rc; if (ops->read_write_prepare && @@ -3887,17 +3935,13 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, if (!vcpu->mmio_nr_fragments) return rc; - gpa = vcpu->mmio_fragments[0].gpa; - vcpu->mmio_needed = 1; vcpu->mmio_cur_fragment = 0; + vcpu->mmio_is_write = ops->write; - vcpu->run->mmio.len = vcpu->mmio_fragments[0].len; - vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write; - vcpu->run->exit_reason = KVM_EXIT_MMIO; - vcpu->run->mmio.phys_addr = gpa; - - return ops->read_write_exit_mmio(vcpu, gpa, val, bytes); + vcpu_fill_mmio_exit_info(vcpu); + return ops->read_write_exit_mmio(vcpu, vcpu->mmio_fragments[0].gpa, + val, bytes); } static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, @@ -5524,43 +5568,34 @@ static int complete_emulated_pio(struct kvm_vcpu 
*vcpu) * * read: * for each fragment - * write gpa, len - * exit - * copy data + * for each mmio piece in the fragment + * write gpa, len + * exit + * copy data * execute insn * * write: * for each fragment - * write gpa, len - * copy data - * exit + * for each mmio piece in the fragment + * write gpa, len + * copy data + * exit */ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu->run; - struct kvm_mmio_fragment *frag; - BUG_ON(!vcpu->mmio_needed); - /* Complete previous fragment */ - frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++]; - if (!vcpu->mmio_is_write) - memcpy(frag->data, run->mmio.data, frag->len); - if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { + complete_current_mmio(vcpu); + + /* Initiate next fragment */ + if (!vcpu_fill_mmio_exit_info(vcpu)) { vcpu->mmio_needed = 0; if (vcpu->mmio_is_write) return 1; vcpu->mmio_read_completed = 1; return complete_emulated_io(vcpu); } - /* Initiate next fragment */ - ++frag; - run->exit_reason = KVM_EXIT_MMIO; - run->mmio.phys_addr = frag->gpa; - if (vcpu->mmio_is_write) - memcpy(run->mmio.data, frag->data, frag->len); - run->mmio.len = frag->len; - run->mmio.is_write = vcpu->mmio_is_write; + vcpu->arch.complete_userspace_io = complete_emulated_mmio; return 0; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index e235068..f6c3c2f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -42,19 +42,8 @@ */ #define KVM_MEMSLOT_INVALID (1UL << 16) -/* - * If we support unaligned MMIO, at most one fragment will be split into two: - */ -#ifdef KVM_UNALIGNED_MMIO -# define KVM_EXTRA_MMIO_FRAGMENTS 1 -#else -# define KVM_EXTRA_MMIO_FRAGMENTS 0 -#endif - -#define KVM_USER_MMIO_SIZE 8 - -#define KVM_MAX_MMIO_FRAGMENTS \ - (KVM_MMIO_SIZE / KVM_USER_MMIO_SIZE + KVM_EXTRA_MMIO_FRAGMENTS) +/* Two fragments for cross MMIO pages. */ +#define KVM_MAX_MMIO_FRAGMENTS 2 /* * For the normal pfn, the highest 12 bits should be zero, @@ -203,6 +192,7 @@ struct kvm_mmio_fragment { gpa_t gpa; void *data; unsigned len; + unsigned pos; }; struct kvm_vcpu {
After commit b3356bf0dbb349 (KVM: emulator: optimize "rep ins" handling), the
pieces of I/O data can be collected and written to the guest memory or MMIO
together.

Unfortunately, KVM splits the MMIO access into 8-byte pieces and stores them in
vcpu->mmio_fragments. If the guest uses "rep ins" to move a large amount of data,
vcpu->mmio_fragments overflows.

The bug can be exposed by isapc (-M isapc):

[23154.818733] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC
[ ......]
[23154.858083] Call Trace:
[23154.859874] [<ffffffffa04f0e17>] kvm_get_cr8+0x1d/0x28 [kvm]
[23154.861677] [<ffffffffa04fa6d4>] kvm_arch_vcpu_ioctl_run+0xcda/0xe45 [kvm]
[23154.863604] [<ffffffffa04f5a1a>] ? kvm_arch_vcpu_load+0x17b/0x180 [kvm]

Actually, since an emulated MMIO access is always contiguous, we can use one
mmio_fragment to store a large MMIO access and split it only when we pass the
mmio-exit info to userspace. After that, we only need two entries to store the
MMIO info for an access that crosses MMIO pages.

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 arch/x86/kvm/x86.c       | 127 +++++++++++++++++++++++++++++-----------------
 include/linux/kvm_host.h |  16 +-----
 2 files changed, 84 insertions(+), 59 deletions(-)
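To make the trigger concrete, an illustrative (hypothetical, not taken from the
original report) guest-side snippet that drives a long "rep ins" into an
MMIO-backed buffer is shown below; with the pre-patch code every 8 bytes of the
batched transfer consumed one mmio_fragment, so a large enough count overran the
fixed-size array:

/*
 * "rep insw" reads 'count' 16-bit words from I/O port 'port' and stores
 * them at ES:(E)DI, here pointed at an MMIO-backed destination. After the
 * "rep ins" optimization, the emulator batches the whole transfer into one
 * large MMIO write instead of one write per word.
 */
static inline void rep_insw_to_mmio(volatile void *mmio_dst,
				    unsigned short port,
				    unsigned long count)
{
	asm volatile("rep insw"
		     : "+D" (mmio_dst), "+c" (count)
		     : "d" (port)
		     : "memory");
}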