| Message ID | 20090622151631.GA14780@redhat.com (mailing list archive) |
|---|---|
| State | New, archived |
Michael S. Tsirkin wrote:
> It seems that a lot of complexity and trickiness with iosignalfd is
> handling the group/item relationship, which comes about because kvm does
> not currently let a device on the bus claim a write transaction based on
> the value written. This could be greatly simplified if the value written
> was passed to the in_range check for write operations. We could then
> simply make each kvm_iosignalfd a device on the bus.
>
> What does everyone think of the following lightly tested patch?

Hi Michael,

It's interesting, but I am not convinced it's necessary. We created the
group/item layout because iosignalfds are unique in that they are
probably the only IO device that wants to do some kind of address
aliasing. With what you are proposing here, you are adding aliasing
support to the general infrastructure, which I am not (yet) convinced is
necessary. If there isn't a use case for other devices to have aliasing,
I would think the logic is best contained in iosignalfd. Do you have
something in mind?

Kind Regards,
-Greg

> [quoted patch snipped; the full diff appears at the end of this thread]
On Mon, Jun 22, 2009 at 11:45:00AM -0400, Gregory Haskins wrote:
> Michael S. Tsirkin wrote:
> > It seems that a lot of complexity and trickiness with iosignalfd is
> > handling the group/item relationship, which comes about because kvm
> > does not currently let a device on the bus claim a write transaction
> > based on the value written. This could be greatly simplified if the
> > value written was passed to the in_range check for write operations.
> > We could then simply make each kvm_iosignalfd a device on the bus.
> >
> > What does everyone think of the following lightly tested patch?
>
> Hi Michael,
> It's interesting, but I am not convinced it's necessary. We created the
> group/item layout because iosignalfds are unique in that they are
> probably the only IO device that wants to do some kind of address
> aliasing.

We actually already have aliasing: the is_write flag is used for this
purpose. Actually, it's possible to remove is_write by passing a null
pointer in write_val for reads. I like this a bit less, as the code
generated is less compact ... Avi, what do you think?

> With what you are proposing here, you are adding aliasing
> support to the general infrastructure, which I am not (yet) convinced
> is necessary.

Infrastructure is a big name for something that adds a total of 10 lines
to kvm. And it should at least halve the size of your 450-line patch.

> If there isn't a use case for other devices to have
> aliasing, I would think the logic is best contained in iosignalfd. Do
> you have something in mind?

One is enough :)

Seriously, do you see that this saves you all of RCU, linked lists and
counters? You don't need to keep track of iofds, you don't need to
implement your own lookup logic - you just use the kvm device and that's
it.

> Kind Regards,
> -Greg
>
> [quoted patch snipped]
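Michael's claim here, that each iosignalfd can simply become its own
device once in_range() sees the written value, is easy to picture. Below
is a minimal, hypothetical sketch of such a device's callback under the
proposed signature; the struct layout and names are illustrative, not
taken from the posted patches.

/* Hypothetical sketch, not from the posted patches: one iosignalfd
 * registered as its own kvm_io_device, with its match state kept in
 * this->private. Assumes the proposed in_range() signature above.
 */
#include <linux/kvm_host.h>
#include <linux/eventfd.h>

struct iosignalfd {
	u64			 addr;		/* guest address to decode */
	int			 len;		/* 1, 2, 4 or 8 bytes */
	u64			 match;		/* value that fires the eventfd */
	bool			 wildcard;	/* true: any written value fires */
	struct eventfd_ctx	*eventfd;
};

static int iosignalfd_in_range(struct kvm_io_device *this, gpa_t addr,
			       int len, int is_write, void *write_val)
{
	struct iosignalfd *p = this->private;
	u64 val = 0;

	/* Claim only writes of the right width at the right address. */
	if (!is_write || addr != p->addr || len != p->len)
		return 0;

	if (p->wildcard)
		return 1;

	/* len == p->len <= sizeof(val), so this copy is bounded. */
	memcpy(&val, write_val, len);
	return val == p->match;
}

With that in place, the device's write() handler reduces to
eventfd_signal(p->eventfd, 1), and the ordinary bus scan in
kvm_io_bus_find_dev() performs the lookup that the group/item code does
today.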
Michael S. Tsirkin wrote:
> On Mon, Jun 22, 2009 at 11:45:00AM -0400, Gregory Haskins wrote:
>> Hi Michael,
>> It's interesting, but I am not convinced it's necessary. We created
>> the group/item layout because iosignalfds are unique in that they are
>> probably the only IO device that wants to do some kind of address
>> aliasing.
>
> We actually already have aliasing: the is_write flag is used for this
> purpose.

Yes, but read/write address aliasing is not the same thing as multi-match
data aliasing. Besides, your proposal also breaks some of the natural
relationship models (e.g. all the aliased iosignal_items always belong to
the same underlying device; io_bus entries have an arbitrary topology).

> Actually, it's possible to remove is_write by passing a null pointer in
> write_val for reads. I like this a bit less, as the code generated is
> less compact ... Avi, what do you think?
>
>> With what you are proposing here, you are adding aliasing
>> support to the general infrastructure, which I am not (yet) convinced
>> is necessary.
>
> Infrastructure is a big name for something that adds a total of 10
> lines to kvm. And it should at least halve the size of your 450-line
> patch.

Your patch isn't complete until some critical missing features are added
to io_bus, though, so it's not really just 10 lines. For one, it will
need to support much more than 6 devices. It will also need to support
multiple matches. Also, you are proposing a general interface change that
doesn't make sense to all but one device type. So now every io-device
developer that comes along will scratch their head at what to do with
that field.

None of these are insurmountable hurdles, but my point is that today the
complexity is encapsulated in the proper place, IMO. E.g. the one and
only device that cares to do this "weird" thing handles it behind an
interface that makes sense to all parties involved.

>> If there isn't a use case for other devices to have
>> aliasing, I would think the logic is best contained in iosignalfd. Do
>> you have something in mind?
>
> One is enough :)

I am not convinced yet. ;) It appears to me that we are leaking
iosignalfd-isms into the general code. If there is another device that
wants to do something similar, ok. But I can't think of any.

> Seriously, do you see that this saves you all of RCU, linked lists and
> counters?

Well, also keep in mind we will probably be converting io_bus to RCU very
soon, so we are going in the opposite direction ;)

Kind Regards,
-Greg
On Mon, Jun 22, 2009 at 12:29:10PM -0400, Gregory Haskins wrote:
> Michael S. Tsirkin wrote:
> > We actually already have aliasing: the is_write flag is used for this
> > purpose.
>
> Yes, but read/write address aliasing is not the same thing as
> multi-match data aliasing.

What's the big difference?

> Besides, your proposal also breaks

s/break/removes limitation/ :)

> some of the natural relationship models
> (e.g. all the aliased iosignal_items always belong to the same
> underlying device; io_bus entries have an arbitrary topology).

iosignal_item is an artifact; the items are not seen by the user - they
are just a workaround for an API limitation. And they are only grouped if
the same PIO offset is used for all accesses, which is not always the
case. If a device uses several PIO offsets (as virtio does), you create
separate devices for a single guest device too.

> Your patch isn't complete until some critical missing features are
> added to io_bus, though, so it's not really just 10 lines.
> For one, it will need to support much more than 6 devices.

Isn't this like a #define change? With the item patch we are still
limited in the number of groups we can create. What we gain is a simple
array/list instead of a tree of linked lists that makes cheshire cheese
out of the CPU data cache.

> It will also need to support multiple matches.

What, signal many fds on the same address/value pair? I see this as a
bug. Why is this a good thing to support? Just increases the chance of
leaking this fd.

> Also, you are proposing a general interface change
> that doesn't make sense to all but one device type. So now every
> io-device developer that comes along will scratch their head at what to
> do with that field.

What do they do with is_write now? Ignore it. It's used in a whole of 1
place.

> None of these are insurmountable hurdles, but my point is that today
> the complexity is encapsulated in the proper place, IMO.

It's better to get rid of complexity than encapsulate it.

> E.g. the one and only device that cares to do this "weird" thing
> handles it behind an interface that makes sense to all parties
> involved.
>
> I am not convinced yet. ;) It appears to me that we are leaking
> iosignalfd-isms into the general code. If there is another device that
> wants to do something similar, ok. But I can't think of any.

You never know. is_write was used by a whole of 1 user: coalesced_mmio,
then your patch comes along ...

> > Seriously, do you see that this saves you all of RCU, linked lists
> > and counters?
>
> Well, also keep in mind we will probably be converting io_bus to RCU
> very soon, so we are going in the opposite direction ;)

Same direction. Let's put RCU in io_bus; we don't need another one on top
of it. That's encapsulating complexity.
Michael S. Tsirkin wrote:
> On Mon, Jun 22, 2009 at 12:29:10PM -0400, Gregory Haskins wrote:
>> Yes, but read/write address aliasing is not the same thing as
>> multi-match data aliasing.
>
> What's the big difference?

Well, for one it's not very clear what the benefit of the read/write
aliasing even is. ;) Apparently coalesced_mmio uses it, but even so I
doubt that is for the purposes of having one device do reads while
another does writes. I could be wrong, though.

>> Besides, your proposal also breaks
>
> s/break/removes limitation/ :)
>
>> some of the natural relationship models
>> (e.g. all the aliased iosignal_items always belong to the same
>> underlying device; io_bus entries have an arbitrary topology).
>
> iosignal_item is an artifact; the items are not seen by the user - they
> are just a workaround for an API limitation.

Well, not really. The "limitation" is a natural attribute of real
hardware, too. I don't know of a whole bunch of real hardware that
aliases multiple address decoders to the same value, do you? How would
you handle reads?

The way I think of these items is not as unique devices in the io_bus
sense. We have one address decoder (the group) listening on the bus at a
unique address. That group provides a data-decoding service, where you
can program what event gets triggered for which value (or wildcard = all
values).

It is unnatural to turn it around and say that the io_bus now scans
exhaustively on the address/data tuple, and that we support aliasing such
that multiple decoders may coexist. There isn't any other device
(hardware or software) that will actually do this aside from iosignalfd,
to my knowledge. So why distort the entire io_bus subsystem's addressing
scheme to match? I don't get the motivation, as it appears to me to be
already handled ideally as it is.

Perhaps I am contributing to this confusion with my decision to make the
group creation implicit with the first item creation. Would you feel
better about this if we made the distinction between group and item more
explicit? (E.g. IOSIGNALFD_GROUP_ASSIGN and IOSIGNALFD_ITEM_ASSIGN
verbs.) I am fine with that change, I suppose.

> And they are only grouped if the same PIO offset is used for all
> accesses, which is not always the case. If a device uses several PIO
> offsets (as virtio does), you create separate devices for a single
> guest device too.

That's ok. As I stated above, I view this as a data-match service against
a single address decoder, not as multiple decoders aliased to the same
address/data pair. It's perfectly fine for a logical device to employ
multiple decoders, but it's unnatural for there to be multiple decoders
on the same address. It's also unnatural to use the data as part of the
decode criteria for a general IO cycle.

>> Your patch isn't complete until some critical missing features are
>> added to io_bus, though, so it's not really just 10 lines.
>> For one, it will need to support much more than 6 devices.
>
> Isn't this like a #define change? With the item patch we are still
> limited in the number of groups we can create.

Well, no, because we want to support ~512 of these things, so the array
is not going to scale. Right now iosignalfd is implemented as a
tree+list, which is a little better for scalability, but even that needs
to change soon. We probably ultimately want to use either an rbtree or a
radix tree for all of these things. But that is an optimization patch for
a later day ;)

> What we gain is a simple array/list instead of a tree of linked lists
> that makes cheshire cheese out of the CPU data cache.

Yeah, that is a downside, I admit. But the current io_bus array will
probably not scale going forward either, so it needs to be addressed on
that side as well. We can always implement a SW cache, if need be, to
account for this.

>> It will also need to support multiple matches.
>
> What, signal many fds on the same address/value pair? I see this as a
> bug. Why is this a good thing to support? Just increases the chance of
> leaking this fd.

I believe Avi asked for this feature specifically, so I will defer to
him.

>> Also, you are proposing a general interface change that doesn't make
>> sense to all but one device type. So now every io-device developer
>> that comes along will scratch their head at what to do with that
>> field.
>
> What do they do with is_write now? Ignore it. It's used in a whole of 1
> place.
>
>> None of these are insurmountable hurdles, but my point is that today
>> the complexity is encapsulated in the proper place, IMO.
>
> It's better to get rid of complexity than encapsulate it.

All you are doing is moving the complexity to a place where I don't see
it serving a benefit to any code other than the one you just took it away
from. Like it or not, io_bus will have to address scale and data-matching
to do what you want. And the address/data match doesn't make sense to
anyone else, afaict.

In addition, data-matching at that level is even harder, because today we
can make little simplifications in the way we do things (like enforcing
that all addr/lens conform to the group addr/len). If you go to a general
address/data tuple decoder, you will presumably need a much more flexible
decoder design (overlapping regions, etc). Again, not insurmountable ...
but I am not seeing an advantage that makes this change worthwhile,
either.

>> I am not convinced yet. ;) It appears to me that we are leaking
>> iosignalfd-isms into the general code. If there is another device that
>> wants to do something similar, ok. But I can't think of any.
>
> You never know. is_write was used by a whole of 1 user: coalesced_mmio,
> then your patch comes along ...

Can it wait until that happens, then? :) I'm already at v8, and a v9 is
brewing. It's not like we can't change later if something better comes
along.

>> Well, also keep in mind we will probably be converting io_bus to RCU
>> very soon, so we are going in the opposite direction ;)
>
> Same direction. Let's put RCU in io_bus; we don't need another one on
> top of it. That's encapsulating complexity.

Is this really all that complicated? The iosignalfd code is fairly
trivial, even with the group/item nesting, IMO. I also think it makes
sense for it to live where it is.

Thanks Michael,
-Greg
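For contrast with Michael's one-device-per-iosignalfd sketch earlier in
the thread, the group/item model Greg describes can be sketched roughly
as follows. The names and layout paraphrase his description (one address
decoder per group, a data-match service per item); they are not copied
from the actual iosignalfd patches.

/* Rough, illustrative sketch of the group/item decode Greg describes;
 * names and layout are approximations, not the real iosignalfd code.
 */
#include <linux/list.h>
#include <linux/eventfd.h>
#include <linux/kvm_types.h>

struct iosignalfd_item {
	struct list_head	 list;
	u64			 match;		/* compared when !wildcard */
	bool			 wildcard;
	struct eventfd_ctx	*eventfd;
};

struct iosignalfd_group {
	gpa_t			 addr;	/* the one address this group decodes */
	int			 len;	/* all items conform to this width */
	struct list_head	 items;	/* data matchers hanging off the group */
};

/* The group has already claimed the address on the io_bus; the items
 * then decide, per written value, which eventfd(s) to signal.
 */
static void iosignalfd_group_write(struct iosignalfd_group *g,
				   const void *data, int len)
{
	struct iosignalfd_item *item;
	u64 val = 0;

	/* len is validated against g->len (<= 8) when items are created. */
	memcpy(&val, data, len);

	list_for_each_entry(item, &g->items, list)
		if (item->wildcard || item->match == val)
			eventfd_signal(item->eventfd, 1);
}

The trade-off being debated is exactly this: the group keeps the address
decode in one place and the items share its addr/len, while Michael's
proposal pushes the per-value match down into the generic bus scan.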
On 06/22/2009 07:08 PM, Michael S. Tsirkin wrote:
> On Mon, Jun 22, 2009 at 11:45:00AM -0400, Gregory Haskins wrote:
>> Hi Michael,
>> It's interesting, but I am not convinced it's necessary. We created
>> the group/item layout because iosignalfds are unique in that they are
>> probably the only IO device that wants to do some kind of address
>> aliasing.
>
> We actually already have aliasing: the is_write flag is used for this
> purpose. Actually, it's possible to remove is_write by passing a null
> pointer in write_val for reads. I like this a bit less, as the code
> generated is less compact ... Avi, what do you think?

Greg, won't Michael's patch eliminate a big chunk from your iosignalfd
patches? Seems like a win to me.

> One is enough :)
> Seriously, do you see that this saves you all of RCU, linked lists and
> counters? You don't need to keep track of iofds, you don't need to
> implement your own lookup logic - you just use the kvm device and
> that's it.

Yup.
On 06/22/2009 07:29 PM, Gregory Haskins wrote:
>> We actually already have aliasing: the is_write flag is used for this
>> purpose.
>
> Yes, but read/write address aliasing is not the same thing as
> multi-match data aliasing. Besides, your proposal also breaks some of
> the natural relationship models (e.g. all the aliased iosignal_items
> always belong to the same underlying device; io_bus entries have an
> arbitrary topology).

It's all one big hack; we want to get the correct function called with as
little fuss as possible.
Avi Kivity wrote:
> On 06/22/2009 07:08 PM, Michael S. Tsirkin wrote:
>> We actually already have aliasing: the is_write flag is used for this
>> purpose. Actually, it's possible to remove is_write by passing a null
>> pointer in write_val for reads. I like this a bit less, as the code
>> generated is less compact ... Avi, what do you think?
>
> Greg, won't Michael's patch eliminate a big chunk from your iosignalfd
> patches? Seems like a win to me.

Well, it really just moves that hunk from eventfd.c to kvm_main.c, where
I don't think anyone else will use it but iosignalfd. But if that is what
everyone wants, I guess I have no choice.
On Tue, Jun 23, 2009 at 12:04:06AM -0400, Gregory Haskins wrote:
> >> It will also need to support multiple matches.
> >
> > What, signal many fds on the same address/value pair? I see this as a
> > bug. Why is this a good thing to support? Just increases the chance
> > of leaking this fd.
>
> I believe Avi asked for this feature specifically, so I will defer to
> him.

Hmm. That's hard to implement in my model. Avi, can we give up this
feature? I don't think anyone needs this specifically ...
On Tue, Jun 23, 2009 at 07:41:12AM -0400, Gregory Haskins wrote:
> Avi Kivity wrote:
> > Greg, won't Michael's patch eliminate a big chunk from your
> > iosignalfd patches? Seems like a win to me.
>
> Well, it really just moves that hunk from eventfd.c to kvm_main.c,
> where I don't think anyone else will use it but iosignalfd. But if that
> is what everyone wants, I guess I have no choice.

Wait a bit before you start rebasing, though, please. I just had a
brainwave and am rewriting this patch.
Michael S. Tsirkin wrote:
> On Tue, Jun 23, 2009 at 12:04:06AM -0400, Gregory Haskins wrote:
>> I believe Avi asked for this feature specifically, so I will defer to
>> him.
>
> Hmm. That's hard to implement in my model. Avi, can we give up this
> feature? I don't think anyone needs this specifically ...

If we can drop this, it will simplify the mods I will need to make to
io_bus to support Michael's new interface.

Out of curiosity, what is it you are trying to build, Michael?

-Greg
On 06/23/2009 07:04 AM, Gregory Haskins wrote:
> Well, for one it's not very clear what the benefit of the read/write
> aliasing even is. ;) Apparently coalesced_mmio uses it, but even so I
> doubt that is for the purposes of having one device do reads while
> another does writes. I could be wrong, though.

Coalesced mmio cannot handle reads.
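For context on why coalesced MMIO is write-only: its in_range() callback
(shown here with the pre-patch signature) simply refuses to claim reads,
so they fall through to other handlers. A trimmed sketch, with the zone
and ring-buffer bookkeeping elided:

/* Trimmed sketch of coalesced_mmio_in_range() as of this thread;
 * the zone and ring checks are elided.
 */
static int coalesced_mmio_in_range(struct kvm_io_device *this,
				   gpa_t addr, int len, int is_write)
{
	/* Coalescing only buffers writes for later replay; a read
	 * needs an immediate answer, so the device never claims it.
	 */
	if (!is_write)
		return 0;

	/* ... otherwise, claim the access iff [addr, addr+len) falls
	 * inside a registered coalescing zone and the ring has room ...
	 */
	return 0; /* placeholder for the elided zone check */
}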
On 06/23/2009 02:44 PM, Michael S. Tsirkin wrote:
> On Tue, Jun 23, 2009 at 12:04:06AM -0400, Gregory Haskins wrote:
>>>> It will also need to support multiple matches.
>>>
>>> What, signal many fds on the same address/value pair? I see this as a
>>> bug. Why is this a good thing to support? Just increases the chance
>>> of leaking this fd.
>>
>> I believe Avi asked for this feature specifically, so I will defer to
>> him.
>
> Hmm. That's hard to implement in my model. Avi, can we give up this
> feature? I don't think anyone needs this specifically ...

I think we can make do with passing that single eventfd to multiple
consumers. It means their event count reads may return zero, but I guess
we can live with that.

I do want to retain flexibility in how we route events.
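To illustrate Avi's "event count reads may return zero" point: a read(2)
on an eventfd atomically drains its counter, so when one fd fans out to
several consumers, only the first reader sees the count. A small
userspace sketch (not from this thread) of a consumer that tolerates
losing that race:

#include <stdint.h>
#include <unistd.h>
#include <errno.h>
#include <sys/eventfd.h>

/* Drain one eventfd without blocking; returns the count consumed,
 * or 0 if another consumer already drained it.
 */
static uint64_t try_consume(int efd)
{
	uint64_t count;

	if (read(efd, &count, sizeof(count)) == sizeof(count))
		return count;	/* we won the race; counter is now 0 */
	if (errno == EAGAIN)
		return 0;	/* someone else consumed the events */
	return 0;		/* real error; handle as appropriate */
}

int main(void)
{
	/* EFD_NONBLOCK makes an empty read fail with EAGAIN
	 * instead of blocking.
	 */
	int efd = eventfd(0, EFD_NONBLOCK);
	uint64_t n = try_consume(efd);

	(void)n;
	close(efd);
	return 0;
}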
Avi Kivity wrote:
> On 06/23/2009 02:44 PM, Michael S. Tsirkin wrote:
>> Hmm. That's hard to implement in my model. Avi, can we give up this
>> feature? I don't think anyone needs this specifically ...
>
> I think we can make do with passing that single eventfd to multiple
> consumers. It means their event count reads may return zero, but I
> guess we can live with that.
>
> I do want to retain flexibility in how we route events.

Ok, so for now I will just crank up the io_bus array, and we can address
scale another day. Can I just drop patch 2/3 and let the io_bus govern
the limit?

-Greg
On 06/23/2009 03:01 PM, Gregory Haskins wrote:
> Ok, so for now I will just crank up the io_bus array, and we can
> address scale another day. Can I just drop patch 2/3 and let the io_bus
> govern the limit?

So long as we have a runtime-discoverable limit, yes.
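A runtime-discoverable limit would presumably surface through
KVM_CHECK_EXTENSION, KVM's usual mechanism for this. The capability name
below is purely hypothetical; nothing in this thread defines it:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical capability number for illustration only; a real cap
 * would be allocated in linux/kvm.h.
 */
#define KVM_CAP_IOSIGNALFD_MAX 999

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	if (kvm < 0)
		return 1;

	/* KVM_CHECK_EXTENSION returns 0 for unsupported capabilities;
	 * "counting" capabilities return the limit itself.
	 */
	int max = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_IOSIGNALFD_MAX);
	if (max > 0)
		printf("up to %d iosignalfds supported\n", max);
	else
		printf("limit not discoverable; assume a conservative default\n");

	close(kvm);
	return 0;
}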
diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
index 55e8846..98a25af 100644
--- a/virt/kvm/iodev.h
+++ b/virt/kvm/iodev.h
@@ -28,7 +28,7 @@ struct kvm_io_device {
 		      int len,
 		      const void *val);
 	int (*in_range)(struct kvm_io_device *this, gpa_t addr, int len,
-			int is_write);
+			int is_write, void *write_val);
 	void (*destructor)(struct kvm_io_device *this);
 
 	void *private;
@@ -51,9 +51,10 @@ static inline void kvm_iodevice_write(struct kvm_io_device *dev,
 }
 
 static inline int kvm_iodevice_inrange(struct kvm_io_device *dev,
-				       gpa_t addr, int len, int is_write)
+				       gpa_t addr, int len, int is_write,
+				       void *write_val)
 {
-	return dev->in_range(dev, addr, len, is_write);
+	return dev->in_range(dev, addr, len, is_write, write_val);
 }
 
 static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 80c57b0..8cfdf9d 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -211,11 +211,13 @@ int kvm_dev_ioctl_check_extension(long ext)
 }
 
 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
-					gpa_t addr, int len, int is_write)
+					gpa_t addr, int len, int is_write,
+					void *write_val)
 {
 	struct kvm_io_device *dev;
 
-	dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len, is_write);
+	dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len, is_write,
+				  write_val);
 
 	return dev;
 }
@@ -247,7 +247,8 @@ static int handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	kvm_run->exit_reason = KVM_EXIT_MMIO;
 	return 0;
 mmio:
-	mmio_dev = vcpu_find_mmio_dev(vcpu, p->addr, p->size, !p->dir);
+	mmio_dev = vcpu_find_mmio_dev(vcpu, p->addr, p->size,
+				      !p->dir, &p->data);
 	if (mmio_dev) {
 		if (!p->dir)
 			kvm_iodevice_write(mmio_dev, p->addr, p->size,
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 4d6f0d2..5ba21ff 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -485,7 +485,7 @@ static void pit_ioport_read(struct kvm_io_device *this,
 }
 
 static int pit_in_range(struct kvm_io_device *this, gpa_t addr,
-			int len, int is_write)
+			int len, int is_write, void *write_val)
 {
 	return ((addr >= KVM_PIT_BASE_ADDRESS) &&
 		(addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ae99d83..456fd53 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -721,7 +721,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
 }
 
 static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr,
-			   int len, int size)
+			   int len, int is_write, void *write_val)
 {
 	struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
 	int ret = 0;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 249540f..9d29017 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2043,13 +2043,13 @@ static void kvm_init_msr_list(void)
  */
 static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
 						gpa_t addr, int len,
-						int is_write)
+						int is_write, void *write_val)
 {
 	struct kvm_io_device *dev;
 
 	if (vcpu->arch.apic) {
 		dev = &vcpu->arch.apic->dev;
-		if (dev->in_range(dev, addr, len, is_write))
+		if (dev->in_range(dev, addr, len, is_write, write_val))
 			return dev;
 	}
 	return NULL;
@@ -2058,14 +2058,14 @@ static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
 
 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
 						gpa_t addr, int len,
-						int is_write)
+						int is_write, void *write_val)
 {
 	struct kvm_io_device *dev;
 
-	dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write);
+	dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write, write_val);
 	if (dev == NULL)
 		dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len,
-					  is_write);
+					  is_write, write_val);
 	return dev;
 }
 
@@ -2161,7 +2161,7 @@ mmio:
 	 * Is this MMIO handled locally?
 	 */
 	mutex_lock(&vcpu->kvm->lock);
-	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0);
+	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0, NULL);
 	if (mmio_dev) {
 		kvm_iodevice_read(mmio_dev, gpa, bytes, val);
 		mutex_unlock(&vcpu->kvm->lock);
@@ -2216,7 +2216,7 @@ mmio:
 	 * Is this MMIO handled locally?
 	 */
 	mutex_lock(&vcpu->kvm->lock);
-	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1);
+	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1, val);
 	if (mmio_dev) {
 		kvm_iodevice_write(mmio_dev, gpa, bytes, val);
 		mutex_unlock(&vcpu->kvm->lock);
@@ -2576,9 +2576,10 @@ static void pio_string_write(struct kvm_io_device *pio_dev,
 
 static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
 					       gpa_t addr, int len,
-					       int is_write)
+					       int is_write, void *write_val)
 {
-	return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write);
+	return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write,
+				   write_val);
 }
 
 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
@@ -2608,7 +2608,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 	val = kvm_register_read(vcpu, VCPU_REGS_RAX);
 	memcpy(vcpu->arch.pio_data, &val, 4);
 
-	pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
+	pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in,
+				    vcpu->arch.pio_data);
 	if (pio_dev) {
 		kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
 		complete_pio(vcpu);
@@ -2624,7 +2625,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 {
 	unsigned now, in_page;
 	int ret = 0;
-	struct kvm_io_device *pio_dev;
 
 	vcpu->run->exit_reason = KVM_EXIT_IO;
 	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
@@ -2672,9 +2672,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 
 	vcpu->arch.pio.guest_gva = address;
 
-	pio_dev = vcpu_find_pio_dev(vcpu, port,
-				    vcpu->arch.pio.cur_count,
-				    !vcpu->arch.pio.in);
 	if (!vcpu->arch.pio.in) {
 		/* string PIO write */
 		ret = pio_copy_data(vcpu);
@@ -2682,13 +2679,20 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 			kvm_inject_gp(vcpu, 0);
 			return 1;
 		}
-		if (ret == 0 && pio_dev) {
-			pio_string_write(pio_dev, vcpu);
-			complete_pio(vcpu);
-			if (vcpu->arch.pio.count == 0)
-				ret = 1;
+		if (ret == 0) {
+			struct kvm_io_device *pio_dev;
+			pio_dev = vcpu_find_pio_dev(vcpu, port,
+						    vcpu->arch.pio.cur_count,
+						    1, vcpu->arch.pio_data);
+			if (pio_dev) {
+				pio_string_write(pio_dev, vcpu);
+				complete_pio(vcpu);
+				if (vcpu->arch.pio.count == 0)
+					ret = 1;
+			}
 		}
-	} else if (pio_dev)
+	} else if (vcpu_find_pio_dev(vcpu, port, vcpu->arch.pio.cur_count, 0,
+				     NULL))
 		pr_unimpl(vcpu, "no string pio read support yet, "
 			  "port %x size %d count %ld\n",
 			  port, size, count);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index aacc544..0fb7938 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -60,7 +60,8 @@ struct kvm_io_bus {
 void kvm_io_bus_init(struct kvm_io_bus *bus);
 void kvm_io_bus_destroy(struct kvm_io_bus *bus);
 struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus,
-					  gpa_t addr, int len, int is_write);
+					  gpa_t addr, int len, int is_write,
+					  void *write_val);
 void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
 			     struct kvm_io_device *dev);
 
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index 5ae620d..3a1cbfd 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -15,7 +15,8 @@
 #include "coalesced_mmio.h"
 
 static int coalesced_mmio_in_range(struct kvm_io_device *this,
-				   gpa_t addr, int len, int is_write)
+				   gpa_t addr, int len, int is_write,
+				   void *write_val)
 {
 	struct kvm_coalesced_mmio_dev *dev =
 				(struct kvm_coalesced_mmio_dev*)this->private;
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 1eddae9..2adbb1b 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -221,7 +221,8 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode)
 }
 
 static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr,
-			   int len, int is_write)
+			   int len, int is_write,
+			   void *write_val)
 {
 	struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7645543..b97e390 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2445,14 +2445,15 @@ void kvm_io_bus_destroy(struct kvm_io_bus *bus)
 }
 
 struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus,
-					  gpa_t addr, int len, int is_write)
+					  gpa_t addr, int len, int is_write,
+					  void *write_val)
 {
 	int i;
 
 	for (i = 0; i < bus->dev_count; i++) {
 		struct kvm_io_device *pos = bus->devs[i];
 
-		if (pos->in_range(pos, addr, len, is_write))
+		if (pos->in_range(pos, addr, len, is_write, write_val))
 			return pos;
 	}
It seems that a lot of complexity and trickiness with iosignalfd is
handling the group/item relationship, which comes about because kvm does
not currently let a device on the bus claim a write transaction based on
the value written. This could be greatly simplified if the value written
was passed to the in_range check for write operations. We could then
simply make each kvm_iosignalfd a device on the bus.

What does everyone think of the following lightly tested patch?

-->

Subject: kvm: pass value to in_range callback

For write transactions, pass the value written to in_range checks so
that we can make each iosignalfd a separate device on the kvm bus.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

---